gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "basic-block.h"
  30 #include "gimple-pretty-print.h"
  31 #include "tree-ssa-alias.h"
  32 #include "internal-fn.h"
  33 #include "gimple-expr.h"
  34 #include "is-a.h"
  35 #include "gimple.h"
  36 #include "gimplify.h"
  37 #include "gimple-iterator.h"
  38 #include "gimplify-me.h"
  39 #include "gimple-ssa.h"
  40 #include "tree-phinodes.h"
  41 #include "ssa-iterators.h"
  42 #include "stringpool.h"
  43 #include "tree-ssanames.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-pass.h"
  48 #include "cfgloop.h"
  49 #include "expr.h"
  50 #include "recog.h"
  51 #include "optabs.h"
  52 #include "params.h"
  53 #include "diagnostic-core.h"
  54 #include "tree-chrec.h"
  55 #include "tree-scalar-evolution.h"
  56 #include "tree-vectorizer.h"
  57 #include "target.h"
  58
  59 /* Loop Vectorization Pass.
  60
  61    This pass tries to vectorize loops.
  62
  63    For example, the vectorizer transforms the following simple loop:
  64
  65         short a[N]; short b[N]; short c[N]; int i;
  66
  67         for (i=0; i<N; i++){
  68           a[i] = b[i] + c[i];
  69         }
  70
  71    as if it was manually vectorized by rewriting the source code into:
  72
  73         typedef int __attribute__((mode(V8HI))) v8hi;
  74         short a[N];  short b[N]; short c[N];   int i;
  75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  76         v8hi va, vb, vc;
  77
  78         for (i=0; i<N/8; i++){
  79           vb = pb[i];
  80           vc = pc[i];
  81           va = vb + vc;
  82           pa[i] = va;
  83         }
  84
  85         The main entry to this pass is vectorize_loops(), in which
  86    the vectorizer applies a set of analyses on a given set of loops,
  87    followed by the actual vectorization transformation for the loops that
  88    had successfully passed the analysis phase.
  89         Throughout this pass we make a distinction between two types of
  90    data: scalars (which are represented by SSA_NAMES), and memory references
  91    ("data-refs").  These two types of data require different handling both
   92    during analysis and transformation.  The types of data-refs that the
   93    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  95    accesses are required to have a simple (consecutive) access pattern.
  96
  97    Analysis phase:
  98    ===============
  99         The driver for the analysis phase is vect_analyze_loop().
 100    It applies a set of analyses, some of which rely on the scalar evolution
 101    analyzer (scev) developed by Sebastian Pop.
 102
 103         During the analysis phase the vectorizer records some information
 104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 105    loop, as well as general information about the loop as a whole, which is
 106    recorded in a "loop_vec_info" struct attached to each loop.
 107
 108    Transformation phase:
 109    =====================
 110         The loop transformation phase scans all the stmts in the loop, and
 111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 112    the loop that needs to be vectorized.  It inserts the vector code sequence
 113    just before the scalar stmt S, and records a pointer to the vector code
 114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 115    attached to S).  This pointer will be used for the vectorization of following
 116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 117    otherwise, we rely on dead code elimination for removing it.
 118
 119         For example, say stmt S1 was vectorized into stmt VS1:
 120
 121    VS1: vb = px[i];
 122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 123    S2:  a = b;
 124
 125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 128    resulting sequence would be:
 129
 130    VS1: vb = px[i];
 131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 132    VS2: va = vb;
 133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134
 135         Operands that are not SSA_NAMEs, are data-refs that appear in
 136    load/store operations (like 'x[i]' in S1), and are handled differently.
 137
 138    Target modeling:
 139    =================
 140         Currently the only target specific information that is used is the
 141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 142    Targets that can support different sizes of vectors, for now will need
 143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 144    flexibility will be added in the future.
 145
  146         Since we only vectorize operations whose vector form can be
  147    expressed using existing tree codes, to verify that an operation is
  148    supported, the vectorizer checks the relevant optab at the relevant
  149    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 150    the value found is CODE_FOR_nothing, then there's no target support, and
 151    we can't vectorize the stmt.
 152
 153    For additional information on this project see:
 154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 155 */
 156
 157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 158
 159 /* Function vect_determine_vectorization_factor
 160
 161    Determine the vectorization factor (VF).  VF is the number of data elements
 162    that are operated upon in parallel in a single iteration of the vectorized
 163    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 164    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 165    elements can fit in a single vector register.
 166
 167    We currently support vectorization of loops in which all types operated upon
 168    are of the same size.  Therefore this function currently sets VF according to
 169    the size of the types operated upon, and fails if there are multiple sizes
 170    in the loop.
 171
 172    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 173    original loop:
 174         for (i=0; i<N; i++){
 175           a[i] = b[i] + c[i];
 176         }
 177
 178    vectorized loop:
 179         for (i=0; i<N; i+=VF){
 180           a[i:VF] = b[i:VF] + c[i:VF];
 181         }
 182 */
 183
 184 static bool
 185 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 186 {
 187   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 188   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 189   int nbbs = loop->num_nodes;
 190   gimple_stmt_iterator si;
 191   unsigned int vectorization_factor = 0;
 192   tree scalar_type;
 193   gimple phi;
 194   tree vectype;
 195   unsigned int nunits;
 196   stmt_vec_info stmt_info;
 197   int i;
 198   HOST_WIDE_INT dummy;
 199   gimple stmt, pattern_stmt = NULL;
 200   gimple_seq pattern_def_seq = NULL;
 201   gimple_stmt_iterator pattern_def_si = gsi_none ();
 202   bool analyze_pattern_stmt = false;
 203
 204   if (dump_enabled_p ())
 205     dump_printf_loc (MSG_NOTE, vect_location,
 206                      "=== vect_determine_vectorization_factor ===\n");
 207
 208   for (i = 0; i < nbbs; i++)
 209     {
 210       basic_block bb = bbs[i];
 211
 212       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 213         {
 214           phi = gsi_stmt (si);
 215           stmt_info = vinfo_for_stmt (phi);
 216           if (dump_enabled_p ())
 217             {
 218               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 219               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 220               dump_printf (MSG_NOTE, "\n");
 221             }
 222
 223           gcc_assert (stmt_info);
 224
 225           if (STMT_VINFO_RELEVANT_P (stmt_info))
 226             {
 227               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 229
 230               if (dump_enabled_p ())
 231                 {
 232                   dump_printf_loc (MSG_NOTE, vect_location,
 233                                    "get vectype for scalar type:  ");
 234                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 235                   dump_printf (MSG_NOTE, "\n");
 236                 }
 237
 238               vectype = get_vectype_for_scalar_type (scalar_type);
 239               if (!vectype)
 240                 {
 241                   if (dump_enabled_p ())
 242                     {
 243                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 244                                        "not vectorized: unsupported "
 245                                        "data-type ");
 246                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 247                                          scalar_type);
 248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 249                     }
 250                   return false;
 251                 }
 252               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 253
 254               if (dump_enabled_p ())
 255                 {
 256                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 257                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 258                   dump_printf (MSG_NOTE, "\n");
 259                 }
 260
 261               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 262               if (dump_enabled_p ())
 263                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 264                                  nunits);
 265
 266               if (!vectorization_factor
 267                   || (nunits > vectorization_factor))
 268                 vectorization_factor = nunits;
 269             }
 270         }
 271
 272       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288               dump_printf (MSG_NOTE, "\n");
 289             }
 290
 291           gcc_assert (stmt_info);
 292
 293           /* Skip stmts which do not need to be vectorized.  */
 294           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 295                && !STMT_VINFO_LIVE_P (stmt_info))
 296               || gimple_clobber_p (stmt))
 297             {
 298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 302                 {
 303                   stmt = pattern_stmt;
 304                   stmt_info = vinfo_for_stmt (pattern_stmt);
 305                   if (dump_enabled_p ())
 306                     {
 307                       dump_printf_loc (MSG_NOTE, vect_location,
 308                                        "==> examining pattern statement: ");
 309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 310                       dump_printf (MSG_NOTE, "\n");
 311                     }
 312                 }
 313               else
 314                 {
 315                   if (dump_enabled_p ())
 316                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 317                   gsi_next (&si);
 318                   continue;
 319                 }
 320             }
 321           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 322                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 323                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 324                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 325             analyze_pattern_stmt = true;
 326
 327           /* If a pattern statement has def stmts, analyze them too.  */
 328           if (is_pattern_stmt_p (stmt_info))
 329             {
 330               if (pattern_def_seq == NULL)
 331                 {
 332                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 333                   pattern_def_si = gsi_start (pattern_def_seq);
 334                 }
 335               else if (!gsi_end_p (pattern_def_si))
 336                 gsi_next (&pattern_def_si);
 337               if (pattern_def_seq != NULL)
 338                 {
 339                   gimple pattern_def_stmt = NULL;
 340                   stmt_vec_info pattern_def_stmt_info = NULL;
 341
 342                   while (!gsi_end_p (pattern_def_si))
 343                     {
 344                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 345                       pattern_def_stmt_info
 346                         = vinfo_for_stmt (pattern_def_stmt);
 347                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 348                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 349                         break;
 350                       gsi_next (&pattern_def_si);
 351                     }
 352
 353                   if (!gsi_end_p (pattern_def_si))
 354                     {
 355                       if (dump_enabled_p ())
 356                         {
 357                           dump_printf_loc (MSG_NOTE, vect_location,
 358                                            "==> examining pattern def stmt: ");
 359                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 360                                             pattern_def_stmt, 0);
 361                           dump_printf (MSG_NOTE, "\n");
 362                         }
 363
 364                       stmt = pattern_def_stmt;
 365                       stmt_info = pattern_def_stmt_info;
 366                     }
 367                   else
 368                     {
 369                       pattern_def_si = gsi_none ();
 370                       analyze_pattern_stmt = false;
 371                     }
 372                 }
 373               else
 374                 analyze_pattern_stmt = false;
 375             }
 376
 377           if (gimple_get_lhs (stmt) == NULL_TREE)
 378             {
 379               if (is_gimple_call (stmt))
 380                 {
 381                   /* Ignore calls with no lhs.  These must be calls to
 382                      #pragma omp simd functions, and what vectorization factor
 383                      it really needs can't be determined until
 384                      vectorizable_simd_clone_call.  */
 385                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 386                     {
 387                       pattern_def_seq = NULL;
 388                       gsi_next (&si);
 389                     }
 390                   continue;
 391                 }
 392               if (dump_enabled_p ())
 393                 {
 394                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 395                                    "not vectorized: irregular stmt.");
 396                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 397                                     0);
 398                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 399                 }
 400               return false;
 401             }
 402
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 411                 }
 412               return false;
 413             }
 414
 415           if (STMT_VINFO_VECTYPE (stmt_info))
 416             {
 417               /* The only case when a vectype had been already set is for stmts
 418                  that contain a dataref, or for "pattern-stmts" (stmts
 419                  generated by the vectorizer to represent/replace a certain
 420                  idiom).  */
 421               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 422                           || is_pattern_stmt_p (stmt_info)
 423                           || !gsi_end_p (pattern_def_si));
 424               vectype = STMT_VINFO_VECTYPE (stmt_info);
 425             }
 426           else
 427             {
 428               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 429               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 430               if (dump_enabled_p ())
 431                 {
 432                   dump_printf_loc (MSG_NOTE, vect_location,
 433                                    "get vectype for scalar type:  ");
 434                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 435                   dump_printf (MSG_NOTE, "\n");
 436                 }
 437               vectype = get_vectype_for_scalar_type (scalar_type);
 438               if (!vectype)
 439                 {
 440                   if (dump_enabled_p ())
 441                     {
 442                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 443                                        "not vectorized: unsupported "
 444                                        "data-type ");
 445                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 446                                          scalar_type);
 447                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 448                     }
 449                   return false;
 450                 }
 451
 452               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 453
 454               if (dump_enabled_p ())
 455                 {
 456                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 457                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 458                   dump_printf (MSG_NOTE, "\n");
 459                 }
 460             }
 461
 462           /* The vectorization factor is according to the smallest
 463              scalar type (or the largest vector size, but we only
 464              support one vector size per loop).  */
 465           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 466                                                        &dummy);
 467           if (dump_enabled_p ())
 468             {
 469               dump_printf_loc (MSG_NOTE, vect_location,
 470                                "get vectype for scalar type:  ");
 471               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 472               dump_printf (MSG_NOTE, "\n");
 473             }
 474           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 475           if (!vf_vectype)
 476             {
 477               if (dump_enabled_p ())
 478                 {
 479                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 480                                    "not vectorized: unsupported data-type ");
 481                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 482                                      scalar_type);
 483                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 484                 }
 485               return false;
 486             }
 487
 488           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 489                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 490             {
 491               if (dump_enabled_p ())
 492                 {
 493                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 494                                    "not vectorized: different sized vector "
 495                                    "types in statement, ");
 496                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 497                                      vectype);
 498                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 499                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 500                                      vf_vectype);
 501                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 502                 }
 503               return false;
 504             }
 505
 506           if (dump_enabled_p ())
 507             {
 508               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 509               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 510               dump_printf (MSG_NOTE, "\n");
 511             }
 512
 513           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 514           if (dump_enabled_p ())
 515             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 516           if (!vectorization_factor
 517               || (nunits > vectorization_factor))
 518             vectorization_factor = nunits;
 519
 520           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 521             {
 522               pattern_def_seq = NULL;
 523               gsi_next (&si);
 524             }
 525         }
 526     }
 527
 528   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 529   if (dump_enabled_p ())
 530     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 531                      vectorization_factor);
 532   if (vectorization_factor <= 1)
 533     {
 534       if (dump_enabled_p ())
 535         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                          "not vectorized: unsupported data-type\n");
 537       return false;
 538     }
 539   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 540
 541   return true;
 542 }
 543
 544
 545 /* Function vect_is_simple_iv_evolution.
 546
 547    FORNOW: A simple evolution of an induction variables in the loop is
 548    considered a polynomial evolution.  */
 549
 550 static bool
 551 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 552                              tree * step)
 553 {
 554   tree init_expr;
 555   tree step_expr;
 556   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 557   basic_block bb;
 558
 559   /* When there is no evolution in this loop, the evolution function
 560      is not "simple".  */
 561   if (evolution_part == NULL_TREE)
 562     return false;
 563
 564   /* When the evolution is a polynomial of degree >= 2
 565      the evolution function is not "simple".  */
 566   if (tree_is_chrec (evolution_part))
 567     return false;
 568
 569   step_expr = evolution_part;
 570   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 571
 572   if (dump_enabled_p ())
 573     {
 574       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 575       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 576       dump_printf (MSG_NOTE, ",  init: ");
 577       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 578       dump_printf (MSG_NOTE, "\n");
 579     }
 580
 581   *init = init_expr;
 582   *step = step_expr;
 583
 584   if (TREE_CODE (step_expr) != INTEGER_CST
 585       && (TREE_CODE (step_expr) != SSA_NAME
 586           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 587               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 588           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 589               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 590                   || !flag_associative_math)))
 591       && (TREE_CODE (step_expr) != REAL_CST
 592           || !flag_associative_math))
 593     {
 594       if (dump_enabled_p ())
 595         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 596                          "step unknown.\n");
 597       return false;
 598     }
 599
 600   return true;
 601 }
 602
 603 /* Function vect_analyze_scalar_cycles_1.
 604
 605    Examine the cross iteration def-use cycles of scalar variables
 606    in LOOP.  LOOP_VINFO represents the loop that is now being
 607    considered for vectorization (can be LOOP, or an outer-loop
 608    enclosing LOOP).  */
 609
 610 static void
 611 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 612 {
 613   basic_block bb = loop->header;
 614   tree init, step;
 615   stack_vec<gimple, 64> worklist;
 616   gimple_stmt_iterator gsi;
 617   bool double_reduc;
 618
 619   if (dump_enabled_p ())
 620     dump_printf_loc (MSG_NOTE, vect_location,
 621                      "=== vect_analyze_scalar_cycles ===\n");
 622
 623   /* First - identify all inductions.  Reduction detection assumes that all the
 624      inductions have been identified, therefore, this order must not be
 625      changed.  */
 626   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 627     {
 628       gimple phi = gsi_stmt (gsi);
 629       tree access_fn = NULL;
 630       tree def = PHI_RESULT (phi);
 631       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 632
 633       if (dump_enabled_p ())
 634         {
 635           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 636           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 637           dump_printf (MSG_NOTE, "\n");
 638         }
 639
 640       /* Skip virtual phi's.  The data dependences that are associated with
 641          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 642       if (virtual_operand_p (def))
 643         continue;
 644
 645       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 646
 647       /* Analyze the evolution function.  */
 648       access_fn = analyze_scalar_evolution (loop, def);
 649       if (access_fn)
 650         {
 651           STRIP_NOPS (access_fn);
 652           if (dump_enabled_p ())
 653             {
 654               dump_printf_loc (MSG_NOTE, vect_location,
 655                                "Access function of PHI: ");
 656               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 657               dump_printf (MSG_NOTE, "\n");
 658             }
 659           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 660             = evolution_part_in_loop_num (access_fn, loop->num);
 661         }
 662
 663       if (!access_fn
 664           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 665           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 666               && TREE_CODE (step) != INTEGER_CST))
 667         {
 668           worklist.safe_push (phi);
 669           continue;
 670         }
 671
 672       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 673
 674       if (dump_enabled_p ())
 675         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 676       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 677     }
 678
 679
 680   /* Second - identify all reductions and nested cycles.  */
 681   while (worklist.length () > 0)
 682     {
 683       gimple phi = worklist.pop ();
 684       tree def = PHI_RESULT (phi);
 685       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 686       gimple reduc_stmt;
 687       bool nested_cycle;
 688
 689       if (dump_enabled_p ())
 690         {
 691           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 692           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 693           dump_printf (MSG_NOTE, "\n");
 694         }
 695
 696       gcc_assert (!virtual_operand_p (def)
 697                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 698
 699       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 700       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 701                                                 &double_reduc);
 702       if (reduc_stmt)
 703         {
 704           if (double_reduc)
 705             {
 706               if (dump_enabled_p ())
 707                 dump_printf_loc (MSG_NOTE, vect_location,
 708                                  "Detected double reduction.\n");
 709
 710               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 711               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 712                                                     vect_double_reduction_def;
 713             }
 714           else
 715             {
 716               if (nested_cycle)
 717                 {
 718                   if (dump_enabled_p ())
 719                     dump_printf_loc (MSG_NOTE, vect_location,
 720                                      "Detected vectorizable nested cycle.\n");
 721
 722                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 723                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 724                                                              vect_nested_cycle;
 725                 }
 726               else
 727                 {
 728                   if (dump_enabled_p ())
 729                     dump_printf_loc (MSG_NOTE, vect_location,
 730                                      "Detected reduction.\n");
 731
 732                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 733                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 734                                                            vect_reduction_def;
 735                   /* Store the reduction cycles for possible vectorization in
 736                      loop-aware SLP.  */
 737                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 738                 }
 739             }
 740         }
 741       else
 742         if (dump_enabled_p ())
 743           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 744                            "Unknown def-use cycle pattern.\n");
 745     }
 746 }
 747
 748
 749 /* Function vect_analyze_scalar_cycles.
 750
 751    Classify the cross-iteration def-use cycles of scalar variables in the
 752    loop described by LOOP_VINFO, and in its inner-loop when there is one.
 753    The loop-header PHIs of scalar variables are analyzed, and each cycle
 754    is classified as one of: invariant, induction, reduction, unknown.
 755
 756    Examples for scalar cycles:
 757
 758    Example1: reduction:
 759
 760               loop1:
 761               for (i=0; i<N; i++)
 762                  sum += a[i];
 763
 764    Example2: induction:
 765
 766               loop2:
 767               for (i=0; i<N; i++)
 768                  a[i] = i;  */
 769
 770 static void
 771 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 772 {
 773   struct loop *outer = LOOP_VINFO_LOOP (loop_vinfo);
 774   struct loop *nested;
 775
 776   /* Cycles of the loop that is the vectorization candidate.  */
 777   vect_analyze_scalar_cycles_1 (loop_vinfo, outer);
 778
 779   /* In outer-loop vectorization the inner-loop still runs sequentially, so
 780      its reductions differ from those of the vectorized nest:
 781      1. They execute in the original scalar order once vectorized, hence
 782         the order of computation must not be changed.
 783      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 784         current checks are too strict.  */
 785   nested = outer->inner;
 786   if (nested != NULL)
 787     vect_analyze_scalar_cycles_1 (loop_vinfo, nested);
 788 }
 789
 790
 791 /* Function vect_get_loop_niters.
 792
 793    Compute how many times the header of LOOP is executed and store the
 794    result in *NUMBER_OF_ITERATIONS; a missing or undetermined latch count
 795    is stored unchanged.
 796    Return the loop exit condition.  */
 797
 798 static gimple
 799 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 800 {
 801   tree latch_count;
 802
 803   if (dump_enabled_p ())
 804     dump_printf_loc (MSG_NOTE, vect_location, "=== get_loop_niters ===\n");
 805
 806   latch_count = number_of_latch_executions (loop);
 807   if (latch_count && !chrec_contains_undetermined (latch_count))
 808     {
 809       /* The header is executed once more than the latch.
 810          ???  For UINT_MAX latch executions this number overflows to zero
 811          for loops like do { n++; } while (n != 0);  */
 812       tree type = TREE_TYPE (latch_count);
 813       tree one = build_int_cst (type, 1);
 814       latch_count = fold_build2 (PLUS_EXPR, type, latch_count, one);
 815     }
 816   *number_of_iterations = latch_count;
 817   return get_loop_exit_condition (loop);
 818 }
 819
 820
 821 /* Function bb_in_loop_p
 822
 823    Predicate for the dfs order traversal of the loop bbs: return true iff
 824    BB belongs to the loop passed (as a struct loop *) through DATA.  */
 825
 826 static bool
 827 bb_in_loop_p (const_basic_block bb, const void *data)
 828 {
 829   const struct loop *const candidate = (const struct loop *) data;
 830
 831   return flow_bb_inside_loop_p (candidate, bb);
 832 }
 833
 834
 835 /* Function new_loop_vec_info.
 836
 837    Create, initialize and return a new loop_vec_info struct for LOOP.  Give
 838    a stmt_vec_info to each stmt of LOOP, reusing those of inner-loop stmts.  */
 839
 840 static loop_vec_info
 841 new_loop_vec_info (struct loop *loop)
 842 {
 843   loop_vec_info res;
 844   basic_block *bbs;
 845   gimple_stmt_iterator si;
 846   unsigned int i, nbbs;
 847   /* xcalloc: the struct starts out zero-filled.  */
 848   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 849   LOOP_VINFO_LOOP (res) = loop;
 850   /* This first BBS array is only used for the stmt walk just below.  */
 851   bbs = get_loop_body (loop);
 852
 853   /* Create/Update stmt_info for all stmts in the loop.  */
 854   for (i = 0; i < loop->num_nodes; i++)
 855     {
 856       basic_block bb = bbs[i];
 857
 858       /* BBs in a nested inner-loop will have been already processed (because
 859          we will have called vect_analyze_loop_form for any nested inner-loop).
 860          Therefore, for stmts in an inner-loop we just want to update the
 861          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 862          loop_info of the outer-loop we are currently considering to vectorize
 863          (instead of the loop_info of the inner-loop).
 864          For stmts in other BBs we need to create a stmt_info from scratch.  */
 865       if (bb->loop_father != loop)
 866         {
 867           /* Inner-loop bb.  */
 868           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 869           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 870             {
 871               gimple phi = gsi_stmt (si);
 872               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 873               loop_vec_info inner_loop_vinfo =
 874                 STMT_VINFO_LOOP_VINFO (stmt_info);
 875               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 876               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 877             }
 878           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 879            {
 880               gimple stmt = gsi_stmt (si);
 881               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 882               loop_vec_info inner_loop_vinfo =
 883                  STMT_VINFO_LOOP_VINFO (stmt_info);
 884               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 885               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 886            }
 887         }
 888       else
 889         {
 890           /* bb in current nest.  */
 891           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 892             {
 893               gimple phi = gsi_stmt (si);
 894               gimple_set_uid (phi, 0);
 895               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 896             }
 897
 898           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 899             {
 900               gimple stmt = gsi_stmt (si);
 901               gimple_set_uid (stmt, 0);
 902               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 903             }
 904         }
 905     }
 906
 907   /* CHECKME: We want to visit all BBs before their successors (except for
 908      latch blocks, for which this assertion wouldn't hold).  In the simple
 909      case of the loop forms we allow, a dfs order of the BBs would be the
 910      same as reversed postorder traversal, so we are safe.  */
 911   /* Replace BBS by an array enumerated in dfs order from the loop header.  */
 912    free (bbs);
 913    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 914    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 915                               bbs, loop->num_nodes, loop);
 916    gcc_assert (nbbs == loop->num_nodes);
 917   /* RES keeps the dfs-ordered array; set up the remaining fields.  */
 918   LOOP_VINFO_BBS (res) = bbs;
 919   LOOP_VINFO_NITERS (res) = NULL;
 920   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 921   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 922   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 923   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 924   LOOP_VINFO_VECT_FACTOR (res) = 0;
 925   LOOP_VINFO_LOOP_NEST (res).create (3);
 926   LOOP_VINFO_DATAREFS (res).create (10);
 927   LOOP_VINFO_DDRS (res).create (10 * 10);
 928   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 929   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 930              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 931   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 932              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 933   LOOP_VINFO_GROUPED_STORES (res).create (10);
 934   LOOP_VINFO_REDUCTIONS (res).create (10);
 935   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 936   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 937   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 938   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 939   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 940   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 941   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 942
 943   return res;
 944 }
 945
 946
 947 /* Function destroy_loop_vec_info.
 948
 949    Free LOOP_VINFO and clear its loop's AUX field; a NULL LOOP_VINFO is
 950    ignored.  If CLEAN_STMTS, also free the stmt_vec_info of all loop stmts.  */
 951
 952 void
 953 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 954 {
 955   struct loop *loop;
 956   basic_block *bbs;
 957   int nbbs;
 958   gimple_stmt_iterator si;
 959   int j;
 960   vec<slp_instance> slp_instances;
 961   slp_instance instance;
 962   bool swapped;
 963
 964   if (!loop_vinfo)
 965     return;
 966
 967   loop = LOOP_VINFO_LOOP (loop_vinfo);
 968
 969   bbs = LOOP_VINFO_BBS (loop_vinfo);
 970   nbbs = clean_stmts ? loop->num_nodes : 0;
 971   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 972   /* NBBS is 0 unless CLEAN_STMTS, so the walk is skipped in that case.  */
 973   for (j = 0; j < nbbs; j++)
 974     {
 975       basic_block bb = bbs[j];
 976       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 977         free_stmt_vec_info (gsi_stmt (si));
 978
 979       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 980         {
 981           gimple stmt = gsi_stmt (si);
 982
 983           /* We may have broken canonical form by moving a constant
 984              into RHS1 of a commutative op.  Fix such occurrences.  */
 985           if (swapped && is_gimple_assign (stmt))
 986             {
 987               enum tree_code code = gimple_assign_rhs_code (stmt);
 988
 989               if ((code == PLUS_EXPR
 990                    || code == POINTER_PLUS_EXPR
 991                    || code == MULT_EXPR)
 992                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 993                 swap_ssa_operands (stmt,
 994                                    gimple_assign_rhs1_ptr (stmt),
 995                                    gimple_assign_rhs2_ptr (stmt));
 996             }
 997
 998           /* Free stmt_vec_info.  */
 999           free_stmt_vec_info (stmt);
1000           gsi_next (&si);
1001         }
1002     }
1003   /* Release what LOOP_VINFO holds: bbs, data refs, ddrs, vectors, cost data.  */
1004   free (LOOP_VINFO_BBS (loop_vinfo));
1005   vect_destroy_datarefs (loop_vinfo, NULL);
1006   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1007   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1008   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1009   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1010   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1011   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1012     vect_free_slp_instance (instance);
1013
1014   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1015   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1016   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1017   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1018
1019   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1020     LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
1021
1022   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1023   /* LOOP was fetched above, so it stays usable after LOOP_VINFO is freed.  */
1024   free (loop_vinfo);
1025   loop->aux = NULL;
1026 }
1027
1028
1029 /* Function vect_analyze_loop_1.
1030
1031    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1032    for it.  The analyses record their results in that struct.  This is
1033    a subset of the analyses applied in vect_analyze_loop, meant for an
1034    inner-loop nested in the loop that is now considered for (outer-loop)
1035    vectorization.  Return NULL if LOOP does not have a supported form.  */
1036
1037 static loop_vec_info
1038 vect_analyze_loop_1 (struct loop *loop)
1039 {
1040   loop_vec_info inner_vinfo;
1041
1042   if (dump_enabled_p ())
1043     dump_printf_loc (MSG_NOTE, vect_location,
1044                      "===== analyze_loop_nest_1 =====\n");
1045
1046   /* Check the CFG characteristics of the loop (nesting, entry/exit,
1047      etc.).  */
1048   inner_vinfo = vect_analyze_loop_form (loop);
1049   if (inner_vinfo)
1050     return inner_vinfo;
1051
1052   /* The form analysis rejected the loop.  */
1053   if (dump_enabled_p ())
1054     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                      "bad inner-loop form.\n");
1056
1057   return NULL;
1058 }
1059
1060
1061 /* Function vect_analyze_loop_form.
1062
1063    Verify that certain CFG restrictions hold, including:
1064    - the loop has a pre-header, and a single entry and exit
1065    - the loop exit condition is simple enough, and the number of iterations
1066      can be analyzed (a countable loop).
1067    Return the new loop_vec_info (also set in LOOP->aux), or NULL on failure.  */
1068
1069 loop_vec_info
1070 vect_analyze_loop_form (struct loop *loop)
1071 {
1072   loop_vec_info loop_vinfo;
1073   gimple loop_cond;
1074   tree number_of_iterations = NULL;
1075   loop_vec_info inner_loop_vinfo = NULL;
1076
1077   if (dump_enabled_p ())
1078     dump_printf_loc (MSG_NOTE, vect_location,
1079                      "=== vect_analyze_loop_form ===\n");
1080
1081   /* Different restrictions apply when we are considering an inner-most loop,
1082      vs. an outer (nested) loop.
1083      (FORNOW. May want to relax some of these restrictions in the future).  */
1084
1085   if (!loop->inner)
1086     {
1087       /* Inner-most loop.  We currently require that the number of BBs is
1088          exactly 2 (the header and latch).  Vectorizable inner-most loops
1089          look like this:
1090
1091                         (pre-header)
1092                            |
1093                           header <--------+
1094                            | |            |
1095                            | +--> latch --+
1096                            |
1097                         (exit-bb)  */
1098
1099       if (loop->num_nodes != 2)
1100         {
1101           if (dump_enabled_p ())
1102             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103                              "not vectorized: control flow in loop.\n");
1104           return NULL;
1105         }
1106
1107       if (empty_block_p (loop->header))
1108         {
1109           if (dump_enabled_p ())
1110             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1111                              "not vectorized: empty loop.\n");
1112           return NULL;
1113         }
1114     }
1115   else
1116     {
1117       struct loop *innerloop = loop->inner;
1118       edge entryedge;
1119
1120       /* Nested loop. We currently require that the loop is doubly-nested,
1121          contains a single inner loop, and the number of BBs is exactly 5.
1122          Vectorizable outer-loops look like this:
1123
1124                         (pre-header)
1125                            |
1126                           header <---+
1127                            |         |
1128                           inner-loop |
1129                            |         |
1130                           tail ------+
1131                            |
1132                         (exit-bb)
1133
1134          The inner-loop has the properties expected of inner-most loops
1135          as described above.  */
1136
1137       if ((loop->inner)->inner || (loop->inner)->next)
1138         {
1139           if (dump_enabled_p ())
1140             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141                              "not vectorized: multiple nested loops.\n");
1142           return NULL;
1143         }
1144
1145       /* Analyze the inner-loop.  */
1146       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1147       if (!inner_loop_vinfo)
1148         {
1149           if (dump_enabled_p ())
1150             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151                              "not vectorized: Bad inner loop.\n");
1152           return NULL;
1153         }
1154       /* INNER_LOOP_VINFO exists now; the failure paths below must destroy it.  */
1155       if (!expr_invariant_in_loop_p (loop,
1156                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1157         {
1158           if (dump_enabled_p ())
1159             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1160                              "not vectorized: inner-loop count not"
1161                              " invariant.\n");
1162           destroy_loop_vec_info (inner_loop_vinfo, true);
1163           return NULL;
1164         }
1165
1166       if (loop->num_nodes != 5)
1167         {
1168           if (dump_enabled_p ())
1169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170                              "not vectorized: control flow in loop.\n");
1171           destroy_loop_vec_info (inner_loop_vinfo, true);
1172           return NULL;
1173         }
1174       /* The inner-loop entry edge is the header pred that is not the latch.  */
1175       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1176       entryedge = EDGE_PRED (innerloop->header, 0);
1177       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1178         entryedge = EDGE_PRED (innerloop->header, 1);
1179
1180       if (entryedge->src != loop->header
1181           || !single_exit (innerloop)
1182           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1183         {
1184           if (dump_enabled_p ())
1185             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186                              "not vectorized: unsupported outerloop form.\n");
1187           destroy_loop_vec_info (inner_loop_vinfo, true);
1188           return NULL;
1189         }
1190
1191       if (dump_enabled_p ())
1192         dump_printf_loc (MSG_NOTE, vect_location,
1193                          "Considering outer-loop vectorization.\n");
1194     }
1195
1196   if (!single_exit (loop)
1197       || EDGE_COUNT (loop->header->preds) != 2)
1198     {
1199       if (dump_enabled_p ())
1200         {
1201           if (!single_exit (loop))
1202             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1203                              "not vectorized: multiple exits.\n");
1204           else if (EDGE_COUNT (loop->header->preds) != 2)
1205             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206                              "not vectorized: too many incoming edges.\n");
1207         }
1208       if (inner_loop_vinfo)
1209         destroy_loop_vec_info (inner_loop_vinfo, true);
1210       return NULL;
1211     }
1212
1213   /* We assume that the loop exit condition is at the end of the loop. i.e,
1214      that the loop is represented as a do-while (with a proper if-guard
1215      before the loop if needed), where the loop header contains all the
1216      executable statements, and the latch is empty.  */
1217   if (!empty_block_p (loop->latch)
1218       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1219     {
1220       if (dump_enabled_p ())
1221         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222                          "not vectorized: latch block not empty.\n");
1223       if (inner_loop_vinfo)
1224         destroy_loop_vec_info (inner_loop_vinfo, true);
1225       return NULL;
1226     }
1227
1228   /* Make sure there exists a single-predecessor exit bb:  */
1229   if (!single_pred_p (single_exit (loop)->dest))
1230     {
1231       edge e = single_exit (loop);
1232       if (!(e->flags & EDGE_ABNORMAL))
1233         {
1234           split_loop_exit_edge (e);
1235           if (dump_enabled_p ())
1236             dump_printf (MSG_NOTE, "split exit edge.\n");
1237         }
1238       else
1239         {
1240           if (dump_enabled_p ())
1241             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242                              "not vectorized: abnormal loop exit edge.\n");
1243           if (inner_loop_vinfo)
1244             destroy_loop_vec_info (inner_loop_vinfo, true);
1245           return NULL;
1246         }
1247     }
1248   /* Fetch the exit condition and the number of header executions.  */
1249   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1250   if (!loop_cond)
1251     {
1252       if (dump_enabled_p ())
1253         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254                          "not vectorized: complicated exit condition.\n");
1255       if (inner_loop_vinfo)
1256         destroy_loop_vec_info (inner_loop_vinfo, true);
1257       return NULL;
1258     }
1259
1260   if (!number_of_iterations
1261       || chrec_contains_undetermined (number_of_iterations))
1262     {
1263       if (dump_enabled_p ())
1264         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1265                          "not vectorized: number of iterations cannot be "
1266                          "computed.\n");
1267       if (inner_loop_vinfo)
1268         destroy_loop_vec_info (inner_loop_vinfo, true);
1269       return NULL;
1270     }
1271
1272   if (integer_zerop (number_of_iterations))
1273     {
1274       if (dump_enabled_p ())
1275         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276                          "not vectorized: number of iterations = 0.\n");
1277       if (inner_loop_vinfo)
1278         destroy_loop_vec_info (inner_loop_vinfo, true);
1279       return NULL;
1280     }
1281   /* All form checks passed: build the loop_vec_info and record the count.  */
1282   loop_vinfo = new_loop_vec_info (loop);
1283   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1284   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1285
1286   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1287     {
1288       if (dump_enabled_p ())
1289         {
1290           dump_printf_loc (MSG_NOTE, vect_location,
1291                            "Symbolic number of iterations is ");
1292           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1293           dump_printf (MSG_NOTE, "\n");
1294         }
1295     }
1296   /* Tag the exit condition stmt as loop-exit control.  */
1297   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1298   /* Inner stmt_vec_infos now point to LOOP_VINFO, hence CLEAN_STMTS=false.  */
1299   /* CHECKME: May want to keep it around in the future.  */
1300   if (inner_loop_vinfo)
1301     destroy_loop_vec_info (inner_loop_vinfo, false);
1302
1303   gcc_assert (!loop->aux);
1304   loop->aux = loop_vinfo;
1305   return loop_vinfo;
1306 }
1307
1308
1309 /* Function vect_analyze_loop_operations.
1310
1311    Scan the loop stmts and make sure they are all vectorizable.  */
1312
1313 static bool
1314 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1315 {
1316   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1317   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1318   int nbbs = loop->num_nodes;
1319   gimple_stmt_iterator si;
1320   unsigned int vectorization_factor = 0;
1321   int i;
1322   gimple phi;
1323   stmt_vec_info stmt_info;
1324   bool need_to_vectorize = false;
1325   int min_profitable_iters;
1326   int min_scalar_loop_bound;
1327   unsigned int th;
1328   bool only_slp_in_loop = true, ok;
1329   HOST_WIDE_INT max_niter;
1330   HOST_WIDE_INT estimated_niter;
1331   int min_profitable_estimate;
1332
1333   if (dump_enabled_p ())
1334     dump_printf_loc (MSG_NOTE, vect_location,
1335                      "=== vect_analyze_loop_operations ===\n");
1336
1337   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1338   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1339   if (slp)
1340     {
1341       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1342          vectorization factor of the loop is the unrolling factor required by
1343          the SLP instances.  If that unrolling factor is 1, we say, that we
1344          perform pure SLP on loop - cross iteration parallelism is not
1345          exploited.  */
1346       for (i = 0; i < nbbs; i++)
1347         {
1348           basic_block bb = bbs[i];
1349           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1350             {
1351               gimple stmt = gsi_stmt (si);
1352               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1353               gcc_assert (stmt_info);
1354               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1355                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1356                   && !PURE_SLP_STMT (stmt_info))
1357                 /* STMT needs both SLP and loop-based vectorization.  */
1358                 only_slp_in_loop = false;
1359             }
1360         }
1361
1362       if (only_slp_in_loop)
1363         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1364       else
1365         vectorization_factor = least_common_multiple (vectorization_factor,
1366                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1367
1368       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1369       if (dump_enabled_p ())
1370         dump_printf_loc (MSG_NOTE, vect_location,
1371                          "Updating vectorization factor to %d\n",
1372                          vectorization_factor);
1373     }
1374
1375   for (i = 0; i < nbbs; i++)
1376     {
1377       basic_block bb = bbs[i];
1378
1379       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1380         {
1381           phi = gsi_stmt (si);
1382           ok = true;
1383
1384           stmt_info = vinfo_for_stmt (phi);
1385           if (dump_enabled_p ())
1386             {
1387               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1388               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1389               dump_printf (MSG_NOTE, "\n");
1390             }
1391
1392           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1393              (i.e., a phi in the tail of the outer-loop).  */
1394           if (! is_loop_header_bb_p (bb))
1395             {
1396               /* FORNOW: we currently don't support the case that these phis
1397                  are not used in the outerloop (unless it is double reduction,
1398                  i.e., this phi is vect_reduction_def), cause this case
1399                  requires to actually do something here.  */
1400               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1401                    || STMT_VINFO_LIVE_P (stmt_info))
1402                   && STMT_VINFO_DEF_TYPE (stmt_info)
1403                      != vect_double_reduction_def)
1404                 {
1405                   if (dump_enabled_p ())
1406                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407                                      "Unsupported loop-closed phi in "
1408                                      "outer-loop.\n");
1409                   return false;
1410                 }
1411
1412               /* If PHI is used in the outer loop, we check that its operand
1413                  is defined in the inner loop.  */
1414               if (STMT_VINFO_RELEVANT_P (stmt_info))
1415                 {
1416                   tree phi_op;
1417                   gimple op_def_stmt;
1418
1419                   if (gimple_phi_num_args (phi) != 1)
1420                     return false;
1421
1422                   phi_op = PHI_ARG_DEF (phi, 0);
1423                   if (TREE_CODE (phi_op) != SSA_NAME)
1424                     return false;
1425
1426                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1427                   if (gimple_nop_p (op_def_stmt)
1428                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1429                       || !vinfo_for_stmt (op_def_stmt))
1430                     return false;
1431
1432                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1433                         != vect_used_in_outer
1434                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1435                            != vect_used_in_outer_by_reduction)
1436                     return false;
1437                 }
1438
1439               continue;
1440             }
1441
1442           gcc_assert (stmt_info);
1443
1444           if (STMT_VINFO_LIVE_P (stmt_info))
1445             {
1446               /* FORNOW: not yet supported.  */
1447               if (dump_enabled_p ())
1448                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449                                  "not vectorized: value used after loop.\n");
1450               return false;
1451             }
1452
1453           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1454               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1455             {
1456               /* A scalar-dependence cycle that we don't support.  */
1457               if (dump_enabled_p ())
1458                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                                  "not vectorized: scalar dependence cycle.\n");
1460               return false;
1461             }
1462
1463           if (STMT_VINFO_RELEVANT_P (stmt_info))
1464             {
1465               need_to_vectorize = true;
1466               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1467                 ok = vectorizable_induction (phi, NULL, NULL);
1468             }
1469
1470           if (!ok)
1471             {
1472               if (dump_enabled_p ())
1473                 {
1474                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1475                                    "not vectorized: relevant phi not "
1476                                    "supported: ");
1477                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1478                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1479                 }
1480               return false;
1481             }
1482         }
1483
1484       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1485         {
1486           gimple stmt = gsi_stmt (si);
1487           if (!gimple_clobber_p (stmt)
1488               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1489             return false;
1490         }
1491     } /* bbs */
1492
1493   /* All operations in the loop are either irrelevant (deal with loop
1494      control, or dead), or only used outside the loop and can be moved
1495      out of the loop (e.g. invariants, inductions).  The loop can be
1496      optimized away by scalar optimizations.  We're better off not
1497      touching this loop.  */
1498   if (!need_to_vectorize)
1499     {
1500       if (dump_enabled_p ())
1501         dump_printf_loc (MSG_NOTE, vect_location,
1502                          "All the computation can be taken out of the loop.\n");
1503       if (dump_enabled_p ())
1504         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505                          "not vectorized: redundant loop. no profit to "
1506                          "vectorize.\n");
1507       return false;
1508     }
1509
1510   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1511     dump_printf_loc (MSG_NOTE, vect_location,
1512                      "vectorization_factor = %d, niters = "
1513                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1514                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1515
1516   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1517        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1518       || ((max_niter = max_stmt_executions_int (loop)) != -1
1519           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1520     {
1521       if (dump_enabled_p ())
1522         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1523                          "not vectorized: iteration count too small.\n");
1524       if (dump_enabled_p ())
1525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1526                          "not vectorized: iteration count smaller than "
1527                          "vectorization factor.\n");
1528       return false;
1529     }
1530
1531   /* Analyze cost.  Decide if worth while to vectorize.  */
1532
1533   /* Once VF is set, SLP costs should be updated since the number of created
1534      vector stmts depends on VF.  */
1535   vect_update_slp_costs_according_to_vf (loop_vinfo);
1536
1537   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1538                                       &min_profitable_estimate);
1539   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1540
1541   if (min_profitable_iters < 0)
1542     {
1543       if (dump_enabled_p ())
1544         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545                          "not vectorized: vectorization not profitable.\n");
1546       if (dump_enabled_p ())
1547         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1548                          "not vectorized: vector version will never be "
1549                          "profitable.\n");
1550       return false;
1551     }
1552
1553   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1554                             * vectorization_factor) - 1);
1555
1556
1557   /* Use the cost model only if it is more conservative than user specified
1558      threshold.  */
1559
1560   th = (unsigned) min_scalar_loop_bound;
1561   if (min_profitable_iters
1562       && (!min_scalar_loop_bound
1563           || min_profitable_iters > min_scalar_loop_bound))
1564     th = (unsigned) min_profitable_iters;
1565
1566   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1567       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1568     {
1569       if (dump_enabled_p ())
1570         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571                          "not vectorized: vectorization not profitable.\n");
1572       if (dump_enabled_p ())
1573         dump_printf_loc (MSG_NOTE, vect_location,
1574                          "not vectorized: iteration count smaller than user "
1575                          "specified loop bound parameter or minimum profitable "
1576                          "iterations (whichever is more conservative).\n");
1577       return false;
1578     }
1579
1580   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1581       && ((unsigned HOST_WIDE_INT) estimated_niter
1582           <= MAX (th, (unsigned)min_profitable_estimate)))
1583     {
1584       if (dump_enabled_p ())
1585         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1586                          "not vectorized: estimated iteration count too "
1587                          "small.\n");
1588       if (dump_enabled_p ())
1589         dump_printf_loc (MSG_NOTE, vect_location,
1590                          "not vectorized: estimated iteration count smaller "
1591                          "than specified loop bound parameter or minimum "
1592                          "profitable iterations (whichever is more "
1593                          "conservative).\n");
1594       return false;
1595     }
1596
1597   return true;
1598 }
1599
1600
1601 /* Function vect_analyze_loop_2.
1602
1603    Apply a set of analyses on the loop described by LOOP_VINFO, an
1604    already created loop_vec_info.  The different analyses will record
1605    information in the loop_vec_info struct.

        The analyses run in the fixed order below and the function stops at
        the first one that fails.  Return true if all of them succeed and
        false otherwise; when dumping is enabled a reason is dumped for
        every failure except a failing vect_analyze_slp.  */
1606 static bool
1607 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1608 {
1609   bool ok, slp = false;
       /* Bounds on the vectorization factor.  MIN_VF may be adjusted by
          vect_analyze_data_refs and MAX_VF by
          vect_analyze_data_ref_dependences; both are passed by address
          below.  */
1610   int max_vf = MAX_VECTORIZATION_FACTOR;
1611   int min_vf = 2;
1612
1613   /* Find all data references in the loop (which correspond to vdefs/vuses)
1614      and analyze their evolution in the loop.  Also adjust the minimal
1615      vectorization factor according to the loads and stores.
1616
1617      FORNOW: Handle only simple, array references, whose
1618      alignment can be forced, and aligned pointer-references.  */
1619
1620   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1621   if (!ok)
1622     {
1623       if (dump_enabled_p ())
1624         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1625                          "bad data references.\n");
1626       return false;
1627     }
1628
1629   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1630      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1631
1632   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1633   if (!ok)
1634     {
1635       if (dump_enabled_p ())
1636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637                          "bad data access.\n");
1638       return false;
1639     }
1640
1641   /* Classify all cross-iteration scalar data-flow cycles.
1642      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1643
1644   vect_analyze_scalar_cycles (loop_vinfo);
1645
1646   vect_pattern_recog (loop_vinfo, NULL);
1647
1648   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1649
1650   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1651   if (!ok)
1652     {
1653       if (dump_enabled_p ())
1654         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655                          "unexpected pattern.\n");
1656       return false;
1657     }
1658
1659   /* Analyze data dependences between the data-refs in the loop
1660      and adjust the maximum vectorization factor according to
1661      the dependences.
1662      FORNOW: fail at the first data dependence that we encounter.
          The analysis is also treated as failed when the resulting maximum
          is below the minimum recorded by the data-ref analysis.  */
1663
1664   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1665   if (!ok
1666       || max_vf < min_vf)
1667     {
1668       if (dump_enabled_p ())
1669             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670                              "bad data dependence.\n");
1671       return false;
1672     }
1673
1674   ok = vect_determine_vectorization_factor (loop_vinfo);
1675   if (!ok)
1676     {
1677       if (dump_enabled_p ())
1678         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679                          "can't determine vectorization factor.\n");
1680       return false;
1681     }
       /* The vectorization factor that was just determined must not exceed
          the maximum allowed by the data dependences.  */
1682   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1683     {
1684       if (dump_enabled_p ())
1685         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                          "bad data dependence.\n");
1687       return false;
1688     }
1689
1690   /* Analyze the alignment of the data-refs in the loop.
1691      Fail if a data reference is found that cannot be vectorized.  */
1692
1693   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1694   if (!ok)
1695     {
1696       if (dump_enabled_p ())
1697         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698                          "bad data alignment.\n");
1699       return false;
1700     }
1701
1702   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1703      It is important to call pruning after vect_analyze_data_ref_accesses,
1704      since we use grouping information gathered by interleaving analysis.  */
1705   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1706   if (!ok)
1707     {
1708       if (dump_enabled_p ())
1709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                          "too long list of versioning for alias "
1711                          "run-time tests.\n");
1712       return false;
1713     }
1714
1715   /* This pass will decide on using loop versioning and/or loop peeling in
1716      order to enhance the alignment of data references in the loop.  */
1717
1718   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1719   if (!ok)
1720     {
1721       if (dump_enabled_p ())
1722         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1723                          "bad data alignment.\n");
1724       return false;
1725     }
1726
1727   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1728   ok = vect_analyze_slp (loop_vinfo, NULL);
1729   if (ok)
1730     {
1731       /* Decide which possible SLP instances to SLP.  */
1732       slp = vect_make_slp_decision (loop_vinfo);
1733
1734       /* Find stmts that need to be both vectorized and SLPed.  */
1735       vect_detect_hybrid_slp (loop_vinfo);
1736     }
1737   else
         /* Unlike the other failure paths, nothing is dumped here.  */
1738     return false;
1739
1740   /* Scan all the operations in the loop and make sure they are
1741      vectorizable.  */
1742
1743   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1744   if (!ok)
1745     {
1746       if (dump_enabled_p ())
1747         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1748                          "bad operation or unsupported loop bound.\n");
1749       return false;
1750     }
1751
1752   /* Decide whether we need to create an epilogue loop to handle
1753      remaining scalar iterations.  With a known iteration count and a
          positive peeling-for-alignment amount, an epilogue is needed when
          the count left after peeling has fewer trailing zero bits than
          log2 of the vectorization factor (i.e. it is not a multiple of the
          vectorization factor, assuming that factor is a power of two).
          Otherwise an epilogue is needed when
          LOOP_VINFO_PEELING_FOR_ALIGNMENT is nonzero or when
          LOOP_VINFO_NITERS has too few known trailing zero bits.  */
1754   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1755       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1756     {
1757       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1758                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1759           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1760         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1761     }
1762   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1763            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1764                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
1765     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1766
1767   /* If an epilogue loop is required make sure we can create one.  */
1768   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1769       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1770     {
1771       if (dump_enabled_p ())
1772         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1773       if (!vect_can_advance_ivs_p (loop_vinfo)
1774           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1775                                            single_exit (LOOP_VINFO_LOOP
1776                                                          (loop_vinfo))))
1777         {
1778           if (dump_enabled_p ())
1779             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780                              "not vectorized: can't create required "
1781                              "epilog loop\n");
1782           return false;
1783         }
1784     }
1785
1786   return true;
1787 }
1788
1789 /* Function vect_analyze_loop.
1790
1791    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1792    for it.  The different analyses will record information in the
1793    loop_vec_info struct.

        Return the loop_vec_info, with LOOP_VINFO_VECTORIZABLE_P set, as
        soon as vect_analyze_loop_2 succeeds for some vector size.  Return
        NULL if the outer loop already has a loop_vec_info marked
        vectorizable, if vect_analyze_loop_form fails, or if no vector size
        is left to try; the loop_vec_info of each failed attempt is
        destroyed before the next one is built.  */
1794 loop_vec_info
1795 vect_analyze_loop (struct loop *loop)
1796 {
1797   loop_vec_info loop_vinfo;
       /* Used below as a bitmask of the candidate vector sizes that have
          not been tried yet.  */
1798   unsigned int vector_sizes;
1799
1800   /* Autodetect first vector size we try.  */
1801   current_vector_size = 0;
1802   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1803
1804   if (dump_enabled_p ())
1805     dump_printf_loc (MSG_NOTE, vect_location,
1806                      "===== analyze_loop_nest =====\n");
1807
1808   if (loop_outer (loop)
1809       && loop_vec_info_for_loop (loop_outer (loop))
1810       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1811     {
1812       if (dump_enabled_p ())
1813         dump_printf_loc (MSG_NOTE, vect_location,
1814                          "outer-loop already vectorized.\n");
1815       return NULL;
1816     }
1817
       /* One iteration per vector size tried; left only through the return
          statements inside.  */
1818   while (1)
1819     {
1820       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1821       loop_vinfo = vect_analyze_loop_form (loop);
1822       if (!loop_vinfo)
1823         {
1824           if (dump_enabled_p ())
1825             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826                              "bad loop form.\n");
1827           return NULL;
1828         }
1829
1830       if (vect_analyze_loop_2 (loop_vinfo))
1831         {
1832           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1833
1834           return loop_vinfo;
1835         }
1836
1837       destroy_loop_vec_info (loop_vinfo, true);
1838
           /* Remove the size that just failed from the candidates.  Give up
              when none is left, or when current_vector_size is still zero
              (NOTE(review): presumably no vector size was chosen during
              the failed analysis -- confirm).  */
1839       vector_sizes &= ~current_vector_size;
1840       if (vector_sizes == 0
1841           || current_vector_size == 0)
1842         return NULL;
1843
1844       /* Try the next biggest vector size, i.e. the highest bit still set
              in VECTOR_SIZES.  */
1845       current_vector_size = 1 << floor_log2 (vector_sizes);
1846       if (dump_enabled_p ())
1847         dump_printf_loc (MSG_NOTE, vect_location,
1848                          "***** Re-trying analysis with "
1849                          "vector size %d\n", current_vector_size);
1850     }
1851 }
1852
1853
1854 /* Function reduction_code_for_scalar_code
1855
1856    Input:
1857    CODE - tree_code of a reduction operation.
1858
1859    Output:
1860    REDUC_CODE - the corresponding tree-code to be used to reduce the
1861       vector of partial results into a single scalar result (which
1862       will also reside in a vector) or ERROR_MARK if the operation is
1863       a supported reduction operation, but does not have such tree-code.
           *REDUC_CODE is written only when TRUE is returned.
1864
1865    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1866
1867 static bool
1868 reduction_code_for_scalar_code (enum tree_code code,
1869                                 enum tree_code *reduc_code)
1870 {
1871   switch (code)
1872     {
           /* Operations that have a dedicated reduction tree code.  */
1873       case MAX_EXPR:
1874         *reduc_code = REDUC_MAX_EXPR;
1875         return true;
1876
1877       case MIN_EXPR:
1878         *reduc_code = REDUC_MIN_EXPR;
1879         return true;
1880
1881       case PLUS_EXPR:
1882         *reduc_code = REDUC_PLUS_EXPR;
1883         return true;
1884
           /* Supported reductions without a dedicated reduction tree code;
              this is signalled to the caller with ERROR_MARK.  */
1885       case MULT_EXPR:
1886       case MINUS_EXPR:
1887       case BIT_IOR_EXPR:
1888       case BIT_XOR_EXPR:
1889       case BIT_AND_EXPR:
1890         *reduc_code = ERROR_MARK;
1891         return true;
1892
           /* Anything else is not handled as a reduction; *REDUC_CODE is
              left untouched.  */
1893       default:
1894        return false;
1895     }
1896 }
1897
1898
1899 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1900    STMT is printed with a message MSG: MSG is written first at vect_location,
        followed by STMT in TDF_SLIM form and a newline, all using MSG_TYPE as
        the dump kind.  This helper does not test dump_enabled_p () itself.  */
1901
1902 static void
1903 report_vect_op (int msg_type, gimple stmt, const char *msg)
1904 {
       /* MSG is passed as an argument of "%s" rather than as the format
          string itself.  */
1905   dump_printf_loc (msg_type, vect_location, "%s", msg);
1906   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1907   dump_printf (msg_type, "\n");
1908 }
1909
1910
1911 /* Detect SLP reduction of the form:
1912
1913    #a1 = phi <a5, a0>
1914    a2 = operation (a1)
1915    a3 = operation (a2)
1916    a4 = operation (a3)
1917    a5 = operation (a4)
1918
1919    #a = phi <a5>
1920
1921    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1922    FIRST_STMT is the first reduction stmt in the chain
1923    (a2 = operation (a1)).  Only its rhs code is used here: every
        statement of the chain must be an assignment with that same code.
1924
1925    Return TRUE if a reduction chain was detected.  In that case the
        chain statements are linked through GROUP_FIRST_ELEMENT and
        GROUP_NEXT_ELEMENT, GROUP_SIZE of the first statement is set to the
        chain length, and the first statement is pushed onto
        LOOP_VINFO_REDUCTION_CHAINS (LOOP_INFO).  Operands of chain
        statements may have been swapped so that the reduction operand is
        the second operand.

        Return FALSE otherwise, including when PHI is not in the loop being
        vectorized or the chain has fewer than two statements.  Group links
        and operand swaps done before the failure was detected are not
        undone.  */
1926
1927 static bool
1928 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1929 {
1930   struct loop *loop = (gimple_bb (phi))->loop_father;
1931   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1932   enum tree_code code;
1933   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1934   stmt_vec_info use_stmt_info, current_stmt_info;
1935   tree lhs;
1936   imm_use_iterator imm_iter;
1937   use_operand_p use_p;
1938   int nloop_uses, size = 0, n_out_of_loop_uses;
1939   bool found = false;
1940
       /* Only a PHI that belongs to the loop being vectorized is handled.  */
1941   if (loop != vect_loop)
1942     return false;
1943
       /* Walk the chain: starting from the phi result, follow the single
          in-loop use of LHS until a use by PHI itself is reached.  */
1944   lhs = PHI_RESULT (phi);
1945   code = gimple_assign_rhs_code (first_stmt);
1946   while (1)
1947     {
1948       nloop_uses = 0;
1949       n_out_of_loop_uses = 0;
1950       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1951         {
1952           gimple use_stmt = USE_STMT (use_p);
1953           if (is_gimple_debug (use_stmt))
1954             continue;
1955
               /* Redundant: USE_STMT already holds this value.  */
1956           use_stmt = USE_STMT (use_p);
1957
1958           /* Check if we got back to the reduction phi.  */
1959           if (use_stmt == phi)
1960             {
1961               loop_use_stmt = use_stmt;
1962               found = true;
1963               break;
1964             }
1965
               /* Among the uses inside the loop, count only statements that
                  have a stmt_vec_info and are not marked
                  STMT_VINFO_IN_PATTERN_P.  */
1966           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1967             {
1968               if (vinfo_for_stmt (use_stmt)
1969                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1970                 {
1971                   loop_use_stmt = use_stmt;
1972                   nloop_uses++;
1973                 }
1974             }
1975            else
1976              n_out_of_loop_uses++;
1977
1978            /* There can be either a single use in the loop or two uses in
1979               phi nodes.  */
1980            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1981              return false;
1982         }
1983
1984       if (found)
1985         break;
1986
1987       /* We reached a statement with no loop uses.  */
1988       if (nloop_uses == 0)
1989         return false;
1990
1991       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1992       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1993         return false;
1994
           /* Every chain element must be an assignment inside the loop that
              uses the same operation as FIRST_STMT.  */
1995       if (!is_gimple_assign (loop_use_stmt)
1996           || code != gimple_assign_rhs_code (loop_use_stmt)
1997           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1998         return false;
1999
2000       /* Insert USE_STMT into reduction chain.  The first element points
              to itself as GROUP_FIRST_ELEMENT; later elements inherit it
              from their predecessor.  */
2001       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2002       if (current_stmt)
2003         {
2004           current_stmt_info = vinfo_for_stmt (current_stmt);
2005           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2006           GROUP_FIRST_ELEMENT (use_stmt_info)
2007             = GROUP_FIRST_ELEMENT (current_stmt_info);
2008         }
2009       else
2010         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2011
2012       lhs = gimple_assign_lhs (loop_use_stmt);
2013       current_stmt = loop_use_stmt;
2014       size++;
2015    }
2016
       /* The loop above is only left with FOUND set and LOOP_USE_STMT equal
          to PHI, so in effect this rejects chains of fewer than two
          statements.  */
2017   if (!found || loop_use_stmt != phi || size < 2)
2018     return false;
2019
2020   /* Swap the operands, if needed, to make the reduction operand be the second
2021      operand.  LHS tracks the value produced by the previous chain element,
          starting with the phi result.  */
2022   lhs = PHI_RESULT (phi);
2023   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2024   while (next_stmt)
2025     {
           /* The reduction operand is already second: only validate the
              other operand, then move on to the next chain element.  */
2026       if (gimple_assign_rhs2 (next_stmt) == lhs)
2027         {
2028           tree op = gimple_assign_rhs1 (next_stmt);
2029           gimple def_stmt = NULL;
2030
2031           if (TREE_CODE (op) == SSA_NAME)
2032             def_stmt = SSA_NAME_DEF_STMT (op);
2033
2034           /* Check that the other def is either defined in the loop
2035              ("vect_internal_def"), or it's an induction (defined by a
2036              loop-header phi-node).  */
2037           if (def_stmt
2038               && gimple_bb (def_stmt)
2039               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2040               && (is_gimple_assign (def_stmt)
2041                   || is_gimple_call (def_stmt)
2042                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2043                            == vect_induction_def
2044                   || (gimple_code (def_stmt) == GIMPLE_PHI
2045                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2046                                   == vect_internal_def
2047                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2048             {
2049               lhs = gimple_assign_lhs (next_stmt);
2050               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2051               continue;
2052             }
2053
2054           return false;
2055         }
           /* Otherwise the second operand is treated as the other operand:
              validate it the same way and, if acceptable, swap the two
              operands.  */
2056       else
2057         {
2058           tree op = gimple_assign_rhs2 (next_stmt);
2059           gimple def_stmt = NULL;
2060
2061           if (TREE_CODE (op) == SSA_NAME)
2062             def_stmt = SSA_NAME_DEF_STMT (op);
2063
2064           /* Check that the other def is either defined in the loop
2065             ("vect_internal_def"), or it's an induction (defined by a
2066             loop-header phi-node).  */
2067           if (def_stmt
2068               && gimple_bb (def_stmt)
2069               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2070               && (is_gimple_assign (def_stmt)
2071                   || is_gimple_call (def_stmt)
2072                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2073                               == vect_induction_def
2074                   || (gimple_code (def_stmt) == GIMPLE_PHI
2075                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2076                                   == vect_internal_def
2077                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2078             {
2079               if (dump_enabled_p ())
2080                 {
2081                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2082                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2083                   dump_printf (MSG_NOTE, "\n");
2084                 }
2085
2086               swap_ssa_operands (next_stmt,
2087                                  gimple_assign_rhs1_ptr (next_stmt),
2088                                  gimple_assign_rhs2_ptr (next_stmt));
2089               update_stmt (next_stmt);
2090
                   /* Record that a swap left a constant as first operand.
                      NOTE(review): presumably so that the operand order can
                      be restored later -- confirm against the users of
                      LOOP_VINFO_OPERANDS_SWAPPED.  */
2091               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2092                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2093             }
2094           else
2095             return false;
2096         }
2097
2098       lhs = gimple_assign_lhs (next_stmt);
2099       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2100     }
2101
2102   /* Save the chain for further analysis in SLP detection.  */
2103   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2104   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2105   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2106
2107   return true;
2108 }
2109
2110
2111 /* Function vect_is_simple_reduction_1
2112
2113    (1) Detect a cross-iteration def-use cycle that represents a simple
2114    reduction computation.  We look for the following pattern:
2115
2116    loop_header:
2117      a1 = phi < a0, a2 >
2118      a3 = ...
2119      a2 = operation (a3, a1)
2120
2121    or
2122
2123    a3 = ...
2124    loop_header:
2125      a1 = phi < a0, a2 >
2126      a2 = operation (a3, a1)
2127
2128    such that:
2129    1. operation is commutative and associative and it is safe to
2130       change the order of the computation (if CHECK_REDUCTION is true)
2131    2. no uses for a2 in the loop (a2 is used out of the loop)
2132    3. no uses of a1 in the loop besides the reduction operation
2133    4. no uses of a1 outside the loop.
2134
2135    Conditions 1,4 are tested here.
2136    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2137
2138    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2139    nested cycles, if CHECK_REDUCTION is false.
2140
2141    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2142    reductions:
2143
2144      a1 = phi < a0, a2 >
2145      inner loop (def of a3)
2146      a2 = phi < a3 >
2147
2148    If MODIFY is true it tries also to rework the code in-place to enable
2149    detection of more reduction patterns.  For the time being we rewrite
2150    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2151 */
2152
2153 static gimple
2154 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2155                             bool check_reduction, bool *double_reduc,
2156                             bool modify)
2157 {
2158   struct loop *loop = (gimple_bb (phi))->loop_father;
2159   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2160   edge latch_e = loop_latch_edge (loop);
2161   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2162   gimple def_stmt, def1 = NULL, def2 = NULL;
2163   enum tree_code orig_code, code;
2164   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2165   tree type;
2166   int nloop_uses;
2167   tree name;
2168   imm_use_iterator imm_iter;
2169   use_operand_p use_p;
2170   bool phi_def;
2171
2172   *double_reduc = false;
2173
2174   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2175      otherwise, we assume outer loop vectorization.  */
2176   gcc_assert ((check_reduction && loop == vect_loop)
2177               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2178
2179   name = PHI_RESULT (phi);
2180   nloop_uses = 0;
2181   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2182     {
2183       gimple use_stmt = USE_STMT (use_p);
2184       if (is_gimple_debug (use_stmt))
2185         continue;
2186
2187       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2188         {
2189           if (dump_enabled_p ())
2190             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191                              "intermediate value used outside loop.\n");
2192
2193           return NULL;
2194         }
2195
2196       if (vinfo_for_stmt (use_stmt)
2197           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2198         nloop_uses++;
2199       if (nloop_uses > 1)
2200         {
2201           if (dump_enabled_p ())
2202             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203                              "reduction used in loop.\n");
2204           return NULL;
2205         }
2206     }
2207
2208   if (TREE_CODE (loop_arg) != SSA_NAME)
2209     {
2210       if (dump_enabled_p ())
2211         {
2212           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2213                            "reduction: not ssa_name: ");
2214           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2215           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2216         }
2217       return NULL;
2218     }
2219
2220   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2221   if (!def_stmt)
2222     {
2223       if (dump_enabled_p ())
2224         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2225                          "reduction: no def_stmt.\n");
2226       return NULL;
2227     }
2228
2229   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2230     {
2231       if (dump_enabled_p ())
2232         {
2233           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2234           dump_printf (MSG_NOTE, "\n");
2235         }
2236       return NULL;
2237     }
2238
2239   if (is_gimple_assign (def_stmt))
2240     {
2241       name = gimple_assign_lhs (def_stmt);
2242       phi_def = false;
2243     }
2244   else
2245     {
2246       name = PHI_RESULT (def_stmt);
2247       phi_def = true;
2248     }
2249
2250   nloop_uses = 0;
2251   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2252     {
2253       gimple use_stmt = USE_STMT (use_p);
2254       if (is_gimple_debug (use_stmt))
2255         continue;
2256       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2257           && vinfo_for_stmt (use_stmt)
2258           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2259         nloop_uses++;
2260       if (nloop_uses > 1)
2261         {
2262           if (dump_enabled_p ())
2263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264                              "reduction used in loop.\n");
2265           return NULL;
2266         }
2267     }
2268
2269   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2270      defined in the inner loop.  */
2271   if (phi_def)
2272     {
2273       op1 = PHI_ARG_DEF (def_stmt, 0);
2274
2275       if (gimple_phi_num_args (def_stmt) != 1
2276           || TREE_CODE (op1) != SSA_NAME)
2277         {
2278           if (dump_enabled_p ())
2279             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280                              "unsupported phi node definition.\n");
2281
2282           return NULL;
2283         }
2284
2285       def1 = SSA_NAME_DEF_STMT (op1);
2286       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2287           && loop->inner
2288           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2289           && is_gimple_assign (def1))
2290         {
2291           if (dump_enabled_p ())
2292             report_vect_op (MSG_NOTE, def_stmt,
2293                             "detected double reduction: ");
2294
2295           *double_reduc = true;
2296           return def_stmt;
2297         }
2298
2299       return NULL;
2300     }
2301
2302   code = orig_code = gimple_assign_rhs_code (def_stmt);
2303
2304   /* We can handle "res -= x[i]", which is non-associative by
2305      simply rewriting this into "res += -x[i]".  Avoid changing
2306      gimple instruction for the first simple tests and only do this
2307      if we're allowed to change code at all.  */
2308   if (code == MINUS_EXPR
2309       && modify
2310       && (op1 = gimple_assign_rhs1 (def_stmt))
2311       && TREE_CODE (op1) == SSA_NAME
2312       && SSA_NAME_DEF_STMT (op1) == phi)
2313     code = PLUS_EXPR;
2314
2315   if (check_reduction
2316       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2317     {
2318       if (dump_enabled_p ())
2319         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2320                         "reduction: not commutative/associative: ");
2321       return NULL;
2322     }
2323
2324   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2325     {
2326       if (code != COND_EXPR)
2327         {
2328           if (dump_enabled_p ())
2329             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2330                             "reduction: not binary operation: ");
2331
2332           return NULL;
2333         }
2334
2335       op3 = gimple_assign_rhs1 (def_stmt);
2336       if (COMPARISON_CLASS_P (op3))
2337         {
2338           op4 = TREE_OPERAND (op3, 1);
2339           op3 = TREE_OPERAND (op3, 0);
2340         }
2341
2342       op1 = gimple_assign_rhs2 (def_stmt);
2343       op2 = gimple_assign_rhs3 (def_stmt);
2344
2345       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2346         {
2347           if (dump_enabled_p ())
2348             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2349                             "reduction: uses not ssa_names: ");
2350
2351           return NULL;
2352         }
2353     }
2354   else
2355     {
2356       op1 = gimple_assign_rhs1 (def_stmt);
2357       op2 = gimple_assign_rhs2 (def_stmt);
2358
2359       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2360         {
2361           if (dump_enabled_p ())
2362             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2363                             "reduction: uses not ssa_names: ");
2364
2365           return NULL;
2366         }
2367    }
2368
2369   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2370   if ((TREE_CODE (op1) == SSA_NAME
2371        && !types_compatible_p (type,TREE_TYPE (op1)))
2372       || (TREE_CODE (op2) == SSA_NAME
2373           && !types_compatible_p (type, TREE_TYPE (op2)))
2374       || (op3 && TREE_CODE (op3) == SSA_NAME
2375           && !types_compatible_p (type, TREE_TYPE (op3)))
2376       || (op4 && TREE_CODE (op4) == SSA_NAME
2377           && !types_compatible_p (type, TREE_TYPE (op4))))
2378     {
2379       if (dump_enabled_p ())
2380         {
2381           dump_printf_loc (MSG_NOTE, vect_location,
2382                            "reduction: multiple types: operation type: ");
2383           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2384           dump_printf (MSG_NOTE, ", operands types: ");
2385           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2386                              TREE_TYPE (op1));
2387           dump_printf (MSG_NOTE, ",");
2388           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2389                              TREE_TYPE (op2));
2390           if (op3)
2391             {
2392               dump_printf (MSG_NOTE, ",");
2393               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2394                                  TREE_TYPE (op3));
2395             }
2396
2397           if (op4)
2398             {
2399               dump_printf (MSG_NOTE, ",");
2400               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2401                                  TREE_TYPE (op4));
2402             }
2403           dump_printf (MSG_NOTE, "\n");
2404         }
2405
2406       return NULL;
2407     }
2408
2409   /* Check that it's ok to change the order of the computation.
2410      Generally, when vectorizing a reduction we change the order of the
2411      computation.  This may change the behavior of the program in some
2412      cases, so we need to check that this is ok.  One exception is when
2413      vectorizing an outer-loop: the inner-loop is executed sequentially,
2414      and therefore vectorizing reductions in the inner-loop during
2415      outer-loop vectorization is safe.  */
2416
2417   /* CHECKME: check for !flag_finite_math_only too?  */
2418   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2419       && check_reduction)
2420     {
2421       /* Changing the order of operations changes the semantics.  */
2422       if (dump_enabled_p ())
2423         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2424                         "reduction: unsafe fp math optimization: ");
2425       return NULL;
2426     }
2427   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2428            && check_reduction)
2429     {
2430       /* Changing the order of operations changes the semantics.  */
2431       if (dump_enabled_p ())
2432         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2433                         "reduction: unsafe int math optimization: ");
2434       return NULL;
2435     }
2436   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2437     {
2438       /* Changing the order of operations changes the semantics.  */
2439       if (dump_enabled_p ())
2440         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2441                         "reduction: unsafe fixed-point math optimization: ");
2442       return NULL;
2443     }
2444
2445   /* If we detected "res -= x[i]" earlier, rewrite it into
2446      "res += -x[i]" now.  If this turns out to be useless reassoc
2447      will clean it up again.  */
2448   if (orig_code == MINUS_EXPR)
2449     {
2450       tree rhs = gimple_assign_rhs2 (def_stmt);
2451       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2452       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2453                                                          rhs, NULL);
2454       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2455       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2456                                                           loop_info, NULL));
2457       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2458       gimple_assign_set_rhs2 (def_stmt, negrhs);
2459       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2460       update_stmt (def_stmt);
2461     }
2462
2463   /* Reduction is safe. We're dealing with one of the following:
2464      1) integer arithmetic and no trapv
2465      2) floating point arithmetic, and special flags permit this optimization
2466      3) nested cycle (i.e., outer loop vectorization).  */
2467   if (TREE_CODE (op1) == SSA_NAME)
2468     def1 = SSA_NAME_DEF_STMT (op1);
2469
2470   if (TREE_CODE (op2) == SSA_NAME)
2471     def2 = SSA_NAME_DEF_STMT (op2);
2472
2473   if (code != COND_EXPR
2474       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2475     {
2476       if (dump_enabled_p ())
2477         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2478       return NULL;
2479     }
2480
2481   /* Check that one def is the reduction def, defined by PHI,
2482      the other def is either defined in the loop ("vect_internal_def"),
2483      or it's an induction (defined by a loop-header phi-node).  */
2484
2485   if (def2 && def2 == phi
2486       && (code == COND_EXPR
2487           || !def1 || gimple_nop_p (def1)
2488           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2489           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2490               && (is_gimple_assign (def1)
2491                   || is_gimple_call (def1)
2492                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2493                       == vect_induction_def
2494                   || (gimple_code (def1) == GIMPLE_PHI
2495                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2496                           == vect_internal_def
2497                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2498     {
2499       if (dump_enabled_p ())
2500         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2501       return def_stmt;
2502     }
2503
2504   if (def1 && def1 == phi
2505       && (code == COND_EXPR
2506           || !def2 || gimple_nop_p (def2)
2507           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2508           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2509               && (is_gimple_assign (def2)
2510                   || is_gimple_call (def2)
2511                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2512                       == vect_induction_def
2513                   || (gimple_code (def2) == GIMPLE_PHI
2514                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2515                           == vect_internal_def
2516                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2517     {
2518       if (check_reduction)
2519         {
2520           /* Swap operands (just for simplicity - so that the rest of the code
2521              can assume that the reduction variable is always the last (second)
2522              argument).  */
2523           if (dump_enabled_p ())
2524             report_vect_op (MSG_NOTE, def_stmt,
2525                             "detected reduction: need to swap operands: ");
2526
2527           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2528                              gimple_assign_rhs2_ptr (def_stmt));
2529
2530           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2531             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2532         }
2533       else
2534         {
2535           if (dump_enabled_p ())
2536             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2537         }
2538
2539       return def_stmt;
2540     }
2541
2542   /* Try to find SLP reduction chain.  */
2543   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2544     {
2545       if (dump_enabled_p ())
2546         report_vect_op (MSG_NOTE, def_stmt,
2547                         "reduction: detected reduction chain: ");
2548
2549       return def_stmt;
2550     }
2551
2552   if (dump_enabled_p ())
2553     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2554                     "reduction: unknown pattern: ");
2555
2556   return NULL;
2557 }
2558
2559 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2560    in-place.  Arguments as there.  */
2561
2562 static gimple
2563 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2564                           bool check_reduction, bool *double_reduc)
2565 {
2566   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2567                                      double_reduc, false);
2568 }
2569
2570 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2571    in-place if it enables detection of more reductions.  Arguments
2572    as there.  */
2573
2574 gimple
2575 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2576                           bool check_reduction, bool *double_reduc)
2577 {
2578   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2579                                      double_reduc, true);
2580 }
2581
2582 /* Calculate the cost of one scalar iteration of the loop.  */
2583 int
2584 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2585 {
2586   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2587   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2588   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2589   int innerloop_iters, i, stmt_cost;
2590
2591   /* Count statements in scalar loop.  Using this as scalar cost for a single
2592      iteration for now.
2593
2594      TODO: Add outer loop support.
2595
2596      TODO: Consider assigning different costs to different scalar
2597      statements.  */
2598
2599   /* FORNOW.  */
2600   innerloop_iters = 1;
2601   if (loop->inner)
2602     innerloop_iters = 50; /* FIXME */
2603
2604   for (i = 0; i < nbbs; i++)
2605     {
2606       gimple_stmt_iterator si;
2607       basic_block bb = bbs[i];
2608
2609       if (bb->loop_father == loop->inner)
2610         factor = innerloop_iters;
2611       else
2612         factor = 1;
2613
2614       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2615         {
2616           gimple stmt = gsi_stmt (si);
2617           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2618
2619           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2620             continue;
2621
2622           /* Skip stmts that are not vectorized inside the loop.  */
2623           if (stmt_info
2624               && !STMT_VINFO_RELEVANT_P (stmt_info)
2625               && (!STMT_VINFO_LIVE_P (stmt_info)
2626                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2627               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2628             continue;
2629
2630           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2631             {
2632               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2633                stmt_cost = vect_get_stmt_cost (scalar_load);
2634              else
2635                stmt_cost = vect_get_stmt_cost (scalar_store);
2636             }
2637           else
2638             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2639
2640           scalar_single_iter_cost += stmt_cost * factor;
2641         }
2642     }
2643   return scalar_single_iter_cost;
2644 }
2645
2646 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2647 int
2648 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2649                              int *peel_iters_epilogue,
2650                              int scalar_single_iter_cost,
2651                              stmt_vector_for_cost *prologue_cost_vec,
2652                              stmt_vector_for_cost *epilogue_cost_vec)
2653 {
2654   int retval = 0;
2655   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2656
2657   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2658     {
2659       *peel_iters_epilogue = vf/2;
2660       if (dump_enabled_p ())
2661         dump_printf_loc (MSG_NOTE, vect_location,
2662                          "cost model: epilogue peel iters set to vf/2 "
2663                          "because loop iterations are unknown .\n");
2664
2665       /* If peeled iterations are known but number of scalar loop
2666          iterations are unknown, count a taken branch per peeled loop.  */
2667       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2668                                  NULL, 0, vect_prologue);
2669     }
2670   else
2671     {
2672       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2673       peel_iters_prologue = niters < peel_iters_prologue ?
2674                             niters : peel_iters_prologue;
2675       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2676       /* If we need to peel for gaps, but no peeling is required, we have to
2677          peel VF iterations.  */
2678       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2679         *peel_iters_epilogue = vf;
2680     }
2681
2682   if (peel_iters_prologue)
2683     retval += record_stmt_cost (prologue_cost_vec,
2684                                 peel_iters_prologue * scalar_single_iter_cost,
2685                                 scalar_stmt, NULL, 0, vect_prologue);
2686   if (*peel_iters_epilogue)
2687     retval += record_stmt_cost (epilogue_cost_vec,
2688                                 *peel_iters_epilogue * scalar_single_iter_cost,
2689                                 scalar_stmt, NULL, 0, vect_epilogue);
2690   return retval;
2691 }
2692
2693 /* Function vect_estimate_min_profitable_iters
2694
2695    Return the number of iterations required for the vector version of the
2696    loop to be profitable relative to the cost of the scalar version of the
2697    loop.  */
2698
2699 static void
2700 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2701                                     int *ret_min_profitable_niters,
2702                                     int *ret_min_profitable_estimate)
2703 {
2704   int min_profitable_iters;
2705   int min_profitable_estimate;
2706   int peel_iters_prologue;
2707   int peel_iters_epilogue;
2708   unsigned vec_inside_cost = 0;
2709   int vec_outside_cost = 0;
2710   unsigned vec_prologue_cost = 0;
2711   unsigned vec_epilogue_cost = 0;
2712   int scalar_single_iter_cost = 0;
2713   int scalar_outside_cost = 0;
2714   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2715   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2716   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2717
2718   /* Cost model disabled.  */
2719   if (unlimited_cost_model ())
2720     {
2721       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2722       *ret_min_profitable_niters = 0;
2723       *ret_min_profitable_estimate = 0;
2724       return;
2725     }
2726
2727   /* Requires loop versioning tests to handle misalignment.  */
2728   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2729     {
2730       /*  FIXME: Make cost depend on complexity of individual check.  */
2731       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2732       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2733                             vect_prologue);
2734       dump_printf (MSG_NOTE,
2735                    "cost model: Adding cost of checks for loop "
2736                    "versioning to treat misalignment.\n");
2737     }
2738
2739   /* Requires loop versioning with alias checks.  */
2740   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2741     {
2742       /*  FIXME: Make cost depend on complexity of individual check.  */
2743       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2744       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2745                             vect_prologue);
2746       dump_printf (MSG_NOTE,
2747                    "cost model: Adding cost of checks for loop "
2748                    "versioning aliasing.\n");
2749     }
2750
2751   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2752       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2753     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2754                           vect_prologue);
2755
2756   /* Count statements in scalar loop.  Using this as scalar cost for a single
2757      iteration for now.
2758
2759      TODO: Add outer loop support.
2760
2761      TODO: Consider assigning different costs to different scalar
2762      statements.  */
2763
2764   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2765
2766   /* Add additional cost for the peeled instructions in prologue and epilogue
2767      loop.
2768
2769      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2770      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2771
2772      TODO: Build an expression that represents peel_iters for prologue and
2773      epilogue to be used in a run-time test.  */
2774
2775   if (npeel  < 0)
2776     {
2777       peel_iters_prologue = vf/2;
2778       dump_printf (MSG_NOTE, "cost model: "
2779                    "prologue peel iters set to vf/2.\n");
2780
2781       /* If peeling for alignment is unknown, loop bound of main loop becomes
2782          unknown.  */
2783       peel_iters_epilogue = vf/2;
2784       dump_printf (MSG_NOTE, "cost model: "
2785                    "epilogue peel iters set to vf/2 because "
2786                    "peeling for alignment is unknown.\n");
2787
2788       /* If peeled iterations are unknown, count a taken branch and a not taken
2789          branch per peeled loop. Even if scalar loop iterations are known,
2790          vector iterations are not known since peeled prologue iterations are
2791          not known. Hence guards remain the same.  */
2792       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2793                             NULL, 0, vect_prologue);
2794       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2795                             NULL, 0, vect_prologue);
2796       /* FORNOW: Don't attempt to pass individual scalar instructions to
2797          the model; just assume linear cost for scalar iterations.  */
2798       (void) add_stmt_cost (target_cost_data,
2799                             peel_iters_prologue * scalar_single_iter_cost,
2800                             scalar_stmt, NULL, 0, vect_prologue);
2801       (void) add_stmt_cost (target_cost_data,
2802                             peel_iters_epilogue * scalar_single_iter_cost,
2803                             scalar_stmt, NULL, 0, vect_epilogue);
2804     }
2805   else
2806     {
2807       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2808       stmt_info_for_cost *si;
2809       int j;
2810       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2811
2812       prologue_cost_vec.create (2);
2813       epilogue_cost_vec.create (2);
2814       peel_iters_prologue = npeel;
2815
2816       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2817                                           &peel_iters_epilogue,
2818                                           scalar_single_iter_cost,
2819                                           &prologue_cost_vec,
2820                                           &epilogue_cost_vec);
2821
2822       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2823         {
2824           struct _stmt_vec_info *stmt_info
2825             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2826           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2827                                 si->misalign, vect_prologue);
2828         }
2829
2830       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2831         {
2832           struct _stmt_vec_info *stmt_info
2833             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2834           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2835                                 si->misalign, vect_epilogue);
2836         }
2837
2838       prologue_cost_vec.release ();
2839       epilogue_cost_vec.release ();
2840     }
2841
2842   /* FORNOW: The scalar outside cost is incremented in one of the
2843      following ways:
2844
2845      1. The vectorizer checks for alignment and aliasing and generates
2846      a condition that allows dynamic vectorization.  A cost model
2847      check is ANDED with the versioning condition.  Hence scalar code
2848      path now has the added cost of the versioning check.
2849
2850        if (cost > th & versioning_check)
2851          jmp to vector code
2852
2853      Hence run-time scalar is incremented by not-taken branch cost.
2854
2855      2. The vectorizer then checks if a prologue is required.  If the
2856      cost model check was not done before during versioning, it has to
2857      be done before the prologue check.
2858
2859        if (cost <= th)
2860          prologue = scalar_iters
2861        if (prologue == 0)
2862          jmp to vector code
2863        else
2864          execute prologue
2865        if (prologue == num_iters)
2866          go to exit
2867
2868      Hence the run-time scalar cost is incremented by a taken branch,
2869      plus a not-taken branch, plus a taken branch cost.
2870
2871      3. The vectorizer then checks if an epilogue is required.  If the
2872      cost model check was not done before during prologue check, it
2873      has to be done with the epilogue check.
2874
2875        if (prologue == 0)
2876          jmp to vector code
2877        else
2878          execute prologue
2879        if (prologue == num_iters)
2880          go to exit
2881        vector code:
2882          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2883            jmp to epilogue
2884
2885      Hence the run-time scalar cost should be incremented by 2 taken
2886      branches.
2887
2888      TODO: The back end may reorder the BBS's differently and reverse
2889      conditions/branch directions.  Change the estimates below to
2890      something more reasonable.  */
2891
2892   /* If the number of iterations is known and we do not do versioning, we can
2893      decide whether to vectorize at compile time.  Hence the scalar version
2894      do not carry cost model guard costs.  */
2895   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2896       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2897       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2898     {
2899       /* Cost model check occurs at versioning.  */
2900       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2901           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2902         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2903       else
2904         {
2905           /* Cost model check occurs at prologue generation.  */
2906           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2907             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2908               + vect_get_stmt_cost (cond_branch_not_taken);
2909           /* Cost model check occurs at epilogue generation.  */
2910           else
2911             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2912         }
2913     }
2914
2915   /* Complete the target-specific cost calculations.  */
2916   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2917                &vec_inside_cost, &vec_epilogue_cost);
2918
2919   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2920
2921   /* Calculate number of iterations required to make the vector version
2922      profitable, relative to the loop bodies only.  The following condition
2923      must hold true:
2924      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2925      where
2926      SIC = scalar iteration cost, VIC = vector iteration cost,
2927      VOC = vector outside cost, VF = vectorization factor,
2928      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2929      SOC = scalar outside cost for run time cost model check.  */
2930
2931   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2932     {
2933       if (vec_outside_cost <= 0)
2934         min_profitable_iters = 1;
2935       else
2936         {
2937           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2938                                   - vec_inside_cost * peel_iters_prologue
2939                                   - vec_inside_cost * peel_iters_epilogue)
2940                                  / ((scalar_single_iter_cost * vf)
2941                                     - vec_inside_cost);
2942
2943           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2944               <= (((int) vec_inside_cost * min_profitable_iters)
2945                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2946             min_profitable_iters++;
2947         }
2948     }
2949   /* vector version will never be profitable.  */
2950   else
2951     {
2952       if (dump_enabled_p ())
2953         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954                          "cost model: the vector iteration cost = %d "
2955                          "divided by the scalar iteration cost = %d "
2956                          "is greater or equal to the vectorization factor = %d"
2957                          ".\n",
2958                          vec_inside_cost, scalar_single_iter_cost, vf);
2959       *ret_min_profitable_niters = -1;
2960       *ret_min_profitable_estimate = -1;
2961       return;
2962     }
2963
2964   if (dump_enabled_p ())
2965     {
2966       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2967       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2968                    vec_inside_cost);
2969       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2970                    vec_prologue_cost);
2971       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2972                    vec_epilogue_cost);
2973       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2974                    scalar_single_iter_cost);
2975       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2976                    scalar_outside_cost);
2977       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2978                    vec_outside_cost);
2979       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2980                    peel_iters_prologue);
2981       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2982                    peel_iters_epilogue);
2983       dump_printf (MSG_NOTE,
2984                    "  Calculated minimum iters for profitability: %d\n",
2985                    min_profitable_iters);
2986       dump_printf (MSG_NOTE, "\n");
2987     }
2988
2989   min_profitable_iters =
2990         min_profitable_iters < vf ? vf : min_profitable_iters;
2991
2992   /* Because the condition we create is:
2993      if (niters <= min_profitable_iters)
2994        then skip the vectorized loop.  */
2995   min_profitable_iters--;
2996
2997   if (dump_enabled_p ())
2998     dump_printf_loc (MSG_NOTE, vect_location,
2999                      "  Runtime profitability threshold = %d\n",
3000                      min_profitable_iters);
3001
3002   *ret_min_profitable_niters = min_profitable_iters;
3003
3004   /* Calculate number of iterations required to make the vector version
3005      profitable, relative to the loop bodies only.
3006
3007      Non-vectorized variant is SIC * niters and it must win over vector
3008      variant on the expected loop trip count.  The following condition must hold true:
3009      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3010
3011   if (vec_outside_cost <= 0)
3012     min_profitable_estimate = 1;
3013   else
3014     {
3015       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3016                                  - vec_inside_cost * peel_iters_prologue
3017                                  - vec_inside_cost * peel_iters_epilogue)
3018                                  / ((scalar_single_iter_cost * vf)
3019                                    - vec_inside_cost);
3020     }
3021   min_profitable_estimate --;
3022   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3023   if (dump_enabled_p ())
3024     dump_printf_loc (MSG_NOTE, vect_location,
3025                      "  Static estimate profitability threshold = %d\n",
3026                       min_profitable_iters);
3027
3028   *ret_min_profitable_estimate = min_profitable_estimate;
3029 }
3030
3031
3032 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3033    functions. Design better to avoid maintenance issues.  */
3034
3035 /* Function vect_model_reduction_cost.
3036
3037    Models cost for a reduction operation, including the vector ops
3038    generated within the strip-mine loop, the initial definition before
3039    the loop, and the epilogue code that must be generated.  */
3040
3041 static bool
3042 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3043                            int ncopies)
3044 {
3045   int prologue_cost = 0, epilogue_cost = 0;
3046   enum tree_code code;
3047   optab optab;
3048   tree vectype;
3049   gimple stmt, orig_stmt;
3050   tree reduction_op;
3051   enum machine_mode mode;
3052   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3053   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3054   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3055
3056   /* Cost of reduction op inside loop.  */
3057   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3058                                         stmt_info, 0, vect_body);
3059   stmt = STMT_VINFO_STMT (stmt_info);
3060
3061   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3062     {
3063     case GIMPLE_SINGLE_RHS:
3064       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3065       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3066       break;
3067     case GIMPLE_UNARY_RHS:
3068       reduction_op = gimple_assign_rhs1 (stmt);
3069       break;
3070     case GIMPLE_BINARY_RHS:
3071       reduction_op = gimple_assign_rhs2 (stmt);
3072       break;
3073     case GIMPLE_TERNARY_RHS:
3074       reduction_op = gimple_assign_rhs3 (stmt);
3075       break;
3076     default:
3077       gcc_unreachable ();
3078     }
3079
3080   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3081   if (!vectype)
3082     {
3083       if (dump_enabled_p ())
3084         {
3085           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3086                            "unsupported data-type ");
3087           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3088                              TREE_TYPE (reduction_op));
3089           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3090         }
3091       return false;
3092    }
3093
3094   mode = TYPE_MODE (vectype);
3095   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3096
3097   if (!orig_stmt)
3098     orig_stmt = STMT_VINFO_STMT (stmt_info);
3099
3100   code = gimple_assign_rhs_code (orig_stmt);
3101
3102   /* Add in cost for initial definition.  */
3103   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3104                                   stmt_info, 0, vect_prologue);
3105
3106   /* Determine cost of epilogue code.
3107
3108      We have a reduction operator that will reduce the vector in one statement.
3109      Also requires scalar extract.  */
3110
3111   if (!nested_in_vect_loop_p (loop, orig_stmt))
3112     {
3113       if (reduc_code != ERROR_MARK)
3114         {
3115           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3116                                           stmt_info, 0, vect_epilogue);
3117           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3118                                           stmt_info, 0, vect_epilogue);
3119         }
3120       else
3121         {
3122           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3123           tree bitsize =
3124             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3125           int element_bitsize = tree_to_uhwi (bitsize);
3126           int nelements = vec_size_in_bits / element_bitsize;
3127
3128           optab = optab_for_tree_code (code, vectype, optab_default);
3129
3130           /* We have a whole vector shift available.  */
3131           if (VECTOR_MODE_P (mode)
3132               && optab_handler (optab, mode) != CODE_FOR_nothing
3133               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3134             {
3135               /* Final reduction via vector shifts and the reduction operator.
3136                  Also requires scalar extract.  */
3137               epilogue_cost += add_stmt_cost (target_cost_data,
3138                                               exact_log2 (nelements) * 2,
3139                                               vector_stmt, stmt_info, 0,
3140                                               vect_epilogue);
3141               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3142                                               vec_to_scalar, stmt_info, 0,
3143                                               vect_epilogue);
3144             }
3145           else
3146             /* Use extracts and reduction op for final reduction.  For N
3147                elements, we have N extracts and N-1 reduction ops.  */
3148             epilogue_cost += add_stmt_cost (target_cost_data,
3149                                             nelements + nelements - 1,
3150                                             vector_stmt, stmt_info, 0,
3151                                             vect_epilogue);
3152         }
3153     }
3154
3155   if (dump_enabled_p ())
3156     dump_printf (MSG_NOTE,
3157                  "vect_model_reduction_cost: inside_cost = %d, "
3158                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3159                  prologue_cost, epilogue_cost);
3160
3161   return true;
3162 }
3163
3164
3165 /* Function vect_model_induction_cost.
3166
3167    Models cost for induction operations.  */
3168
3169 static void
3170 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3171 {
3172   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3173   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3174   unsigned inside_cost, prologue_cost;
3175
3176   /* loop cost for vec_loop.  */
3177   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3178                                stmt_info, 0, vect_body);
3179
3180   /* prologue cost for vec_init and vec_step.  */
3181   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3182                                  stmt_info, 0, vect_prologue);
3183
3184   if (dump_enabled_p ())
3185     dump_printf_loc (MSG_NOTE, vect_location,
3186                      "vect_model_induction_cost: inside_cost = %d, "
3187                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3188 }
3189
3190
3191 /* Function get_initial_def_for_induction
3192
3193    Input:
3194    STMT - a stmt that performs an induction operation in the loop.
3195    IV_PHI - the initial value of the induction variable
3196
3197    Output:
3198    Return a vector variable, initialized with the first VF values of
3199    the induction variable.  E.g., for an iv with IV_PHI='X' and
3200    evolution S, for a vector of 4 units, we want to return:
3201    [X, X + S, X + 2*S, X + 3*S].  */
3202
3203 static tree
3204 get_initial_def_for_induction (gimple iv_phi)
3205 {
3206   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3207   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3208   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3209   tree vectype;
3210   int nunits;
3211   edge pe = loop_preheader_edge (loop);
3212   struct loop *iv_loop;
3213   basic_block new_bb;
3214   tree new_vec, vec_init, vec_step, t;
3215   tree new_var;
3216   tree new_name;
3217   gimple init_stmt, induction_phi, new_stmt;
3218   tree induc_def, vec_def, vec_dest;
3219   tree init_expr, step_expr;
3220   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3221   int i;
3222   int ncopies;
3223   tree expr;
3224   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3225   bool nested_in_vect_loop = false;
3226   gimple_seq stmts = NULL;
3227   imm_use_iterator imm_iter;
3228   use_operand_p use_p;
3229   gimple exit_phi;
3230   edge latch_e;
3231   tree loop_arg;
3232   gimple_stmt_iterator si;
3233   basic_block bb = gimple_bb (iv_phi);
3234   tree stepvectype;
3235   tree resvectype;
3236
3237   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3238   if (nested_in_vect_loop_p (loop, iv_phi))
3239     {
3240       nested_in_vect_loop = true;
3241       iv_loop = loop->inner;
3242     }
3243   else
3244     iv_loop = loop;
3245   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3246
3247   latch_e = loop_latch_edge (iv_loop);
3248   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3249
3250   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3251   gcc_assert (step_expr != NULL_TREE);
3252
3253   pe = loop_preheader_edge (iv_loop);
3254   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3255                                      loop_preheader_edge (iv_loop));
3256
3257   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3258   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3259   gcc_assert (vectype);
3260   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3261   ncopies = vf / nunits;
3262
3263   gcc_assert (phi_info);
3264   gcc_assert (ncopies >= 1);
3265
3266   /* Convert the step to the desired type.  */
3267   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3268                                                   step_expr),
3269                                     &stmts, true, NULL_TREE);
3270   if (stmts)
3271     {
3272       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3273       gcc_assert (!new_bb);
3274     }
3275
3276   /* Find the first insertion point in the BB.  */
3277   si = gsi_after_labels (bb);
3278
3279   /* Create the vector that holds the initial_value of the induction.  */
3280   if (nested_in_vect_loop)
3281     {
3282       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3283          been created during vectorization of previous stmts.  We obtain it
3284          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3285       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3286       /* If the initial value is not of proper type, convert it.  */
3287       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3288         {
3289           new_stmt = gimple_build_assign_with_ops
3290               (VIEW_CONVERT_EXPR,
3291                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3292                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3293           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3294           gimple_assign_set_lhs (new_stmt, vec_init);
3295           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3296                                                  new_stmt);
3297           gcc_assert (!new_bb);
3298           set_vinfo_for_stmt (new_stmt,
3299                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3300         }
3301     }
3302   else
3303     {
3304       vec<constructor_elt, va_gc> *v;
3305
3306       /* iv_loop is the loop to be vectorized. Create:
3307          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3308       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3309                                        vect_scalar_var, "var_");
3310       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3311                                                      init_expr),
3312                                        &stmts, false, new_var);
3313       if (stmts)
3314         {
3315           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3316           gcc_assert (!new_bb);
3317         }
3318
3319       vec_alloc (v, nunits);
3320       bool constant_p = is_gimple_min_invariant (new_name);
3321       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3322       for (i = 1; i < nunits; i++)
3323         {
3324           /* Create: new_name_i = new_name + step_expr  */
3325           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3326                                   new_name, step_expr);
3327           if (!is_gimple_min_invariant (new_name))
3328             {
3329               init_stmt = gimple_build_assign (new_var, new_name);
3330               new_name = make_ssa_name (new_var, init_stmt);
3331               gimple_assign_set_lhs (init_stmt, new_name);
3332               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3333               gcc_assert (!new_bb);
3334               if (dump_enabled_p ())
3335                 {
3336                   dump_printf_loc (MSG_NOTE, vect_location,
3337                                    "created new init_stmt: ");
3338                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3339                   dump_printf (MSG_NOTE, "\n");
3340                 }
3341               constant_p = false;
3342             }
3343           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3344         }
3345       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3346       if (constant_p)
3347         new_vec = build_vector_from_ctor (vectype, v);
3348       else
3349         new_vec = build_constructor (vectype, v);
3350       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3351     }
3352
3353
3354   /* Create the vector that holds the step of the induction.  */
3355   if (nested_in_vect_loop)
3356     /* iv_loop is nested in the loop to be vectorized. Generate:
3357        vec_step = [S, S, S, S]  */
3358     new_name = step_expr;
3359   else
3360     {
3361       /* iv_loop is the loop to be vectorized. Generate:
3362           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3363       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3364         {
3365           expr = build_int_cst (integer_type_node, vf);
3366           expr = fold_convert (TREE_TYPE (step_expr), expr);
3367         }
3368       else
3369         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3370       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3371                               expr, step_expr);
3372       if (TREE_CODE (step_expr) == SSA_NAME)
3373         new_name = vect_init_vector (iv_phi, new_name,
3374                                      TREE_TYPE (step_expr), NULL);
3375     }
3376
3377   t = unshare_expr (new_name);
3378   gcc_assert (CONSTANT_CLASS_P (new_name)
3379               || TREE_CODE (new_name) == SSA_NAME);
3380   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3381   gcc_assert (stepvectype);
3382   new_vec = build_vector_from_val (stepvectype, t);
3383   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3384
3385
3386   /* Create the following def-use cycle:
3387      loop prolog:
3388          vec_init = ...
3389          vec_step = ...
3390      loop:
3391          vec_iv = PHI <vec_init, vec_loop>
3392          ...
3393          STMT
3394          ...
3395          vec_loop = vec_iv + vec_step;  */
3396
3397   /* Create the induction-phi that defines the induction-operand.  */
3398   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3399   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3400   set_vinfo_for_stmt (induction_phi,
3401                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3402   induc_def = PHI_RESULT (induction_phi);
3403
3404   /* Create the iv update inside the loop  */
3405   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3406                                            induc_def, vec_step);
3407   vec_def = make_ssa_name (vec_dest, new_stmt);
3408   gimple_assign_set_lhs (new_stmt, vec_def);
3409   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3410   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3411                                                    NULL));
3412
3413   /* Set the arguments of the phi node:  */
3414   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3415   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3416                UNKNOWN_LOCATION);
3417
3418
3419   /* In case that vectorization factor (VF) is bigger than the number
3420      of elements that we can fit in a vectype (nunits), we have to generate
3421      more than one vector stmt - i.e - we need to "unroll" the
3422      vector stmt by a factor VF/nunits.  For more details see documentation
3423      in vectorizable_operation.  */
3424
3425   if (ncopies > 1)
3426     {
3427       stmt_vec_info prev_stmt_vinfo;
3428       /* FORNOW. This restriction should be relaxed.  */
3429       gcc_assert (!nested_in_vect_loop);
3430
3431       /* Create the vector that holds the step of the induction.  */
3432       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3433         {
3434           expr = build_int_cst (integer_type_node, nunits);
3435           expr = fold_convert (TREE_TYPE (step_expr), expr);
3436         }
3437       else
3438         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3439       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3440                               expr, step_expr);
3441       if (TREE_CODE (step_expr) == SSA_NAME)
3442         new_name = vect_init_vector (iv_phi, new_name,
3443                                      TREE_TYPE (step_expr), NULL);
3444       t = unshare_expr (new_name);
3445       gcc_assert (CONSTANT_CLASS_P (new_name)
3446                   || TREE_CODE (new_name) == SSA_NAME);
3447       new_vec = build_vector_from_val (stepvectype, t);
3448       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3449
3450       vec_def = induc_def;
3451       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3452       for (i = 1; i < ncopies; i++)
3453         {
3454           /* vec_i = vec_prev + vec_step  */
3455           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3456                                                    vec_def, vec_step);
3457           vec_def = make_ssa_name (vec_dest, new_stmt);
3458           gimple_assign_set_lhs (new_stmt, vec_def);
3459
3460           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3461           if (!useless_type_conversion_p (resvectype, vectype))
3462             {
3463               new_stmt = gimple_build_assign_with_ops
3464                   (VIEW_CONVERT_EXPR,
3465                    vect_get_new_vect_var (resvectype, vect_simple_var,
3466                                           "vec_iv_"),
3467                    build1 (VIEW_CONVERT_EXPR, resvectype,
3468                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3469               gimple_assign_set_lhs (new_stmt,
3470                                      make_ssa_name
3471                                        (gimple_assign_lhs (new_stmt), new_stmt));
3472               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3473             }
3474           set_vinfo_for_stmt (new_stmt,
3475                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3476           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3477           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3478         }
3479     }
3480
3481   if (nested_in_vect_loop)
3482     {
3483       /* Find the loop-closed exit-phi of the induction, and record
3484          the final vector of induction results:  */
3485       exit_phi = NULL;
3486       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3487         {
3488           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3489             {
3490               exit_phi = USE_STMT (use_p);
3491               break;
3492             }
3493         }
3494       if (exit_phi)
3495         {
3496           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3497           /* FORNOW. Currently not supporting the case that an inner-loop induction
3498              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3499           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3500                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3501
3502           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3503           if (dump_enabled_p ())
3504             {
3505               dump_printf_loc (MSG_NOTE, vect_location,
3506                                "vector of inductions after inner-loop:");
3507               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3508               dump_printf (MSG_NOTE, "\n");
3509             }
3510         }
3511     }
3512
3513
3514   if (dump_enabled_p ())
3515     {
3516       dump_printf_loc (MSG_NOTE, vect_location,
3517                        "transform induction: created def-use cycle: ");
3518       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3519       dump_printf (MSG_NOTE, "\n");
3520       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3521                         SSA_NAME_DEF_STMT (vec_def), 0);
3522       dump_printf (MSG_NOTE, "\n");
3523     }
3524
3525   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3526   if (!useless_type_conversion_p (resvectype, vectype))
3527     {
3528       new_stmt = gimple_build_assign_with_ops
3529          (VIEW_CONVERT_EXPR,
3530           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3531           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3532       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3533       gimple_assign_set_lhs (new_stmt, induc_def);
3534       si = gsi_after_labels (bb);
3535       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3536       set_vinfo_for_stmt (new_stmt,
3537                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3538       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3539         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3540     }
3541
3542   return induc_def;
3543 }
3544
3545
3546 /* Function get_initial_def_for_reduction
3547
3548    Input:
3549    STMT - a stmt that performs a reduction operation in the loop.
3550    INIT_VAL - the initial value of the reduction variable
3551
3552    Output:
3553    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3554         of the reduction (used for adjusting the epilog - see below).
3555    Return a vector variable, initialized according to the operation that STMT
3556         performs. This vector will be used as the initial value of the
3557         vector of partial results.
3558
3559    Option1 (adjust in epilog): Initialize the vector as follows:
3560      add/bit or/xor:    [0,0,...,0,0]
3561      mult/bit and:      [1,1,...,1,1]
3562      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3563    and when necessary (e.g. add/mult case) let the caller know
3564    that it needs to adjust the result by init_val.
3565
3566    Option2: Initialize the vector as follows:
3567      add/bit or/xor:    [init_val,0,0,...,0]
3568      mult/bit and:      [init_val,1,1,...,1]
3569      min/max/cond_expr: [init_val,init_val,...,init_val]
3570    and no adjustments are needed.
3571
3572    For example, for the following code:
3573
3574    s = init_val;
3575    for (i=0;i<n;i++)
3576      s = s + a[i];
3577
3578    STMT is 's = s + a[i]', and the reduction variable is 's'.
3579    For a vector of 4 units, we want to return either [0,0,0,init_val],
3580    or [0,0,0,0] and let the caller know that it needs to adjust
3581    the result at the end by 'init_val'.
3582
3583    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3584    initialization vector is simpler (same element in all entries), if
3585    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3586
3587    A cost model should help decide between these two schemes.  */
3588
3589 tree
3590 get_initial_def_for_reduction (gimple stmt, tree init_val,
3591                                tree *adjustment_def)
3592 {
3593   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3594   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3595   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3596   tree scalar_type = TREE_TYPE (init_val);
3597   tree vectype = get_vectype_for_scalar_type (scalar_type);
3598   int nunits;
3599   enum tree_code code = gimple_assign_rhs_code (stmt);
3600   tree def_for_init;
3601   tree init_def;
3602   tree *elts;
3603   int i;
3604   bool nested_in_vect_loop = false;
3605   tree init_value;
3606   REAL_VALUE_TYPE real_init_val = dconst0;
3607   int int_init_val = 0;
3608   gimple def_stmt = NULL;
3609
3610   gcc_assert (vectype);
3611   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3612
3613   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3614               || SCALAR_FLOAT_TYPE_P (scalar_type));
3615
3616   if (nested_in_vect_loop_p (loop, stmt))
3617     nested_in_vect_loop = true;
3618   else
3619     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3620
3621   /* In case of double reduction we only create a vector variable to be put
3622      in the reduction phi node.  The actual statement creation is done in
3623      vect_create_epilog_for_reduction.  */
3624   if (adjustment_def && nested_in_vect_loop
3625       && TREE_CODE (init_val) == SSA_NAME
3626       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3627       && gimple_code (def_stmt) == GIMPLE_PHI
3628       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3629       && vinfo_for_stmt (def_stmt)
3630       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3631           == vect_double_reduction_def)
3632     {
3633       *adjustment_def = NULL;
3634       return vect_create_destination_var (init_val, vectype);
3635     }
3636
3637   if (TREE_CONSTANT (init_val))
3638     {
3639       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3640         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3641       else
3642         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3643     }
3644   else
3645     init_value = init_val;
3646
3647   switch (code)
3648     {
3649       case WIDEN_SUM_EXPR:
3650       case DOT_PROD_EXPR:
3651       case PLUS_EXPR:
3652       case MINUS_EXPR:
3653       case BIT_IOR_EXPR:
3654       case BIT_XOR_EXPR:
3655       case MULT_EXPR:
3656       case BIT_AND_EXPR:
3657         /* ADJUSMENT_DEF is NULL when called from
3658            vect_create_epilog_for_reduction to vectorize double reduction.  */
3659         if (adjustment_def)
3660           {
3661             if (nested_in_vect_loop)
3662               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3663                                                               NULL);
3664             else
3665               *adjustment_def = init_val;
3666           }
3667
3668         if (code == MULT_EXPR)
3669           {
3670             real_init_val = dconst1;
3671             int_init_val = 1;
3672           }
3673
3674         if (code == BIT_AND_EXPR)
3675           int_init_val = -1;
3676
3677         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3678           def_for_init = build_real (scalar_type, real_init_val);
3679         else
3680           def_for_init = build_int_cst (scalar_type, int_init_val);
3681
3682         /* Create a vector of '0' or '1' except the first element.  */
3683         elts = XALLOCAVEC (tree, nunits);
3684         for (i = nunits - 2; i >= 0; --i)
3685           elts[i + 1] = def_for_init;
3686
3687         /* Option1: the first element is '0' or '1' as well.  */
3688         if (adjustment_def)
3689           {
3690             elts[0] = def_for_init;
3691             init_def = build_vector (vectype, elts);
3692             break;
3693           }
3694
3695         /* Option2: the first element is INIT_VAL.  */
3696         elts[0] = init_val;
3697         if (TREE_CONSTANT (init_val))
3698           init_def = build_vector (vectype, elts);
3699         else
3700           {
3701             vec<constructor_elt, va_gc> *v;
3702             vec_alloc (v, nunits);
3703             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3704             for (i = 1; i < nunits; ++i)
3705               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3706             init_def = build_constructor (vectype, v);
3707           }
3708
3709         break;
3710
3711       case MIN_EXPR:
3712       case MAX_EXPR:
3713       case COND_EXPR:
3714         if (adjustment_def)
3715           {
3716             *adjustment_def = NULL_TREE;
3717             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3718             break;
3719           }
3720
3721         init_def = build_vector_from_val (vectype, init_value);
3722         break;
3723
3724       default:
3725         gcc_unreachable ();
3726     }
3727
3728   return init_def;
3729 }
3730
3731
3732 /* Function vect_create_epilog_for_reduction
3733
3734    Create code at the loop-epilog to finalize the result of a reduction
3735    computation.
3736
3737    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3738      reduction statements.
3739    STMT is the scalar reduction stmt that is being vectorized.
3740    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3741      number of elements that we can fit in a vectype (nunits).  In this case
3742      we have to generate more than one vector stmt - i.e - we need to "unroll"
3743      the vector stmt by a factor VF/nunits.  For more details see documentation
3744      in vectorizable_operation.
3745    REDUC_CODE is the tree-code for the epilog reduction.
3746    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3747      computation.
3748    REDUC_INDEX is the index of the operand in the right hand side of the
3749      statement that is defined by REDUCTION_PHI.
3750    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3751    SLP_NODE is an SLP node containing a group of reduction statements. The
3752      first one in this group is STMT.
3753
3754    This function:
3755    1. Creates the reduction def-use cycles: sets the arguments for
3756       REDUCTION_PHIS:
3757       The loop-entry argument is the vectorized initial-value of the reduction.
3758       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3759       sums.
3760    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3761       by applying the operation specified by REDUC_CODE if available, or by
3762       other means (whole-vector shifts or a scalar loop).
3763       The function also creates a new phi node at the loop exit to preserve
3764       loop-closed form, as illustrated below.
3765
3766      The flow at the entry to this function:
3767
3768         loop:
3769           vec_def = phi <null, null>            # REDUCTION_PHI
3770           VECT_DEF = vector_stmt                # vectorized form of STMT
3771           s_loop = scalar_stmt                  # (scalar) STMT
3772         loop_exit:
3773           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3774           use <s_out0>
3775           use <s_out0>
3776
3777      The above is transformed by this function into:
3778
3779         loop:
3780           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3781           VECT_DEF = vector_stmt                # vectorized form of STMT
3782           s_loop = scalar_stmt                  # (scalar) STMT
3783         loop_exit:
3784           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3785           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3786           v_out2 = reduce <v_out1>
3787           s_out3 = extract_field <v_out2, 0>
3788           s_out4 = adjust_result <s_out3>
3789           use <s_out4>
3790           use <s_out4>
3791 */
3792
3793 static void
3794 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3795                                   int ncopies, enum tree_code reduc_code,
3796                                   vec<gimple> reduction_phis,
3797                                   int reduc_index, bool double_reduc,
3798                                   slp_tree slp_node)
3799 {
3800   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3801   stmt_vec_info prev_phi_info;
3802   tree vectype;
3803   enum machine_mode mode;
3804   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3805   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3806   basic_block exit_bb;
3807   tree scalar_dest;
3808   tree scalar_type;
3809   gimple new_phi = NULL, phi;
3810   gimple_stmt_iterator exit_gsi;
3811   tree vec_dest;
3812   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3813   gimple epilog_stmt = NULL;
3814   enum tree_code code = gimple_assign_rhs_code (stmt);
3815   gimple exit_phi;
3816   tree bitsize, bitpos;
3817   tree adjustment_def = NULL;
3818   tree vec_initial_def = NULL;
3819   tree reduction_op, expr, def;
3820   tree orig_name, scalar_result;
3821   imm_use_iterator imm_iter, phi_imm_iter;
3822   use_operand_p use_p, phi_use_p;
3823   bool extract_scalar_result = false;
3824   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3825   bool nested_in_vect_loop = false;
3826   auto_vec<gimple> new_phis;
3827   auto_vec<gimple> inner_phis;
3828   enum vect_def_type dt = vect_unknown_def_type;
3829   int j, i;
3830   auto_vec<tree> scalar_results;
3831   unsigned int group_size = 1, k, ratio;
3832   auto_vec<tree> vec_initial_defs;
3833   auto_vec<gimple> phis;
3834   bool slp_reduc = false;
3835   tree new_phi_result;
3836   gimple inner_phi = NULL;
3837
3838   if (slp_node)
3839     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3840
3841   if (nested_in_vect_loop_p (loop, stmt))
3842     {
3843       outer_loop = loop;
3844       loop = loop->inner;
3845       nested_in_vect_loop = true;
3846       gcc_assert (!slp_node);
3847     }
3848
3849   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3850     {
3851     case GIMPLE_SINGLE_RHS:
3852       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3853                   == ternary_op);
3854       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3855       break;
3856     case GIMPLE_UNARY_RHS:
3857       reduction_op = gimple_assign_rhs1 (stmt);
3858       break;
3859     case GIMPLE_BINARY_RHS:
3860       reduction_op = reduc_index ?
3861                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3862       break;
3863     case GIMPLE_TERNARY_RHS:
3864       reduction_op = gimple_op (stmt, reduc_index + 1);
3865       break;
3866     default:
3867       gcc_unreachable ();
3868     }
3869
3870   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3871   gcc_assert (vectype);
3872   mode = TYPE_MODE (vectype);
3873
3874   /* 1. Create the reduction def-use cycle:
3875      Set the arguments of REDUCTION_PHIS, i.e., transform
3876
3877         loop:
3878           vec_def = phi <null, null>            # REDUCTION_PHI
3879           VECT_DEF = vector_stmt                # vectorized form of STMT
3880           ...
3881
3882      into:
3883
3884         loop:
3885           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3886           VECT_DEF = vector_stmt                # vectorized form of STMT
3887           ...
3888
3889      (in case of SLP, do it for all the phis). */
3890
3891   /* Get the loop-entry arguments.  */
3892   if (slp_node)
3893     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3894                        NULL, slp_node, reduc_index);
3895   else
3896     {
3897       vec_initial_defs.create (1);
3898      /* For the case of reduction, vect_get_vec_def_for_operand returns
3899         the scalar def before the loop, that defines the initial value
3900         of the reduction variable.  */
3901       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3902                                                       &adjustment_def);
3903       vec_initial_defs.quick_push (vec_initial_def);
3904     }
3905
3906   /* Set phi nodes arguments.  */
3907   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3908     {
3909       tree vec_init_def = vec_initial_defs[i];
3910       tree def = vect_defs[i];
3911       for (j = 0; j < ncopies; j++)
3912         {
3913           /* Set the loop-entry arg of the reduction-phi.  */
3914           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3915                        UNKNOWN_LOCATION);
3916
3917           /* Set the loop-latch arg for the reduction-phi.  */
3918           if (j > 0)
3919             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3920
3921           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3922
3923           if (dump_enabled_p ())
3924             {
3925               dump_printf_loc (MSG_NOTE, vect_location,
3926                                "transform reduction: created def-use cycle: ");
3927               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3928               dump_printf (MSG_NOTE, "\n");
3929               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3930               dump_printf (MSG_NOTE, "\n");
3931             }
3932
3933           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3934         }
3935     }
3936
3937   /* 2. Create epilog code.
3938         The reduction epilog code operates across the elements of the vector
3939         of partial results computed by the vectorized loop.
3940         The reduction epilog code consists of:
3941
3942         step 1: compute the scalar result in a vector (v_out2)
3943         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3944         step 3: adjust the scalar result (s_out3) if needed.
3945
3946         Step 1 can be accomplished using one of the following three schemes:
3947           (scheme 1) using reduc_code, if available.
3948           (scheme 2) using whole-vector shifts, if available.
3949           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3950                      combined.
3951
3952           The overall epilog code looks like this:
3953
3954           s_out0 = phi <s_loop>         # original EXIT_PHI
3955           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3956           v_out2 = reduce <v_out1>              # step 1
3957           s_out3 = extract_field <v_out2, 0>    # step 2
3958           s_out4 = adjust_result <s_out3>       # step 3
3959
3960           (step 3 is optional, and steps 1 and 2 may be combined).
3961           Lastly, the uses of s_out0 are replaced by s_out4.  */
3962
3963
3964   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3965          v_out1 = phi <VECT_DEF>
3966          Store them in NEW_PHIS.  */
3967
3968   exit_bb = single_exit (loop)->dest;
3969   prev_phi_info = NULL;
3970   new_phis.create (vect_defs.length ());
3971   FOR_EACH_VEC_ELT (vect_defs, i, def)
3972     {
3973       for (j = 0; j < ncopies; j++)
3974         {
3975           tree new_def = copy_ssa_name (def, NULL);
3976           phi = create_phi_node (new_def, exit_bb);
3977           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3978           if (j == 0)
3979             new_phis.quick_push (phi);
3980           else
3981             {
3982               def = vect_get_vec_def_for_stmt_copy (dt, def);
3983               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3984             }
3985
3986           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3987           prev_phi_info = vinfo_for_stmt (phi);
3988         }
3989     }
3990
3991   /* The epilogue is created for the outer-loop, i.e., for the loop being
3992      vectorized.  Create exit phis for the outer loop.  */
3993   if (double_reduc)
3994     {
3995       loop = outer_loop;
3996       exit_bb = single_exit (loop)->dest;
3997       inner_phis.create (vect_defs.length ());
3998       FOR_EACH_VEC_ELT (new_phis, i, phi)
3999         {
4000           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4001           gimple outer_phi = create_phi_node (new_result, exit_bb);
4002           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4003                            PHI_RESULT (phi));
4004           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4005                                                             loop_vinfo, NULL));
4006           inner_phis.quick_push (phi);
4007           new_phis[i] = outer_phi;
4008           prev_phi_info = vinfo_for_stmt (outer_phi);
4009           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4010             {
4011               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4012               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4013               outer_phi = create_phi_node (new_result, exit_bb);
4014               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4015                                PHI_RESULT (phi));
4016               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4017                                                         loop_vinfo, NULL));
4018               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4019               prev_phi_info = vinfo_for_stmt (outer_phi);
4020             }
4021         }
4022     }
4023
4024   exit_gsi = gsi_after_labels (exit_bb);
4025
4026   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4027          (i.e. when reduc_code is not available) and in the final adjustment
4028          code (if needed).  Also get the original scalar reduction variable as
4029          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4030          represents a reduction pattern), the tree-code and scalar-def are
4031          taken from the original stmt that the pattern-stmt (STMT) replaces.
4032          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4033          are taken from STMT.  */
4034
4035   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4036   if (!orig_stmt)
4037     {
4038       /* Regular reduction  */
4039       orig_stmt = stmt;
4040     }
4041   else
4042     {
4043       /* Reduction pattern  */
4044       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4045       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4046       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4047     }
4048
4049   code = gimple_assign_rhs_code (orig_stmt);
4050   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4051      partial results are added and not subtracted.  */
4052   if (code == MINUS_EXPR)
4053     code = PLUS_EXPR;
4054
4055   scalar_dest = gimple_assign_lhs (orig_stmt);
4056   scalar_type = TREE_TYPE (scalar_dest);
4057   scalar_results.create (group_size);
4058   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4059   bitsize = TYPE_SIZE (scalar_type);
4060
4061   /* In case this is a reduction in an inner-loop while vectorizing an outer
4062      loop - we don't need to extract a single scalar result at the end of the
4063      inner-loop (unless it is double reduction, i.e., the use of reduction is
4064      outside the outer-loop).  The final vector of partial results will be used
4065      in the vectorized outer-loop, or reduced to a scalar result at the end of
4066      the outer-loop.  */
4067   if (nested_in_vect_loop && !double_reduc)
4068     goto vect_finalize_reduction;
4069
4070   /* SLP reduction without reduction chain, e.g.,
4071      # a1 = phi <a2, a0>
4072      # b1 = phi <b2, b0>
4073      a2 = operation (a1)
4074      b2 = operation (b1)  */
4075   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4076
4077   /* In case of reduction chain, e.g.,
4078      # a1 = phi <a3, a0>
4079      a2 = operation (a1)
4080      a3 = operation (a2),
4081
4082      we may end up with more than one vector result.  Here we reduce them to
4083      one vector.  */
4084   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4085     {
4086       tree first_vect = PHI_RESULT (new_phis[0]);
4087       tree tmp;
4088       gimple new_vec_stmt = NULL;
4089
4090       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4091       for (k = 1; k < new_phis.length (); k++)
4092         {
4093           gimple next_phi = new_phis[k];
4094           tree second_vect = PHI_RESULT (next_phi);
4095
4096           tmp = build2 (code, vectype,  first_vect, second_vect);
4097           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4098           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4099           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4100           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4101         }
4102
4103       new_phi_result = first_vect;
4104       if (new_vec_stmt)
4105         {
4106           new_phis.truncate (0);
4107           new_phis.safe_push (new_vec_stmt);
4108         }
4109     }
4110   else
4111     new_phi_result = PHI_RESULT (new_phis[0]);
4112
4113   /* 2.3 Create the reduction code, using one of the three schemes described
4114          above. In SLP we simply need to extract all the elements from the
4115          vector (without reducing them), so we use scalar shifts.  */
4116   if (reduc_code != ERROR_MARK && !slp_reduc)
4117     {
4118       tree tmp;
4119
4120       /*** Case 1:  Create:
4121            v_out2 = reduc_expr <v_out1>  */
4122
4123       if (dump_enabled_p ())
4124         dump_printf_loc (MSG_NOTE, vect_location,
4125                          "Reduce using direct vector reduction.\n");
4126
4127       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4128       tmp = build1 (reduc_code, vectype, new_phi_result);
4129       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4130       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4131       gimple_assign_set_lhs (epilog_stmt, new_temp);
4132       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4133
4134       extract_scalar_result = true;
4135     }
4136   else
4137     {
4138       enum tree_code shift_code = ERROR_MARK;
4139       bool have_whole_vector_shift = true;
4140       int bit_offset;
4141       int element_bitsize = tree_to_uhwi (bitsize);
4142       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4143       tree vec_temp;
4144
4145       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4146         shift_code = VEC_RSHIFT_EXPR;
4147       else
4148         have_whole_vector_shift = false;
4149
4150       /* Regardless of whether we have a whole vector shift, if we're
4151          emulating the operation via tree-vect-generic, we don't want
4152          to use it.  Only the first round of the reduction is likely
4153          to still be profitable via emulation.  */
4154       /* ??? It might be better to emit a reduction tree code here, so that
4155          tree-vect-generic can expand the first round via bit tricks.  */
4156       if (!VECTOR_MODE_P (mode))
4157         have_whole_vector_shift = false;
4158       else
4159         {
4160           optab optab = optab_for_tree_code (code, vectype, optab_default);
4161           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4162             have_whole_vector_shift = false;
4163         }
4164
4165       if (have_whole_vector_shift && !slp_reduc)
4166         {
4167           /*** Case 2: Create:
4168              for (offset = VS/2; offset >= element_size; offset/=2)
4169                 {
4170                   Create:  va' = vec_shift <va, offset>
4171                   Create:  va = vop <va, va'>
4172                 }  */
4173
4174           if (dump_enabled_p ())
4175             dump_printf_loc (MSG_NOTE, vect_location,
4176                              "Reduce using vector shifts\n");
4177
4178           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4179           new_temp = new_phi_result;
4180           for (bit_offset = vec_size_in_bits/2;
4181                bit_offset >= element_bitsize;
4182                bit_offset /= 2)
4183             {
4184               tree bitpos = size_int (bit_offset);
4185
4186               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4187                                                vec_dest, new_temp, bitpos);
4188               new_name = make_ssa_name (vec_dest, epilog_stmt);
4189               gimple_assign_set_lhs (epilog_stmt, new_name);
4190               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4191
4192               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4193                                                           new_name, new_temp);
4194               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4195               gimple_assign_set_lhs (epilog_stmt, new_temp);
4196               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4197             }
4198
4199           extract_scalar_result = true;
4200         }
4201       else
4202         {
4203           tree rhs;
4204
4205           /*** Case 3: Create:
4206              s = extract_field <v_out2, 0>
4207              for (offset = element_size;
4208                   offset < vector_size;
4209                   offset += element_size;)
4210                {
4211                  Create:  s' = extract_field <v_out2, offset>
4212                  Create:  s = op <s, s'>  // For non SLP cases
4213                }  */
4214
4215           if (dump_enabled_p ())
4216             dump_printf_loc (MSG_NOTE, vect_location,
4217                              "Reduce using scalar code.\n");
4218
4219           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4220           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4221             {
4222               if (gimple_code (new_phi) == GIMPLE_PHI)
4223                 vec_temp = PHI_RESULT (new_phi);
4224               else
4225                 vec_temp = gimple_assign_lhs (new_phi);
4226               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4227                             bitsize_zero_node);
4228               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4229               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4230               gimple_assign_set_lhs (epilog_stmt, new_temp);
4231               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4232
4233               /* In SLP we don't need to apply reduction operation, so we just
4234                  collect s' values in SCALAR_RESULTS.  */
4235               if (slp_reduc)
4236                 scalar_results.safe_push (new_temp);
4237
4238               for (bit_offset = element_bitsize;
4239                    bit_offset < vec_size_in_bits;
4240                    bit_offset += element_bitsize)
4241                 {
4242                   tree bitpos = bitsize_int (bit_offset);
4243                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4244                                      bitsize, bitpos);
4245
4246                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4247                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4248                   gimple_assign_set_lhs (epilog_stmt, new_name);
4249                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4250
4251                   if (slp_reduc)
4252                     {
4253                       /* In SLP we don't need to apply reduction operation, so
4254                          we just collect s' values in SCALAR_RESULTS.  */
4255                       new_temp = new_name;
4256                       scalar_results.safe_push (new_name);
4257                     }
4258                   else
4259                     {
4260                       epilog_stmt = gimple_build_assign_with_ops (code,
4261                                           new_scalar_dest, new_name, new_temp);
4262                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4263                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4264                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4265                     }
4266                 }
4267             }
4268
4269           /* The only case where we need to reduce scalar results in SLP, is
4270              unrolling.  If the size of SCALAR_RESULTS is greater than
4271              GROUP_SIZE, we reduce them combining elements modulo
4272              GROUP_SIZE.  */
4273           if (slp_reduc)
4274             {
4275               tree res, first_res, new_res;
4276               gimple new_stmt;
4277
4278               /* Reduce multiple scalar results in case of SLP unrolling.  */
4279               for (j = group_size; scalar_results.iterate (j, &res);
4280                    j++)
4281                 {
4282                   first_res = scalar_results[j % group_size];
4283                   new_stmt = gimple_build_assign_with_ops (code,
4284                                               new_scalar_dest, first_res, res);
4285                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4286                   gimple_assign_set_lhs (new_stmt, new_res);
4287                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4288                   scalar_results[j % group_size] = new_res;
4289                 }
4290             }
4291           else
4292             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4293             scalar_results.safe_push (new_temp);
4294
4295           extract_scalar_result = false;
4296         }
4297     }
4298
4299   /* 2.4  Extract the final scalar result.  Create:
4300           s_out3 = extract_field <v_out2, bitpos>  */
4301
4302   if (extract_scalar_result)
4303     {
4304       tree rhs;
4305
4306       if (dump_enabled_p ())
4307         dump_printf_loc (MSG_NOTE, vect_location,
4308                          "extract scalar result\n");
4309
4310       if (BYTES_BIG_ENDIAN)
4311         bitpos = size_binop (MULT_EXPR,
4312                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4313                              TYPE_SIZE (scalar_type));
4314       else
4315         bitpos = bitsize_zero_node;
4316
4317       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4318       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4319       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4320       gimple_assign_set_lhs (epilog_stmt, new_temp);
4321       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4322       scalar_results.safe_push (new_temp);
4323     }
4324
4325 vect_finalize_reduction:
4326
4327   if (double_reduc)
4328     loop = loop->inner;
4329
4330   /* 2.5 Adjust the final result by the initial value of the reduction
4331          variable. (When such adjustment is not needed, then
4332          'adjustment_def' is zero).  For example, if code is PLUS we create:
4333          new_temp = loop_exit_def + adjustment_def  */
4334
4335   if (adjustment_def)
4336     {
4337       gcc_assert (!slp_reduc);
4338       if (nested_in_vect_loop)
4339         {
4340           new_phi = new_phis[0];
4341           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4342           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4343           new_dest = vect_create_destination_var (scalar_dest, vectype);
4344         }
4345       else
4346         {
4347           new_temp = scalar_results[0];
4348           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4349           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4350           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4351         }
4352
4353       epilog_stmt = gimple_build_assign (new_dest, expr);
4354       new_temp = make_ssa_name (new_dest, epilog_stmt);
4355       gimple_assign_set_lhs (epilog_stmt, new_temp);
4356       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4357       if (nested_in_vect_loop)
4358         {
4359           set_vinfo_for_stmt (epilog_stmt,
4360                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4361                                                  NULL));
4362           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4363                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4364
4365           if (!double_reduc)
4366             scalar_results.quick_push (new_temp);
4367           else
4368             scalar_results[0] = new_temp;
4369         }
4370       else
4371         scalar_results[0] = new_temp;
4372
4373       new_phis[0] = epilog_stmt;
4374     }
4375
4376   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4377           phis with new adjusted scalar results, i.e., replace use <s_out0>
4378           with use <s_out4>.
4379
4380      Transform:
4381         loop_exit:
4382           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4383           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4384           v_out2 = reduce <v_out1>
4385           s_out3 = extract_field <v_out2, 0>
4386           s_out4 = adjust_result <s_out3>
4387           use <s_out0>
4388           use <s_out0>
4389
4390      into:
4391
4392         loop_exit:
4393           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4394           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4395           v_out2 = reduce <v_out1>
4396           s_out3 = extract_field <v_out2, 0>
4397           s_out4 = adjust_result <s_out3>
4398           use <s_out4>
4399           use <s_out4> */
4400
4401
4402   /* In SLP reduction chain we reduce vector results into one vector if
4403      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4404      the last stmt in the reduction chain, since we are looking for the loop
4405      exit phi node.  */
4406   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4407     {
4408       scalar_dest = gimple_assign_lhs (
4409                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4410       group_size = 1;
4411     }
4412
4413   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4414      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4415      need to match SCALAR_RESULTS with corresponding statements.  The first
4416      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4417      the first vector stmt, etc.
4418      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4419   if (group_size > new_phis.length ())
4420     {
4421       ratio = group_size / new_phis.length ();
4422       gcc_assert (!(group_size % new_phis.length ()));
4423     }
4424   else
4425     ratio = 1;
4426
4427   for (k = 0; k < group_size; k++)
4428     {
4429       if (k % ratio == 0)
4430         {
4431           epilog_stmt = new_phis[k / ratio];
4432           reduction_phi = reduction_phis[k / ratio];
4433           if (double_reduc)
4434             inner_phi = inner_phis[k / ratio];
4435         }
4436
4437       if (slp_reduc)
4438         {
4439           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4440
4441           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4442           /* SLP statements can't participate in patterns.  */
4443           gcc_assert (!orig_stmt);
4444           scalar_dest = gimple_assign_lhs (current_stmt);
4445         }
4446
4447       phis.create (3);
4448       /* Find the loop-closed-use at the loop exit of the original scalar
4449          result.  (The reduction result is expected to have two immediate uses -
4450          one at the latch block, and one at the loop exit).  */
4451       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4452         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4453             && !is_gimple_debug (USE_STMT (use_p)))
4454           phis.safe_push (USE_STMT (use_p));
4455
4456       /* While we expect to have found an exit_phi because of loop-closed-ssa
4457          form we can end up without one if the scalar cycle is dead.  */
4458
4459       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4460         {
4461           if (outer_loop)
4462             {
4463               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4464               gimple vect_phi;
4465
4466               /* FORNOW. Currently not supporting the case that an inner-loop
4467                  reduction is not used in the outer-loop (but only outside the
4468                  outer-loop), unless it is double reduction.  */
4469               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4470                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4471                           || double_reduc);
4472
4473               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4474               if (!double_reduc
4475                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4476                       != vect_double_reduction_def)
4477                 continue;
4478
4479               /* Handle double reduction:
4480
4481                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4482                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4483                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4484                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4485
4486                  At that point the regular reduction (stmt2 and stmt3) is
4487                  already vectorized, as well as the exit phi node, stmt4.
4488                  Here we vectorize the phi node of double reduction, stmt1, and
4489                  update all relevant statements.  */
4490
4491               /* Go through all the uses of s2 to find double reduction phi
4492                  node, i.e., stmt1 above.  */
4493               orig_name = PHI_RESULT (exit_phi);
4494               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4495                 {
4496                   stmt_vec_info use_stmt_vinfo;
4497                   stmt_vec_info new_phi_vinfo;
4498                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4499                   basic_block bb = gimple_bb (use_stmt);
4500                   gimple use;
4501
4502                   /* Check that USE_STMT is really double reduction phi
4503                      node.  */
4504                   if (gimple_code (use_stmt) != GIMPLE_PHI
4505                       || gimple_phi_num_args (use_stmt) != 2
4506                       || bb->loop_father != outer_loop)
4507                     continue;
4508                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4509                   if (!use_stmt_vinfo
4510                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4511                           != vect_double_reduction_def)
4512                     continue;
4513
4514                   /* Create vector phi node for double reduction:
4515                      vs1 = phi <vs0, vs2>
4516                      vs1 was created previously in this function by a call to
4517                        vect_get_vec_def_for_operand and is stored in
4518                        vec_initial_def;
4519                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4520                      vs0 is created here.  */
4521
4522                   /* Create vector phi node.  */
4523                   vect_phi = create_phi_node (vec_initial_def, bb);
4524                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4525                                     loop_vec_info_for_loop (outer_loop), NULL);
4526                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4527
4528                   /* Create vs0 - initial def of the double reduction phi.  */
4529                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4530                                              loop_preheader_edge (outer_loop));
4531                   init_def = get_initial_def_for_reduction (stmt,
4532                                                           preheader_arg, NULL);
4533                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4534                                                     vectype, NULL);
4535
4536                   /* Update phi node arguments with vs0 and vs2.  */
4537                   add_phi_arg (vect_phi, vect_phi_init,
4538                                loop_preheader_edge (outer_loop),
4539                                UNKNOWN_LOCATION);
4540                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4541                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4542                   if (dump_enabled_p ())
4543                     {
4544                       dump_printf_loc (MSG_NOTE, vect_location,
4545                                        "created double reduction phi node: ");
4546                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4547                       dump_printf (MSG_NOTE, "\n");
4548                     }
4549
4550                   vect_phi_res = PHI_RESULT (vect_phi);
4551
4552                   /* Replace the use, i.e., set the correct vs1 in the regular
4553                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4554                      loop is redundant.  */
4555                   use = reduction_phi;
4556                   for (j = 0; j < ncopies; j++)
4557                     {
4558                       edge pr_edge = loop_preheader_edge (loop);
4559                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4560                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4561                     }
4562                 }
4563             }
4564         }
4565
4566       phis.release ();
4567       if (nested_in_vect_loop)
4568         {
4569           if (double_reduc)
4570             loop = outer_loop;
4571           else
4572             continue;
4573         }
4574
4575       phis.create (3);
4576       /* Find the loop-closed-use at the loop exit of the original scalar
4577          result.  (The reduction result is expected to have two immediate uses,
4578          one at the latch block, and one at the loop exit).  For double
4579          reductions we are looking for exit phis of the outer loop.  */
4580       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4581         {
4582           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4583             {
4584               if (!is_gimple_debug (USE_STMT (use_p)))
4585                 phis.safe_push (USE_STMT (use_p));
4586             }
4587           else
4588             {
4589               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4590                 {
4591                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4592
4593                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4594                     {
4595                       if (!flow_bb_inside_loop_p (loop,
4596                                              gimple_bb (USE_STMT (phi_use_p)))
4597                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4598                         phis.safe_push (USE_STMT (phi_use_p));
4599                     }
4600                 }
4601             }
4602         }
4603
4604       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4605         {
4606           /* Replace the uses:  */
4607           orig_name = PHI_RESULT (exit_phi);
4608           scalar_result = scalar_results[k];
4609           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4610             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4611               SET_USE (use_p, scalar_result);
4612         }
4613
4614       phis.release ();
4615     }
4616 }
4617
4618
4619 /* Function vectorizable_reduction.
4620
4621    Check if STMT performs a reduction operation that can be vectorized.
4622    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4623    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4624    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4625
4626    This function also handles reduction idioms (patterns) that have been
4627    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4628    of this form:
4629      X = pattern_expr (arg0, arg1, ..., X)
4630    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4631    sequence that had been detected and replaced by the pattern-stmt (STMT).
4632
4633    In some cases of reduction patterns, the type of the reduction variable X is
4634    different than the type of the other arguments of STMT.
4635    In such cases, the vectype that is used when transforming STMT into a vector
4636    stmt is different than the vectype that is used to determine the
4637    vectorization factor, because it consists of a different number of elements
4638    than the actual number of elements that are being operated upon in parallel.
4639
4640    For example, consider an accumulation of shorts into an int accumulator.
4641    On some targets it's possible to vectorize this pattern operating on 8
4642    shorts at a time (hence, the vectype for purposes of determining the
4643    vectorization factor should be V8HI); on the other hand, the vectype that
4644    is used to create the vector form is actually V4SI (the type of the result).
4645
4646    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4647    indicates what is the actual level of parallelism (V8HI in the example), so
4648    that the right vectorization factor would be derived.  This vectype
4649    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4650    be used to create the vectorized stmt.  The right vectype for the vectorized
4651    stmt is obtained from the type of the result X:
4652         get_vectype_for_scalar_type (TREE_TYPE (X))
4653
4654    This means that, contrary to "regular" reductions (or "regular" stmts in
4655    general), the following equation:
4656       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4657    does *NOT* necessarily hold for reduction patterns.  */
4658
4659 bool
4660 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4661                         gimple *vec_stmt, slp_tree slp_node)
4662 {
4663   tree vec_dest;
4664   tree scalar_dest;
4665   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4666   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4667   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4668   tree vectype_in = NULL_TREE;
4669   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4670   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4671   enum tree_code code, orig_code, epilog_reduc_code;
4672   enum machine_mode vec_mode;
4673   int op_type;
4674   optab optab, reduc_optab;
4675   tree new_temp = NULL_TREE;
4676   tree def;
4677   gimple def_stmt;
4678   enum vect_def_type dt;
4679   gimple new_phi = NULL;
4680   tree scalar_type;
4681   bool is_simple_use;
4682   gimple orig_stmt;
4683   stmt_vec_info orig_stmt_info;
4684   tree expr = NULL_TREE;
4685   int i;
4686   int ncopies;
4687   int epilog_copies;
4688   stmt_vec_info prev_stmt_info, prev_phi_info;
4689   bool single_defuse_cycle = false;
4690   tree reduc_def = NULL_TREE;
4691   gimple new_stmt = NULL;
4692   int j;
4693   tree ops[3];
4694   bool nested_cycle = false, found_nested_cycle_def = false;
4695   gimple reduc_def_stmt = NULL;
4696   /* The default is that the reduction variable is the last in statement.  */
4697   int reduc_index = 2;
4698   bool double_reduc = false, dummy;
4699   basic_block def_bb;
4700   struct loop * def_stmt_loop, *outer_loop = NULL;
4701   tree def_arg;
4702   gimple def_arg_stmt;
4703   auto_vec<tree> vec_oprnds0;
4704   auto_vec<tree> vec_oprnds1;
4705   auto_vec<tree> vect_defs;
4706   auto_vec<gimple> phis;
4707   int vec_num;
4708   tree def0, def1, tem, op0, op1 = NULL_TREE;
4709
4710   /* In case of reduction chain we switch to the first stmt in the chain, but
4711      we don't update STMT_INFO, since only the last stmt is marked as reduction
4712      and has reduction properties.  */
4713   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4714     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4715
4716   if (nested_in_vect_loop_p (loop, stmt))
4717     {
4718       outer_loop = loop;
4719       loop = loop->inner;
4720       nested_cycle = true;
4721     }
4722
4723   /* 1. Is vectorizable reduction?  */
4724   /* Not supportable if the reduction variable is used in the loop, unless
4725      it's a reduction chain.  */
4726   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4727       && !GROUP_FIRST_ELEMENT (stmt_info))
4728     return false;
4729
4730   /* Reductions that are not used even in an enclosing outer-loop,
4731      are expected to be "live" (used out of the loop).  */
4732   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4733       && !STMT_VINFO_LIVE_P (stmt_info))
4734     return false;
4735
4736   /* Make sure it was already recognized as a reduction computation.  */
4737   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4738       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4739     return false;
4740
4741   /* 2. Has this been recognized as a reduction pattern?
4742
4743      Check if STMT represents a pattern that has been recognized
4744      in earlier analysis stages.  For stmts that represent a pattern,
4745      the STMT_VINFO_RELATED_STMT field records the last stmt in
4746      the original sequence that constitutes the pattern.  */
4747
4748   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4749   if (orig_stmt)
4750     {
4751       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4752       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4753       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4754     }
4755
4756   /* 3. Check the operands of the operation.  The first operands are defined
4757         inside the loop body. The last operand is the reduction variable,
4758         which is defined by the loop-header-phi.  */
4759
4760   gcc_assert (is_gimple_assign (stmt));
4761
4762   /* Flatten RHS.  */
4763   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4764     {
4765     case GIMPLE_SINGLE_RHS:
4766       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4767       if (op_type == ternary_op)
4768         {
4769           tree rhs = gimple_assign_rhs1 (stmt);
4770           ops[0] = TREE_OPERAND (rhs, 0);
4771           ops[1] = TREE_OPERAND (rhs, 1);
4772           ops[2] = TREE_OPERAND (rhs, 2);
4773           code = TREE_CODE (rhs);
4774         }
4775       else
4776         return false;
4777       break;
4778
4779     case GIMPLE_BINARY_RHS:
4780       code = gimple_assign_rhs_code (stmt);
4781       op_type = TREE_CODE_LENGTH (code);
4782       gcc_assert (op_type == binary_op);
4783       ops[0] = gimple_assign_rhs1 (stmt);
4784       ops[1] = gimple_assign_rhs2 (stmt);
4785       break;
4786
4787     case GIMPLE_TERNARY_RHS:
4788       code = gimple_assign_rhs_code (stmt);
4789       op_type = TREE_CODE_LENGTH (code);
4790       gcc_assert (op_type == ternary_op);
4791       ops[0] = gimple_assign_rhs1 (stmt);
4792       ops[1] = gimple_assign_rhs2 (stmt);
4793       ops[2] = gimple_assign_rhs3 (stmt);
4794       break;
4795
4796     case GIMPLE_UNARY_RHS:
4797       return false;
4798
4799     default:
4800       gcc_unreachable ();
4801     }
4802
4803   if (code == COND_EXPR && slp_node)
4804     return false;
4805
4806   scalar_dest = gimple_assign_lhs (stmt);
4807   scalar_type = TREE_TYPE (scalar_dest);
4808   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4809       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4810     return false;
4811
4812   /* Do not try to vectorize bit-precision reductions.  */
4813   if ((TYPE_PRECISION (scalar_type)
4814        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4815     return false;
4816
4817   /* All uses but the last are expected to be defined in the loop.
4818      The last use is the reduction variable.  In case of nested cycle this
4819      assumption is not true: we use reduc_index to record the index of the
4820      reduction variable.  */
4821   for (i = 0; i < op_type - 1; i++)
4822     {
4823       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4824       if (i == 0 && code == COND_EXPR)
4825         continue;
4826
4827       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4828                                             &def_stmt, &def, &dt, &tem);
4829       if (!vectype_in)
4830         vectype_in = tem;
4831       gcc_assert (is_simple_use);
4832
4833       if (dt != vect_internal_def
4834           && dt != vect_external_def
4835           && dt != vect_constant_def
4836           && dt != vect_induction_def
4837           && !(dt == vect_nested_cycle && nested_cycle))
4838         return false;
4839
4840       if (dt == vect_nested_cycle)
4841         {
4842           found_nested_cycle_def = true;
4843           reduc_def_stmt = def_stmt;
4844           reduc_index = i;
4845         }
4846     }
4847
4848   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4849                                         &def_stmt, &def, &dt, &tem);
4850   if (!vectype_in)
4851     vectype_in = tem;
4852   gcc_assert (is_simple_use);
4853   if (!(dt == vect_reduction_def
4854         || dt == vect_nested_cycle
4855         || ((dt == vect_internal_def || dt == vect_external_def
4856              || dt == vect_constant_def || dt == vect_induction_def)
4857             && nested_cycle && found_nested_cycle_def)))
4858     {
4859       /* For pattern recognized stmts, orig_stmt might be a reduction,
4860          but some helper statements for the pattern might not, or
4861          might be COND_EXPRs with reduction uses in the condition.  */
4862       gcc_assert (orig_stmt);
4863       return false;
4864     }
4865   if (!found_nested_cycle_def)
4866     reduc_def_stmt = def_stmt;
4867
4868   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4869   if (orig_stmt)
4870     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4871                                                        reduc_def_stmt,
4872                                                        !nested_cycle,
4873                                                        &dummy));
4874   else
4875     {
4876       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4877                                              !nested_cycle, &dummy);
4878       /* We changed STMT to be the first stmt in reduction chain, hence we
4879          check that in this case the first element in the chain is STMT.  */
4880       gcc_assert (stmt == tmp
4881                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4882     }
4883
4884   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4885     return false;
4886
4887   if (slp_node || PURE_SLP_STMT (stmt_info))
4888     ncopies = 1;
4889   else
4890     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4891                / TYPE_VECTOR_SUBPARTS (vectype_in));
4892
4893   gcc_assert (ncopies >= 1);
4894
4895   vec_mode = TYPE_MODE (vectype_in);
4896
4897   if (code == COND_EXPR)
4898     {
4899       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4900         {
4901           if (dump_enabled_p ())
4902             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4903                              "unsupported condition in reduction\n");
4904
4905             return false;
4906         }
4907     }
4908   else
4909     {
4910       /* 4. Supportable by target?  */
4911
4912       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4913           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4914         {
4915           /* Shifts and rotates are only supported by vectorizable_shifts,
4916              not vectorizable_reduction.  */
4917           if (dump_enabled_p ())
4918             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4919                              "unsupported shift or rotation.\n");
4920           return false;
4921         }
4922
4923       /* 4.1. check support for the operation in the loop  */
4924       optab = optab_for_tree_code (code, vectype_in, optab_default);
4925       if (!optab)
4926         {
4927           if (dump_enabled_p ())
4928             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4929                              "no optab.\n");
4930
4931           return false;
4932         }
4933
4934       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4935         {
4936           if (dump_enabled_p ())
4937             dump_printf (MSG_NOTE, "op not supported by target.\n");
4938
4939           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4940               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4941                   < vect_min_worthwhile_factor (code))
4942             return false;
4943
4944           if (dump_enabled_p ())
4945             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4946         }
4947
4948       /* Worthwhile without SIMD support?  */
4949       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4950           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4951              < vect_min_worthwhile_factor (code))
4952         {
4953           if (dump_enabled_p ())
4954             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4955                              "not worthwhile without SIMD support.\n");
4956
4957           return false;
4958         }
4959     }
4960
4961   /* 4.2. Check support for the epilog operation.
4962
4963           If STMT represents a reduction pattern, then the type of the
4964           reduction variable may be different than the type of the rest
4965           of the arguments.  For example, consider the case of accumulation
4966           of shorts into an int accumulator; The original code:
4967                         S1: int_a = (int) short_a;
4968           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4969
4970           was replaced with:
4971                         STMT: int_acc = widen_sum <short_a, int_acc>
4972
4973           This means that:
4974           1. The tree-code that is used to create the vector operation in the
4975              epilog code (that reduces the partial results) is not the
4976              tree-code of STMT, but is rather the tree-code of the original
4977              stmt from the pattern that STMT is replacing.  I.e, in the example
4978              above we want to use 'widen_sum' in the loop, but 'plus' in the
4979              epilog.
4980           2. The type (mode) we use to check available target support
4981              for the vector operation to be created in the *epilog*, is
4982              determined by the type of the reduction variable (in the example
4983              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4984              However the type (mode) we use to check available target support
4985              for the vector operation to be created *inside the loop*, is
4986              determined by the type of the other arguments to STMT (in the
4987              example we'd check this: optab_handler (widen_sum_optab,
4988              vect_short_mode)).
4989
4990           This is contrary to "regular" reductions, in which the types of all
4991           the arguments are the same as the type of the reduction variable.
4992           For "regular" reductions we can therefore use the same vector type
4993           (and also the same tree-code) when generating the epilog code and
4994           when generating the code inside the loop.  */
4995
4996   if (orig_stmt)
4997     {
4998       /* This is a reduction pattern: get the vectype from the type of the
4999          reduction variable, and get the tree-code from orig_stmt.  */
5000       orig_code = gimple_assign_rhs_code (orig_stmt);
5001       gcc_assert (vectype_out);
5002       vec_mode = TYPE_MODE (vectype_out);
5003     }
5004   else
5005     {
5006       /* Regular reduction: use the same vectype and tree-code as used for
5007          the vector code inside the loop can be used for the epilog code. */
5008       orig_code = code;
5009     }
5010
5011   if (nested_cycle)
5012     {
5013       def_bb = gimple_bb (reduc_def_stmt);
5014       def_stmt_loop = def_bb->loop_father;
5015       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5016                                        loop_preheader_edge (def_stmt_loop));
5017       if (TREE_CODE (def_arg) == SSA_NAME
5018           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5019           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5020           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5021           && vinfo_for_stmt (def_arg_stmt)
5022           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5023               == vect_double_reduction_def)
5024         double_reduc = true;
5025     }
5026
5027   epilog_reduc_code = ERROR_MARK;
5028   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5029     {
5030       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5031                                          optab_default);
5032       if (!reduc_optab)
5033         {
5034           if (dump_enabled_p ())
5035             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5036                              "no optab for reduction.\n");
5037
5038           epilog_reduc_code = ERROR_MARK;
5039         }
5040
5041       if (reduc_optab
5042           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5043         {
5044           if (dump_enabled_p ())
5045             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5046                              "reduc op not supported by target.\n");
5047
5048           epilog_reduc_code = ERROR_MARK;
5049         }
5050     }
5051   else
5052     {
5053       if (!nested_cycle || double_reduc)
5054         {
5055           if (dump_enabled_p ())
5056             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5057                              "no reduc code for scalar code.\n");
5058
5059           return false;
5060         }
5061     }
5062
5063   if (double_reduc && ncopies > 1)
5064     {
5065       if (dump_enabled_p ())
5066         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5067                          "multiple types in double reduction\n");
5068
5069       return false;
5070     }
5071
5072   /* In case of widenning multiplication by a constant, we update the type
5073      of the constant to be the type of the other operand.  We check that the
5074      constant fits the type in the pattern recognition pass.  */
5075   if (code == DOT_PROD_EXPR
5076       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5077     {
5078       if (TREE_CODE (ops[0]) == INTEGER_CST)
5079         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5080       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5081         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5082       else
5083         {
5084           if (dump_enabled_p ())
5085             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5086                              "invalid types in dot-prod\n");
5087
5088           return false;
5089         }
5090     }
5091
5092   if (!vec_stmt) /* transformation not required.  */
5093     {
5094       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5095         return false;
5096       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5097       return true;
5098     }
5099
5100   /** Transform.  **/
5101
5102   if (dump_enabled_p ())
5103     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5104
5105   /* FORNOW: Multiple types are not supported for condition.  */
5106   if (code == COND_EXPR)
5107     gcc_assert (ncopies == 1);
5108
5109   /* Create the destination vector  */
5110   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5111
5112   /* In case the vectorization factor (VF) is bigger than the number
5113      of elements that we can fit in a vectype (nunits), we have to generate
5114      more than one vector stmt - i.e - we need to "unroll" the
5115      vector stmt by a factor VF/nunits.  For more details see documentation
5116      in vectorizable_operation.  */
5117
5118   /* If the reduction is used in an outer loop we need to generate
5119      VF intermediate results, like so (e.g. for ncopies=2):
5120         r0 = phi (init, r0)
5121         r1 = phi (init, r1)
5122         r0 = x0 + r0;
5123         r1 = x1 + r1;
5124     (i.e. we generate VF results in 2 registers).
5125     In this case we have a separate def-use cycle for each copy, and therefore
5126     for each copy we get the vector def for the reduction variable from the
5127     respective phi node created for this copy.
5128
5129     Otherwise (the reduction is unused in the loop nest), we can combine
5130     together intermediate results, like so (e.g. for ncopies=2):
5131         r = phi (init, r)
5132         r = x0 + r;
5133         r = x1 + r;
5134    (i.e. we generate VF/2 results in a single register).
5135    In this case for each copy we get the vector def for the reduction variable
5136    from the vectorized reduction operation generated in the previous iteration.
5137   */
5138
5139   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5140     {
5141       single_defuse_cycle = true;
5142       epilog_copies = 1;
5143     }
5144   else
5145     epilog_copies = ncopies;
5146
5147   prev_stmt_info = NULL;
5148   prev_phi_info = NULL;
5149   if (slp_node)
5150     {
5151       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5152       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5153                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5154     }
5155   else
5156     {
5157       vec_num = 1;
5158       vec_oprnds0.create (1);
5159       if (op_type == ternary_op)
5160         vec_oprnds1.create (1);
5161     }
5162
5163   phis.create (vec_num);
5164   vect_defs.create (vec_num);
5165   if (!slp_node)
5166     vect_defs.quick_push (NULL_TREE);
5167
5168   for (j = 0; j < ncopies; j++)
5169     {
5170       if (j == 0 || !single_defuse_cycle)
5171         {
5172           for (i = 0; i < vec_num; i++)
5173             {
5174               /* Create the reduction-phi that defines the reduction
5175                  operand.  */
5176               new_phi = create_phi_node (vec_dest, loop->header);
5177               set_vinfo_for_stmt (new_phi,
5178                                   new_stmt_vec_info (new_phi, loop_vinfo,
5179                                                      NULL));
5180                if (j == 0 || slp_node)
5181                  phis.quick_push (new_phi);
5182             }
5183         }
5184
5185       if (code == COND_EXPR)
5186         {
5187           gcc_assert (!slp_node);
5188           vectorizable_condition (stmt, gsi, vec_stmt,
5189                                   PHI_RESULT (phis[0]),
5190                                   reduc_index, NULL);
5191           /* Multiple types are not supported for condition.  */
5192           break;
5193         }
5194
5195       /* Handle uses.  */
5196       if (j == 0)
5197         {
5198           op0 = ops[!reduc_index];
5199           if (op_type == ternary_op)
5200             {
5201               if (reduc_index == 0)
5202                 op1 = ops[2];
5203               else
5204                 op1 = ops[1];
5205             }
5206
5207           if (slp_node)
5208             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5209                                slp_node, -1);
5210           else
5211             {
5212               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5213                                                             stmt, NULL);
5214               vec_oprnds0.quick_push (loop_vec_def0);
5215               if (op_type == ternary_op)
5216                {
5217                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5218                                                                NULL);
5219                  vec_oprnds1.quick_push (loop_vec_def1);
5220                }
5221             }
5222         }
5223       else
5224         {
5225           if (!slp_node)
5226             {
5227               enum vect_def_type dt;
5228               gimple dummy_stmt;
5229               tree dummy;
5230
5231               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5232                                   &dummy_stmt, &dummy, &dt);
5233               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5234                                                               loop_vec_def0);
5235               vec_oprnds0[0] = loop_vec_def0;
5236               if (op_type == ternary_op)
5237                 {
5238                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5239                                       &dummy, &dt);
5240                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5241                                                                 loop_vec_def1);
5242                   vec_oprnds1[0] = loop_vec_def1;
5243                 }
5244             }
5245
5246           if (single_defuse_cycle)
5247             reduc_def = gimple_assign_lhs (new_stmt);
5248
5249           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5250         }
5251
5252       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5253         {
5254           if (slp_node)
5255             reduc_def = PHI_RESULT (phis[i]);
5256           else
5257             {
5258               if (!single_defuse_cycle || j == 0)
5259                 reduc_def = PHI_RESULT (new_phi);
5260             }
5261
5262           def1 = ((op_type == ternary_op)
5263                   ? vec_oprnds1[i] : NULL);
5264           if (op_type == binary_op)
5265             {
5266               if (reduc_index == 0)
5267                 expr = build2 (code, vectype_out, reduc_def, def0);
5268               else
5269                 expr = build2 (code, vectype_out, def0, reduc_def);
5270             }
5271           else
5272             {
5273               if (reduc_index == 0)
5274                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5275               else
5276                 {
5277                   if (reduc_index == 1)
5278                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5279                   else
5280                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5281                 }
5282             }
5283
5284           new_stmt = gimple_build_assign (vec_dest, expr);
5285           new_temp = make_ssa_name (vec_dest, new_stmt);
5286           gimple_assign_set_lhs (new_stmt, new_temp);
5287           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5288
5289           if (slp_node)
5290             {
5291               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5292               vect_defs.quick_push (new_temp);
5293             }
5294           else
5295             vect_defs[0] = new_temp;
5296         }
5297
5298       if (slp_node)
5299         continue;
5300
5301       if (j == 0)
5302         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5303       else
5304         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5305
5306       prev_stmt_info = vinfo_for_stmt (new_stmt);
5307       prev_phi_info = vinfo_for_stmt (new_phi);
5308     }
5309
5310   /* Finalize the reduction-phi (set its arguments) and create the
5311      epilog reduction code.  */
5312   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5313     {
5314       new_temp = gimple_assign_lhs (*vec_stmt);
5315       vect_defs[0] = new_temp;
5316     }
5317
5318   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5319                                     epilog_reduc_code, phis, reduc_index,
5320                                     double_reduc, slp_node);
5321
5322   return true;
5323 }
5324
5325 /* Function vect_min_worthwhile_factor.
5326
5327    CODE is the tree code of an operation in a loop that could be vectorized
5328    using generic vectors.  Return the smallest vectorization factor for
5329    which doing so is worthwhile.  INT_MAX is returned for every code that
5330    is not listed below.  */
5331 int
5332 vect_min_worthwhile_factor (enum tree_code code)
5333 {
5334   /* Addition, subtraction and negation: minimum factor of 4.  */
5335   if (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
5336     return 4;
5337
5338   /* Bitwise and, or, xor and not: minimum factor of 2.  */
5339   if (code == BIT_AND_EXPR
5340       || code == BIT_IOR_EXPR
5341       || code == BIT_XOR_EXPR
5342       || code == BIT_NOT_EXPR)
5343     return 2;
5344
5345   /* Any other operation.  */
5346   return INT_MAX;
5347 }
5350
5351
5352 /* Function vectorizable_induction
5353
5354    Check if PHI performs an induction computation that can be vectorized.
5355    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5356    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5357    Return FALSE if not a vectorizable STMT, TRUE otherwise.
        When VEC_STMT is NULL only the analysis is performed: STMT_VINFO_TYPE is
        set to induc_vec_info_type and the induction cost is modelled.  GSI is
        unused.  Debug stmts are ignored when looking for uses of the induction
        outside the inner loop, so that the result does not depend on the
        presence of debug info.  */
5358
5359 bool
5360 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5361                         gimple *vec_stmt)
5362 {
5363   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5364   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5365   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5366   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5367   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5368   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5369   tree vec_def;
5370
5371   gcc_assert (ncopies >= 1);
5372   /* FORNOW. These restrictions should be relaxed.  */
5373   if (nested_in_vect_loop_p (loop, phi))
5374     {
5375       imm_use_iterator imm_iter;
5376       use_operand_p use_p;
5377       gimple exit_phi;
5378       edge latch_e;
5379       tree loop_arg;
5380
5381       if (ncopies > 1)
5382         {
5383           if (dump_enabled_p ())
5384             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5385                              "multiple types in nested loop.\n");
5386           return false;
5387         }
5388
           /* Find the first non-debug use, outside the inner loop, of the
              value PHI receives over the inner loop's latch edge.  */
5389       exit_phi = NULL;
5390       latch_e = loop_latch_edge (loop->inner);
5391       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5392       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5393         {
               /* Debug stmts must not influence the decision; otherwise the
                  outcome would differ with and without debug info.  */
               if (is_gimple_debug (USE_STMT (use_p)))
                 continue;
5394           if (!flow_bb_inside_loop_p (loop->inner,
5395                                       gimple_bb (USE_STMT (use_p))))
5396             {
5397               exit_phi = USE_STMT (use_p);
5398               break;
5399             }
5400         }
5401       if (exit_phi)
5402         {
5403           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
               /* Reject unless that use is relevant and not live.  */
5404           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5405                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5406             {
5407               if (dump_enabled_p ())
5408                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5409                                  "inner-loop induction only used outside "
5410                                  "of the outer vectorized loop.\n");
5411               return false;
5412             }
5413         }
5414     }
5415
5416   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5417     return false;
5418
5419   /* FORNOW: SLP not supported.  */
5420   if (STMT_SLP_TYPE (stmt_info))
5421     return false;
5422
5423   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5424
5425   if (gimple_code (phi) != GIMPLE_PHI)
5426     return false;
5427
5428   if (!vec_stmt) /* transformation not required.  */
5429     {
5430       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5431       if (dump_enabled_p ())
5432         dump_printf_loc (MSG_NOTE, vect_location,
5433                          "=== vectorizable_induction ===\n");
5434       vect_model_induction_cost (stmt_info, ncopies);
5435       return true;
5436     }
5437
5438   /** Transform.  **/
5439
5440   if (dump_enabled_p ())
5441     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5442
       /* The vectorized induction is the stmt defining the initial vector
          def returned for PHI.  */
5443   vec_def = get_initial_def_for_induction (phi);
5444   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5445   return true;
5446 }
5447
5448 /* Function vectorizable_live_operation.
5449
5450    STMT computes a value that is used outside the loop.  Check if
5451    it can be supported.  */
5452
5453 bool
5454 vectorizable_live_operation (gimple stmt,
5455                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5456                              gimple *vec_stmt)
5457 {
5458   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5459   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5460   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5461   int i;
5462   int op_type;
5463   tree op;
5464   tree def;
5465   gimple def_stmt;
5466   enum vect_def_type dt;
5467   enum tree_code code;
5468   enum gimple_rhs_class rhs_class;
5469
5470   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5471
5472   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5473     return false;
5474
5475   if (!is_gimple_assign (stmt))
5476     {
5477       if (gimple_call_internal_p (stmt)
5478           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5479           && gimple_call_lhs (stmt)
5480           && loop->simduid
5481           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5482           && loop->simduid
5483              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5484         {
5485           edge e = single_exit (loop);
5486           basic_block merge_bb = e->dest;
5487           imm_use_iterator imm_iter;
5488           use_operand_p use_p;
5489           tree lhs = gimple_call_lhs (stmt);
5490
5491           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5492             {
5493               gimple use_stmt = USE_STMT (use_p);
5494               if (gimple_code (use_stmt) == GIMPLE_PHI
5495                   || gimple_bb (use_stmt) == merge_bb)
5496                 {
5497                   if (vec_stmt)
5498                     {
5499                       tree vfm1
5500                         = build_int_cst (unsigned_type_node,
5501                                          loop_vinfo->vectorization_factor - 1);
5502                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5503                     }
5504                   return true;
5505                 }
5506             }
5507         }
5508
5509       return false;
5510     }
5511
5512   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5513     return false;
5514
5515   /* FORNOW. CHECKME. */
5516   if (nested_in_vect_loop_p (loop, stmt))
5517     return false;
5518
5519   code = gimple_assign_rhs_code (stmt);
5520   op_type = TREE_CODE_LENGTH (code);
5521   rhs_class = get_gimple_rhs_class (code);
5522   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5523   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5524
5525   /* FORNOW: support only if all uses are invariant.  This means
5526      that the scalar operations can remain in place, unvectorized.
5527      The original last scalar value that they compute will be used.  */
5528
5529   for (i = 0; i < op_type; i++)
5530     {
5531       if (rhs_class == GIMPLE_SINGLE_RHS)
5532         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5533       else
5534         op = gimple_op (stmt, i + 1);
5535       if (op
5536           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5537                                   &dt))
5538         {
5539           if (dump_enabled_p ())
5540             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5541                              "use not simple.\n");
5542           return false;
5543         }
5544
5545       if (dt != vect_external_def && dt != vect_constant_def)
5546         return false;
5547     }
5548
5549   /* No transformation is required for the cases we currently support.  */
5550   return true;
5551 }
5552
5553 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5554
5555 static void
5556 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5557 {
5558   ssa_op_iter op_iter;
5559   imm_use_iterator imm_iter;
5560   def_operand_p def_p;
5561   gimple ustmt;
5562
5563   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5564     {
5565       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5566         {
5567           basic_block bb;
5568
5569           if (!is_gimple_debug (ustmt))
5570             continue;
5571
5572           bb = gimple_bb (ustmt);
5573
5574           if (!flow_bb_inside_loop_p (loop, bb))
5575             {
5576               if (gimple_debug_bind_p (ustmt))
5577                 {
5578                   if (dump_enabled_p ())
5579                     dump_printf_loc (MSG_NOTE, vect_location,
5580                                      "killing debug use\n");
5581
5582                   gimple_debug_bind_reset_value (ustmt);
5583                   update_stmt (ustmt);
5584                 }
5585               else
5586                 gcc_unreachable ();
5587             }
5588         }
5589     }
5590 }
5591
5592
5593 /* This function builds ni_name = number of iterations.  Statements
5594    are emitted on the loop preheader edge.  */
5595
5596 static tree
5597 vect_build_loop_niters (loop_vec_info loop_vinfo)
5598 {
5599   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5600   if (TREE_CODE (ni) == INTEGER_CST)
5601     return ni;
5602   else
5603     {
5604       tree ni_name, var;
5605       gimple_seq stmts = NULL;
5606       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5607
5608       var = create_tmp_var (TREE_TYPE (ni), "niters");
5609       ni_name = force_gimple_operand (ni, &stmts, false, var);
5610       if (stmts)
5611         gsi_insert_seq_on_edge_immediate (pe, stmts);
5612
5613       return ni_name;
5614     }
5615 }
5616
5617
5618 /* This function generates the following statements:
5619
5620    ni_name = number of iterations loop executes
5621    ratio = ni_name / vf
5622    ratio_mult_vf_name = ratio * vf
5623
5624    and places them on the loop preheader edge.  */
5625
5626 static void
5627 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5628                                  tree ni_name,
5629                                  tree *ratio_mult_vf_name_ptr,
5630                                  tree *ratio_name_ptr)
5631 {
5632   tree ni_minus_gap_name;
5633   tree var;
5634   tree ratio_name;
5635   tree ratio_mult_vf_name;
5636   tree ni = LOOP_VINFO_NITERS (loop_vinfo);
5637   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5638   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5639   tree log_vf;
5640
5641   log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
5642
5643   /* If epilogue loop is required because of data accesses with gaps, we
5644      subtract one iteration from the total number of iterations here for
5645      correct calculation of RATIO.  */
5646   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5647     {
5648       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5649                                        ni_name,
5650                                        build_one_cst (TREE_TYPE (ni_name)));
5651       if (!is_gimple_val (ni_minus_gap_name))
5652         {
5653           var = create_tmp_var (TREE_TYPE (ni), "ni_gap");
5654           gimple stmts = NULL;
5655           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5656                                                     true, var);
5657           gsi_insert_seq_on_edge_immediate (pe, stmts);
5658         }
5659     }
5660   else
5661     ni_minus_gap_name = ni_name;
5662
5663   /* Create: ratio = ni >> log2(vf) */
5664
5665   ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_minus_gap_name),
5666                             ni_minus_gap_name, log_vf);
5667   if (!is_gimple_val (ratio_name))
5668     {
5669       var = create_tmp_var (TREE_TYPE (ni), "bnd");
5670       gimple stmts = NULL;
5671       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5672       gsi_insert_seq_on_edge_immediate (pe, stmts);
5673     }
5674   *ratio_name_ptr = ratio_name;
5675
5676   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5677
5678   if (ratio_mult_vf_name_ptr)
5679     {
5680       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5681                                         ratio_name, log_vf);
5682       if (!is_gimple_val (ratio_mult_vf_name))
5683         {
5684           var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
5685           gimple stmts = NULL;
5686           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5687                                                      true, var);
5688           gsi_insert_seq_on_edge_immediate (pe, stmts);
5689         }
5690       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5691     }
5692
5693   return;
5694 }
5695
5696
5697 /* Function vect_transform_loop.
5698
5699    The analysis phase has determined that the loop is vectorizable.
5700    Vectorize the loop - created vectorized stmts to replace the scalar
5701    stmts in the loop, and update the loop exit condition.  */
5702
5703 void
5704 vect_transform_loop (loop_vec_info loop_vinfo)
5705 {
5706   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5707   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5708   int nbbs = loop->num_nodes;
5709   gimple_stmt_iterator si;
5710   int i;
5711   tree ratio = NULL;
5712   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5713   bool grouped_store;
5714   bool slp_scheduled = false;
5715   gimple stmt, pattern_stmt;
5716   gimple_seq pattern_def_seq = NULL;
5717   gimple_stmt_iterator pattern_def_si = gsi_none ();
5718   bool transform_pattern_stmt = false;
5719   bool check_profitability = false;
5720   int th;
5721   /* Record number of iterations before we started tampering with the profile. */
5722   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5723
5724   if (dump_enabled_p ())
5725     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5726
5727   /* If profile is inprecise, we have chance to fix it up.  */
5728   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5729     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5730
5731   /* Use the more conservative vectorization threshold.  If the number
5732      of iterations is constant assume the cost check has been performed
5733      by our caller.  If the threshold makes all loops profitable that
5734      run at least the vectorization factor number of times checking
5735      is pointless, too.  */
5736   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5737          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5738   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5739   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5740       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5741     {
5742       if (dump_enabled_p ())
5743         dump_printf_loc (MSG_NOTE, vect_location,
5744                          "Profitability threshold is %d loop iterations.\n",
5745                          th);
5746       check_profitability = true;
5747     }
5748
5749   /* Version the loop first, if required, so the profitability check
5750      comes first.  */
5751
5752   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5753       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5754     {
5755       vect_loop_versioning (loop_vinfo, th, check_profitability);
5756       check_profitability = false;
5757     }
5758
5759   tree ni_name = vect_build_loop_niters (loop_vinfo);
5760   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5761
5762   /* Peel the loop if there are data refs with unknown alignment.
5763      Only one data ref with unknown store is allowed.  */
5764
5765   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5766     {
5767       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5768                                      th, check_profitability);
5769       check_profitability = false;
5770       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5771          be re-computed.  */
5772       ni_name = NULL_TREE;
5773     }
5774
5775   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5776      compile time constant), or it is a constant that doesn't divide by the
5777      vectorization factor, then an epilog loop needs to be created.
5778      We therefore duplicate the loop: the original loop will be vectorized,
5779      and will compute the first (n/VF) iterations.  The second copy of the loop
5780      will remain scalar and will compute the remaining (n%VF) iterations.
5781      (VF is the vectorization factor).  */
5782
5783   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5784       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5785     {
5786       tree ratio_mult_vf;
5787       if (!ni_name)
5788         ni_name = vect_build_loop_niters (loop_vinfo);
5789       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5790                                        &ratio);
5791       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5792                                       th, check_profitability);
5793     }
5794   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5795     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5796                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5797   else
5798     {
5799       if (!ni_name)
5800         ni_name = vect_build_loop_niters (loop_vinfo);
5801       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5802     }
5803
5804   /* 1) Make sure the loop header has exactly two entries
5805      2) Make sure we have a preheader basic block.  */
5806
5807   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5808
5809   split_edge (loop_preheader_edge (loop));
5810
5811   /* FORNOW: the vectorizer supports only loops which body consist
5812      of one basic block (header + empty latch). When the vectorizer will
5813      support more involved loop forms, the order by which the BBs are
5814      traversed need to be reconsidered.  */
5815
5816   for (i = 0; i < nbbs; i++)
5817     {
5818       basic_block bb = bbs[i];
5819       stmt_vec_info stmt_info;
5820       gimple phi;
5821
5822       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5823         {
5824           phi = gsi_stmt (si);
5825           if (dump_enabled_p ())
5826             {
5827               dump_printf_loc (MSG_NOTE, vect_location,
5828                                "------>vectorizing phi: ");
5829               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5830               dump_printf (MSG_NOTE, "\n");
5831             }
5832           stmt_info = vinfo_for_stmt (phi);
5833           if (!stmt_info)
5834             continue;
5835
5836           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5837             vect_loop_kill_debug_uses (loop, phi);
5838
5839           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5840               && !STMT_VINFO_LIVE_P (stmt_info))
5841             continue;
5842
5843           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5844                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5845               && dump_enabled_p ())
5846             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5847
5848           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5849             {
5850               if (dump_enabled_p ())
5851                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5852               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5853             }
5854         }
5855
5856       pattern_stmt = NULL;
5857       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5858         {
5859           bool is_store;
5860
5861           if (transform_pattern_stmt)
5862             stmt = pattern_stmt;
5863           else
5864             {
5865               stmt = gsi_stmt (si);
5866               /* During vectorization remove existing clobber stmts.  */
5867               if (gimple_clobber_p (stmt))
5868                 {
5869                   unlink_stmt_vdef (stmt);
5870                   gsi_remove (&si, true);
5871                   release_defs (stmt);
5872                   continue;
5873                 }
5874             }
5875
5876           if (dump_enabled_p ())
5877             {
5878               dump_printf_loc (MSG_NOTE, vect_location,
5879                                "------>vectorizing statement: ");
5880               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5881               dump_printf (MSG_NOTE, "\n");
5882             }
5883
5884           stmt_info = vinfo_for_stmt (stmt);
5885
5886           /* vector stmts created in the outer-loop during vectorization of
5887              stmts in an inner-loop may not have a stmt_info, and do not
5888              need to be vectorized.  */
5889           if (!stmt_info)
5890             {
5891               gsi_next (&si);
5892               continue;
5893             }
5894
5895           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5896             vect_loop_kill_debug_uses (loop, stmt);
5897
5898           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5899               && !STMT_VINFO_LIVE_P (stmt_info))
5900             {
5901               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5902                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5903                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5904                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5905                 {
5906                   stmt = pattern_stmt;
5907                   stmt_info = vinfo_for_stmt (stmt);
5908                 }
5909               else
5910                 {
5911                   gsi_next (&si);
5912                   continue;
5913                 }
5914             }
5915           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5916                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5917                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5918                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5919             transform_pattern_stmt = true;
5920
5921           /* If pattern statement has def stmts, vectorize them too.  */
5922           if (is_pattern_stmt_p (stmt_info))
5923             {
5924               if (pattern_def_seq == NULL)
5925                 {
5926                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5927                   pattern_def_si = gsi_start (pattern_def_seq);
5928                 }
5929               else if (!gsi_end_p (pattern_def_si))
5930                 gsi_next (&pattern_def_si);
5931               if (pattern_def_seq != NULL)
5932                 {
5933                   gimple pattern_def_stmt = NULL;
5934                   stmt_vec_info pattern_def_stmt_info = NULL;
5935
5936                   while (!gsi_end_p (pattern_def_si))
5937                     {
5938                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5939                       pattern_def_stmt_info
5940                         = vinfo_for_stmt (pattern_def_stmt);
5941                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5942                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5943                         break;
5944                       gsi_next (&pattern_def_si);
5945                     }
5946
5947                   if (!gsi_end_p (pattern_def_si))
5948                     {
5949                       if (dump_enabled_p ())
5950                         {
5951                           dump_printf_loc (MSG_NOTE, vect_location,
5952                                            "==> vectorizing pattern def "
5953                                            "stmt: ");
5954                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5955                                             pattern_def_stmt, 0);
5956                           dump_printf (MSG_NOTE, "\n");
5957                         }
5958
5959                       stmt = pattern_def_stmt;
5960                       stmt_info = pattern_def_stmt_info;
5961                     }
5962                   else
5963                     {
5964                       pattern_def_si = gsi_none ();
5965                       transform_pattern_stmt = false;
5966                     }
5967                 }
5968               else
5969                 transform_pattern_stmt = false;
5970             }
5971
5972           if (STMT_VINFO_VECTYPE (stmt_info))
5973             {
5974               unsigned int nunits
5975                 = (unsigned int)
5976                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
5977               if (!STMT_SLP_TYPE (stmt_info)
5978                   && nunits != (unsigned int) vectorization_factor
5979                   && dump_enabled_p ())
5980                   /* For SLP VF is set according to unrolling factor, and not
5981                      to vector size, hence for SLP this print is not valid.  */
5982                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5983             }
5984
5985           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5986              reached.  */
5987           if (STMT_SLP_TYPE (stmt_info))
5988             {
5989               if (!slp_scheduled)
5990                 {
5991                   slp_scheduled = true;
5992
5993                   if (dump_enabled_p ())
5994                     dump_printf_loc (MSG_NOTE, vect_location,
5995                                      "=== scheduling SLP instances ===\n");
5996
5997                   vect_schedule_slp (loop_vinfo, NULL);
5998                 }
5999
6000               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6001               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6002                 {
6003                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6004                     {
6005                       pattern_def_seq = NULL;
6006                       gsi_next (&si);
6007                     }
6008                   continue;
6009                 }
6010             }
6011
6012           /* -------- vectorize statement ------------ */
6013           if (dump_enabled_p ())
6014             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6015
6016           grouped_store = false;
6017           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6018           if (is_store)
6019             {
6020               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6021                 {
6022                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6023                      interleaving chain was completed - free all the stores in
6024                      the chain.  */
6025                   gsi_next (&si);
6026                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6027                   continue;
6028                 }
6029               else
6030                 {
6031                   /* Free the attached stmt_vec_info and remove the stmt.  */
6032                   gimple store = gsi_stmt (si);
6033                   free_stmt_vec_info (store);
6034                   unlink_stmt_vdef (store);
6035                   gsi_remove (&si, true);
6036                   release_defs (store);
6037                   continue;
6038                 }
6039             }
6040
6041           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6042             {
6043               pattern_def_seq = NULL;
6044               gsi_next (&si);
6045             }
6046         }                       /* stmts in BB */
6047     }                           /* BBs in loop */
6048
6049   slpeel_make_loop_iterate_ntimes (loop, ratio);
6050
6051   /* Reduce loop iterations by the vectorization factor.  */
6052   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6053                       expected_iterations / vectorization_factor);
6054   loop->nb_iterations_upper_bound
6055     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
6056                                             FLOOR_DIV_EXPR);
6057   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6058       && loop->nb_iterations_upper_bound != double_int_zero)
6059     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
6060   if (loop->any_estimate)
6061     {
6062       loop->nb_iterations_estimate
6063         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
6064                                              FLOOR_DIV_EXPR);
6065        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6066            && loop->nb_iterations_estimate != double_int_zero)
6067          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
6068     }
6069
6070   if (dump_enabled_p ())
6071     {
6072       dump_printf_loc (MSG_NOTE, vect_location,
6073                        "LOOP VECTORIZED\n");
6074       if (loop->inner)
6075         dump_printf_loc (MSG_NOTE, vect_location,
6076                          "OUTER LOOP VECTORIZED\n");
6077       dump_printf (MSG_NOTE, "\n");
6078     }
6079 }