gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "basic-block.h"
  30 #include "gimple-pretty-print.h"
  31 #include "tree-ssa-alias.h"
  32 #include "internal-fn.h"
  33 #include "gimple-expr.h"
  34 #include "is-a.h"
  35 #include "gimple.h"
  36 #include "gimplify.h"
  37 #include "gimple-iterator.h"
  38 #include "gimplify-me.h"
  39 #include "gimple-ssa.h"
  40 #include "tree-phinodes.h"
  41 #include "ssa-iterators.h"
  42 #include "stringpool.h"
  43 #include "tree-ssanames.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-pass.h"
  48 #include "cfgloop.h"
  49 #include "expr.h"
  50 #include "recog.h"
  51 #include "optabs.h"
  52 #include "params.h"
  53 #include "diagnostic-core.h"
  54 #include "tree-chrec.h"
  55 #include "tree-scalar-evolution.h"
  56 #include "tree-vectorizer.h"
  57 #include "target.h"
  58
  59 /* Loop Vectorization Pass.
  60
  61    This pass tries to vectorize loops.
  62
  63    For example, the vectorizer transforms the following simple loop:
  64
  65         short a[N]; short b[N]; short c[N]; int i;
  66
  67         for (i=0; i<N; i++){
  68           a[i] = b[i] + c[i];
  69         }
  70
  71    as if it was manually vectorized by rewriting the source code into:
  72
  73         typedef int __attribute__((mode(V8HI))) v8hi;
  74         short a[N];  short b[N]; short c[N];   int i;
  75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  76         v8hi va, vb, vc;
  77
  78         for (i=0; i<N/8; i++){
  79           vb = pb[i];
  80           vc = pc[i];
  81           va = vb + vc;
  82           pa[i] = va;
  83         }
  84
  85         The main entry to this pass is vectorize_loops(), in which
  86    the vectorizer applies a set of analyses on a given set of loops,
  87    followed by the actual vectorization transformation for the loops that
  88    had successfully passed the analysis phase.
  89         Throughout this pass we make a distinction between two types of
  90    data: scalars (which are represented by SSA_NAMES), and memory references
  91    ("data-refs").  These two types of data require different handling both
  92    during analysis and transformation. The types of data-refs that the
  93    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  95    accesses are required to have a simple (consecutive) access pattern.
  96
  97    Analysis phase:
  98    ===============
  99         The driver for the analysis phase is vect_analyze_loop().
 100    It applies a set of analyses, some of which rely on the scalar evolution
 101    analyzer (scev) developed by Sebastian Pop.
 102
 103         During the analysis phase the vectorizer records some information
 104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 105    loop, as well as general information about the loop as a whole, which is
 106    recorded in a "loop_vec_info" struct attached to each loop.
 107
 108    Transformation phase:
 109    =====================
 110         The loop transformation phase scans all the stmts in the loop, and
 111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 112    the loop that needs to be vectorized.  It inserts the vector code sequence
 113    just before the scalar stmt S, and records a pointer to the vector code
 114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 115    attached to S).  This pointer will be used for the vectorization of following
 116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 117    otherwise, we rely on dead code elimination for removing it.
 118
 119         For example, say stmt S1 was vectorized into stmt VS1:
 120
 121    VS1: vb = px[i];
 122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 123    S2:  a = b;
 124
 125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 128    resulting sequence would be:
 129
 130    VS1: vb = px[i];
 131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 132    VS2: va = vb;
 133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134
 135         Operands that are not SSA_NAMEs are data-refs that appear in
 136    load/store operations (like 'x[i]' in S1), and are handled differently.
 137
 138    Target modeling:
 139    =================
 140         Currently the only target specific information that is used is the
 141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 142    Targets that can support different sizes of vectors, for now will need
 143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 144    flexibility will be added in the future.
 145
 146         Since we only vectorize operations whose vector form can be
 147    expressed using existing tree codes, to verify that an operation is
 148    supported, the vectorizer checks the relevant optab at the relevant
 149    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 150    the value found is CODE_FOR_nothing, then there's no target support, and
 151    we can't vectorize the stmt.
 152
 153    For additional information on this project see:
 154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 155 */
 156
     /* Forward declaration.  The function is static, so its definition must
        appear elsewhere in this translation unit (it is not visible in this
        part of the listing).  */
 157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 158
 159 /* Function vect_determine_vectorization_factor
 160
 161    Determine the vectorization factor (VF).  VF is the number of data elements
 162    that are operated upon in parallel in a single iteration of the vectorized
 163    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 164    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 165    elements can fit in a single vector register.
 166
 167    The VF is the largest TYPE_VECTOR_SUBPARTS found over all analyzed phis and
 168    stmts.  For a stmt the count comes from the vector type of its smallest
 169    scalar type; that vector type must have the same mode size as the stmt's
 170    own vector type, otherwise the loop is rejected.
 171
 172    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 173    original loop:
 174         for (i=0; i<N; i++){
 175           a[i] = b[i] + c[i];
 176         }
 177
 178    vectorized loop:
 179         for (i=0; i<N; i+=VF){
 180           a[i:VF] = b[i:VF] + c[i:VF];
 181         }

        Returns true and stores the VF in LOOP_VINFO_VECT_FACTOR (LOOP_VINFO) on
        success.  Returns false (after a MSG_MISSED_OPTIMIZATION dump when
        dumping is enabled) when
          - no vector type can be obtained for a scalar type,
          - a stmt without a lhs is not a call,
          - the expression type of a stmt already has a vector mode,
          - the two vector types of a stmt differ in mode size, or
          - the resulting VF is not greater than 1.
        As a side effect STMT_VINFO_VECTYPE is set for every analyzed phi or
        stmt that did not have a vectype yet.
 182 */
 183
 184 static bool
 185 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 186 {
 187   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 188   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 189   int nbbs = loop->num_nodes;
 190   gimple_stmt_iterator si;
       /* Largest vector element count seen so far; 0 means none yet.  */
 191   unsigned int vectorization_factor = 0;
 192   tree scalar_type;
 193   gimple phi;
 194   tree vectype;
 195   unsigned int nunits;
 196   stmt_vec_info stmt_info;
 197   int i;
       /* Sink for the two size outputs of vect_get_smallest_scalar_type, which
          are not used by this function.  */
 198   HOST_WIDE_INT dummy;
       /* State for analyzing pattern stmts and their def sequences; it persists
          across iterations of the statement loop below.  */
 199   gimple stmt, pattern_stmt = NULL;
 200   gimple_seq pattern_def_seq = NULL;
 201   gimple_stmt_iterator pattern_def_si = gsi_none ();
 202   bool analyze_pattern_stmt = false;
 203
 204   if (dump_enabled_p ())
 205     dump_printf_loc (MSG_NOTE, vect_location,
 206                      "=== vect_determine_vectorization_factor ===\n");
 207
 208   for (i = 0; i < nbbs; i++)
 209     {
 210       basic_block bb = bbs[i];
 211
           /* First the PHI nodes of BB: every relevant PHI gets a vectype
              derived from the type of its result.  */
 212       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 213         {
 214           phi = gsi_stmt (si);
 215           stmt_info = vinfo_for_stmt (phi);
 216           if (dump_enabled_p ())
 217             {
 218               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 219               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 220               dump_printf (MSG_NOTE, "\n");
 221             }
 222
 223           gcc_assert (stmt_info);
 224
 225           if (STMT_VINFO_RELEVANT_P (stmt_info))
 226             {
 227               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 229
 230               if (dump_enabled_p ())
 231                 {
 232                   dump_printf_loc (MSG_NOTE, vect_location,
 233                                    "get vectype for scalar type:  ");
 234                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 235                   dump_printf (MSG_NOTE, "\n");
 236                 }
 237
 238               vectype = get_vectype_for_scalar_type (scalar_type);
 239               if (!vectype)
 240                 {
 241                   if (dump_enabled_p ())
 242                     {
 243                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 244                                        "not vectorized: unsupported "
 245                                        "data-type ");
 246                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 247                                          scalar_type);
 248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 249                     }
 250                   return false;
 251                 }
 252               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 253
 254               if (dump_enabled_p ())
 255                 {
 256                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 257                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 258                   dump_printf (MSG_NOTE, "\n");
 259                 }
 260
 261               nunits = TYPE_VECTOR_SUBPARTS (vectype);
                   /* NOTE(review): nunits is unsigned int but is printed with
                      %d here.  */
 262               if (dump_enabled_p ())
 263                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 264                                  nunits);
 265
                   /* Keep the maximum element count.  */
 266               if (!vectorization_factor
 267                   || (nunits > vectorization_factor))
 268                 vectorization_factor = nunits;
 269             }
 270         }
 271
           /* Then the statements of BB.  SI is advanced explicitly (in the
              skip/continue paths and at the bottom of the loop body), because
              one statement of BB may take several iterations: the statement
              itself, relevant or live statements from a pattern's def
              sequence, and the pattern statement.  */
 272       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
               /* A pattern stmt left pending by an earlier iteration takes
                  precedence over the stmt at SI.  */
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288               dump_printf (MSG_NOTE, "\n");
 289             }
 290
 291           gcc_assert (stmt_info);
 292
 293           /* Skip stmts which do not need to be vectorized.  */
 294           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 295                && !STMT_VINFO_LIVE_P (stmt_info))
 296               || gimple_clobber_p (stmt))
 297             {
                   /* ... unless the stmt is part of a pattern whose related
                      pattern stmt is relevant or live; then the pattern stmt
                      is analyzed in place of the stmt.  */
 298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 302                 {
 303                   stmt = pattern_stmt;
 304                   stmt_info = vinfo_for_stmt (pattern_stmt);
 305                   if (dump_enabled_p ())
 306                     {
 307                       dump_printf_loc (MSG_NOTE, vect_location,
 308                                        "==> examining pattern statement: ");
 309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 310                       dump_printf (MSG_NOTE, "\n");
 311                     }
 312                 }
 313               else
 314                 {
 315                   if (dump_enabled_p ())
 316                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 317                   gsi_next (&si);
 318                   continue;
 319                 }
 320             }
 321           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 322                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 323                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 324                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                 /* The stmt itself is analyzed now; its pattern stmt is analyzed
                    in a following iteration without advancing SI.  */
 325             analyze_pattern_stmt = true;
 326
 327           /* If a pattern statement has def stmts, analyze them too.  */
 328           if (is_pattern_stmt_p (stmt_info))
 329             {
                   /* On the first visit fetch the def sequence of the pattern
                      stmt; on later visits step past the def stmt that was
                      handled in the previous iteration.  */
 330               if (pattern_def_seq == NULL)
 331                 {
 332                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 333                   pattern_def_si = gsi_start (pattern_def_seq);
 334                 }
 335               else if (!gsi_end_p (pattern_def_si))
 336                 gsi_next (&pattern_def_si);
 337               if (pattern_def_seq != NULL)
 338                 {
 339                   gimple pattern_def_stmt = NULL;
 340                   stmt_vec_info pattern_def_stmt_info = NULL;
 341
                       /* Find the next relevant or live def stmt.  */
 342                   while (!gsi_end_p (pattern_def_si))
 343                     {
 344                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 345                       pattern_def_stmt_info
 346                         = vinfo_for_stmt (pattern_def_stmt);
 347                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 348                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 349                         break;
 350                       gsi_next (&pattern_def_si);
 351                     }
 352
 353                   if (!gsi_end_p (pattern_def_si))
 354                     {
 355                       if (dump_enabled_p ())
 356                         {
 357                           dump_printf_loc (MSG_NOTE, vect_location,
 358                                            "==> examining pattern def stmt: ");
 359                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 360                                             pattern_def_stmt, 0);
 361                           dump_printf (MSG_NOTE, "\n");
 362                         }
 363
                           /* Analyze the def stmt in this iteration.  */
 364                       stmt = pattern_def_stmt;
 365                       stmt_info = pattern_def_stmt_info;
 366                     }
 367                   else
 368                     {
                           /* Def sequence exhausted: the pattern stmt itself is
                              analyzed in this iteration.  */
 369                       pattern_def_si = gsi_none ();
 370                       analyze_pattern_stmt = false;
 371                     }
 372                 }
 373               else
                     /* No def sequence: the pattern stmt itself is analyzed in
                        this iteration.  */
 374                 analyze_pattern_stmt = false;
 375             }
 376
 377           if (gimple_get_lhs (stmt) == NULL_TREE)
 378             {
 379               if (is_gimple_call (stmt))
 380                 {
 381                   /* Ignore calls with no lhs.  These must be calls to
 382                      #pragma omp simd functions, and what vectorization factor
 383                      it really needs can't be determined until
 384                      vectorizable_simd_clone_call.  */
 385                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 386                     {
 387                       pattern_def_seq = NULL;
 388                       gsi_next (&si);
 389                     }
 390                   continue;
 391                 }
 392               if (dump_enabled_p ())
 393                 {
 394                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 395                                    "not vectorized: irregular stmt.");
 396                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 397                                     0);
 398                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 399                 }
 400               return false;
 401             }
 402
               /* A stmt whose expression type already has a vector mode is not
                  handled.  */
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 411                 }
 412               return false;
 413             }
 414
 415           if (STMT_VINFO_VECTYPE (stmt_info))
 416             {
 417               /* The only case when a vectype had been already set is for stmts
 418                  that contain a dataref, or for "pattern-stmts" (stmts
 419                  generated by the vectorizer to represent/replace a certain
 420                  idiom).  */
 421               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 422                           || is_pattern_stmt_p (stmt_info)
 423                           || !gsi_end_p (pattern_def_si));
 424               vectype = STMT_VINFO_VECTYPE (stmt_info);
 425             }
 426           else
 427             {
                   /* Derive the vectype from the type of the lhs and record
                      it on the stmt.  */
 428               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 429               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 430               if (dump_enabled_p ())
 431                 {
 432                   dump_printf_loc (MSG_NOTE, vect_location,
 433                                    "get vectype for scalar type:  ");
 434                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 435                   dump_printf (MSG_NOTE, "\n");
 436                 }
 437               vectype = get_vectype_for_scalar_type (scalar_type);
 438               if (!vectype)
 439                 {
 440                   if (dump_enabled_p ())
 441                     {
 442                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 443                                        "not vectorized: unsupported "
 444                                        "data-type ");
 445                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 446                                          scalar_type);
 447                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 448                     }
 449                   return false;
 450                 }
 451
 452               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 453
 454               if (dump_enabled_p ())
 455                 {
 456                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 457                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 458                   dump_printf (MSG_NOTE, "\n");
 459                 }
 460             }
 461
 462           /* The vectorization factor is according to the smallest
 463              scalar type (or the largest vector size, but we only
 464              support one vector size per loop).  */
 465           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 466                                                        &dummy);
 467           if (dump_enabled_p ())
 468             {
 469               dump_printf_loc (MSG_NOTE, vect_location,
 470                                "get vectype for scalar type:  ");
 471               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 472               dump_printf (MSG_NOTE, "\n");
 473             }
 474           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 475           if (!vf_vectype)
 476             {
 477               if (dump_enabled_p ())
 478                 {
 479                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 480                                    "not vectorized: unsupported data-type ");
 481                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 482                                      scalar_type);
 483                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 484                 }
 485               return false;
 486             }
 487
               /* The stmt's own vector type and the vector type of its smallest
                  scalar type must have the same mode size.  */
 488           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 489                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 490             {
 491               if (dump_enabled_p ())
 492                 {
 493                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 494                                    "not vectorized: different sized vector "
 495                                    "types in statement, ");
 496                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 497                                      vectype);
 498                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 499                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 500                                      vf_vectype);
 501                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 502                 }
 503               return false;
 504             }
 505
 506           if (dump_enabled_p ())
 507             {
 508               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 509               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 510               dump_printf (MSG_NOTE, "\n");
 511             }
 512
 513           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
               /* NOTE(review): nunits is unsigned int but is printed with %d
                  here.  */
 514           if (dump_enabled_p ())
 515             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
               /* Keep the maximum element count.  */
 516           if (!vectorization_factor
 517               || (nunits > vectorization_factor))
 518             vectorization_factor = nunits;
 519
               /* Move on to the next stmt of BB only when no pattern stmt is
                  pending and no pattern def stmt remains to be analyzed.  */
 520           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 521             {
 522               pattern_def_seq = NULL;
 523               gsi_next (&si);
 524             }
 525         }
 526     }
 527
 528   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
       /* NOTE(review): vectorization_factor is unsigned int but is printed with
          %d below.  */
 529   if (dump_enabled_p ())
 530     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 531                      vectorization_factor);
       /* A factor of 0 (nothing analyzed) or 1 is treated as failure.  */
 532   if (vectorization_factor <= 1)
 533     {
 534       if (dump_enabled_p ())
 535         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                          "not vectorized: unsupported data-type\n");
 537       return false;
 538     }
 539   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 540
 541   return true;
 542 }
 543
 544
 545 /* Function vect_is_simple_iv_evolution.
 546
 547    FORNOW: A simple evolution of an induction variable in the loop is
 548    considered a polynomial evolution.

        LOOP_NB is the number of the loop and ACCESS_FN the access function
        whose evolution in that loop is examined.  The (unshared) initial
        value and the step are stored in *INIT and *STEP.

        Return false when ACCESS_FN has no evolution part in loop LOOP_NB, when
        that part is itself a chrec (degree >= 2), or when the step is none of:
          - an INTEGER_CST;
          - an SSA_NAME that is not defined in a block inside the loop and
            whose type is integral, or scalar floating point with
            flag_associative_math set;
          - a REAL_CST with flag_associative_math set.
        In that last failure case *INIT and *STEP have already been written.
        Return true otherwise.  */
 549
 550 static bool
 551 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 552                              tree * step)
 553 {
 554   tree init_expr;
 555   tree step_expr;
 556   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 557   basic_block bb;
 558
 559   /* When there is no evolution in this loop, the evolution function
 560      is not "simple".  */
 561   if (evolution_part == NULL_TREE)
 562     return false;
 563
 564   /* When the evolution is a polynomial of degree >= 2
 565      the evolution function is not "simple".  */
 566   if (tree_is_chrec (evolution_part))
 567     return false;
 568
 569   step_expr = evolution_part;
 570   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 571
 572   if (dump_enabled_p ())
 573     {
 574       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 575       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 576       dump_printf (MSG_NOTE, ",  init: ");
 577       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 578       dump_printf (MSG_NOTE, "\n");
 579     }
 580
       /* The outputs are written before the step is validated below.  */
 581   *init = init_expr;
 582   *step = step_expr;
 583
       /* Reject the step unless it is an integer constant, a suitable SSA_NAME
          not defined inside the loop, or (with -fassociative-math semantics,
          i.e. flag_associative_math) a real constant.  BB is assigned inside
          the condition and is only meaningful there.  */
 584   if (TREE_CODE (step_expr) != INTEGER_CST
 585       && (TREE_CODE (step_expr) != SSA_NAME
 586           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 587               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 588           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 589               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 590                   || !flag_associative_math)))
 591       && (TREE_CODE (step_expr) != REAL_CST
 592           || !flag_associative_math))
 593     {
 594       if (dump_enabled_p ())
 595         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 596                          "step unknown.\n");
 597       return false;
 598     }
 599
 600   return true;
 601 }
 602
 603 /* Function vect_analyze_scalar_cycles_1.
 604
 605    Examine the cross iteration def-use cycles of scalar variables
 606    in LOOP.  LOOP_VINFO represents the loop that is now being
 607    considered for vectorization (can be LOOP, or an outer-loop
 608    enclosing LOOP).

        The non-virtual PHIs in the header of LOOP are classified in two steps
        by setting STMT_VINFO_DEF_TYPE:
        1. PHIs with a simple induction evolution become vect_induction_def;
           when LOOP is not the loop of LOOP_VINFO the step must additionally
           be an INTEGER_CST.  All other PHIs are queued.
        2. Each queued PHI is passed to vect_force_simple_reduction.  If a
           reduction stmt is returned, the PHI and that stmt are both marked
           vect_double_reduction_def, vect_nested_cycle (LOOP is not the loop
           of LOOP_VINFO) or vect_reduction_def; in the last case the stmt is
           also appended to LOOP_VINFO_REDUCTIONS.  Otherwise the PHI keeps
           vect_unknown_def_type.  */
 609
 610 static void
 611 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 612 {
 613   basic_block bb = loop->header;
 614   tree init, step;
       /* PHIs that were not recognized as inductions in the first step.  */
 615   stack_vec<gimple, 64> worklist;
 616   gimple_stmt_iterator gsi;
 617   bool double_reduc;
 618
 619   if (dump_enabled_p ())
 620     dump_printf_loc (MSG_NOTE, vect_location,
 621                      "=== vect_analyze_scalar_cycles ===\n");
 622
 623   /* First - identify all inductions.  Reduction detection assumes that all the
 624      inductions have been identified, therefore, this order must not be
 625      changed.  */
 626   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 627     {
 628       gimple phi = gsi_stmt (gsi);
 629       tree access_fn = NULL;
 630       tree def = PHI_RESULT (phi);
 631       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 632
 633       if (dump_enabled_p ())
 634         {
 635           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 636           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 637           dump_printf (MSG_NOTE, "\n");
 638         }
 639
 640       /* Skip virtual phi's.  The data dependences that are associated with
 641          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 642       if (virtual_operand_p (def))
 643         continue;
 644
           /* Default classification until proven otherwise.  */
 645       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 646
 647       /* Analyze the evolution function.  */
 648       access_fn = analyze_scalar_evolution (loop, def);
 649       if (access_fn)
 650         {
 651           STRIP_NOPS (access_fn);
 652           if (dump_enabled_p ())
 653             {
 654               dump_printf_loc (MSG_NOTE, vect_location,
 655                                "Access function of PHI: ");
 656               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 657               dump_printf (MSG_NOTE, "\n");
 658             }
 659           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 660             = evolution_part_in_loop_num (access_fn, loop->num);
 661         }
 662
           /* Not a simple induction (or, when LOOP is not the loop of
              LOOP_VINFO, its step is not an INTEGER_CST): defer the PHI to the
              second step.  STEP is read only after
              vect_is_simple_iv_evolution succeeded and therefore stored it.  */
 663       if (!access_fn
 664           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 665           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 666               && TREE_CODE (step) != INTEGER_CST))
 667         {
 668           worklist.safe_push (phi);
 669           continue;
 670         }
 671
 672       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 673
 674       if (dump_enabled_p ())
 675         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 676       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 677     }
 678
 679
 680   /* Second - identify all reductions and nested cycles.  */
 681   while (worklist.length () > 0)
 682     {
 683       gimple phi = worklist.pop ();
 684       tree def = PHI_RESULT (phi);
 685       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 686       gimple reduc_stmt;
 687       bool nested_cycle;
 688
 689       if (dump_enabled_p ())
 690         {
 691           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 692           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 693           dump_printf (MSG_NOTE, "\n");
 694         }
 695
           /* Only non-virtual PHIs left unclassified by the first step are
              queued.  */
 696       gcc_assert (!virtual_operand_p (def)
 697                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 698
           /* LOOP is nested in the loop being vectorized when it differs from
              the loop of LOOP_VINFO.  DOUBLE_REDUC is passed by address and is
              read below only when a reduction stmt was returned (presumably
              the callee sets it -- confirm against
              vect_force_simple_reduction).  */
 699       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 700       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 701                                                 &double_reduc);
 702       if (reduc_stmt)
 703         {
 704           if (double_reduc)
 705             {
 706               if (dump_enabled_p ())
 707                 dump_printf_loc (MSG_NOTE, vect_location,
 708                                  "Detected double reduction.\n");
 709
 710               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 711               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 712                                                     vect_double_reduction_def;
 713             }
 714           else
 715             {
 716               if (nested_cycle)
 717                 {
 718                   if (dump_enabled_p ())
 719                     dump_printf_loc (MSG_NOTE, vect_location,
 720                                      "Detected vectorizable nested cycle.\n");
 721
 722                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 723                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 724                                                              vect_nested_cycle;
 725                 }
 726               else
 727                 {
 728                   if (dump_enabled_p ())
 729                     dump_printf_loc (MSG_NOTE, vect_location,
 730                                      "Detected reduction.\n");
 731
 732                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 733                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 734                                                            vect_reduction_def;
 735                   /* Store the reduction cycles for possible vectorization in
 736                      loop-aware SLP.  */
 737                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 738                 }
 739             }
 740         }
 741       else
             /* No reduction stmt found: the PHI keeps vect_unknown_def_type.  */
 742         if (dump_enabled_p ())
 743           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 744                            "Unknown def-use cycle pattern.\n");
 745     }
 746 }
 747
 748
 749 /* Function vect_analyze_scalar_cycles.
 750
 751    Examine the cross iteration def-use cycles of scalar variables, by
 752    analyzing the loop-header PHIs of scalar variables.  Classify each
 753    cycle as one of the following: invariant, induction, reduction, unknown.
 754    We do that for the loop represented by LOOP_VINFO, and also for its
 755    inner-loop (loop->inner), if one exists.  Both analyses are done by
        vect_analyze_scalar_cycles_1 with the same LOOP_VINFO.
 756    Examples for scalar cycles:
 757
 758    Example1: reduction:
 759
 760               loop1:
 761               for (i=0; i<N; i++)
 762                  sum += a[i];
 763
 764    Example2: induction:
 765
 766               loop2:
 767               for (i=0; i<N; i++)
 768                  a[i] = i;  */
 769
 770 static void
 771 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 772 {
 773   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 774
       /* Cycles of the loop that LOOP_VINFO describes.  */
 775   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 776
 777   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 778      Reductions in such inner-loop therefore have different properties than
 779      the reductions in the nest that gets vectorized:
 780      1. When vectorized, they are executed in the same order as in the original
 781         scalar loop, so we can't change the order of computation when
 782         vectorizing them.
 783      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 784         current checks are too strict.  */
 785
       /* Cycles of the inner loop, analyzed against the same LOOP_VINFO so
          that the callee can tell it is not the loop being vectorized.  */
 786   if (loop->inner)
 787     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 788 }
 789
 790
 791 /* Function vect_get_loop_niters.
 792
 793    Compute how many times the header of LOOP is executed and store the
 794    result in *NUMBER_OF_ITERATIONS.  A NULL or undetermined latch count
 795    is stored unchanged.
 796    Return the loop exit condition.  */
 797
 798 static gimple
 799 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 800 {
 801   tree count;
 802
 803   if (dump_enabled_p ())
 804     dump_printf_loc (MSG_NOTE, vect_location, "=== get_loop_niters ===\n");
 805
 806   count = number_of_latch_executions (loop);
 807   /* The header executes once more than the latch.
 808      ???  For UINT_MAX latch executions this number overflows to zero
 809      for loops like do { n++; } while (n != 0);  */
 810   if (count && !chrec_contains_undetermined (count))
 811     {
 812       tree type = TREE_TYPE (count);
 813       count = fold_build2 (PLUS_EXPR, type, count,
 814                            build_int_cst (type, 1));
 815     }
 816   *number_of_iterations = count;
 817   return get_loop_exit_condition (loop);
 818 }
 819
 820
 821 /* Function bb_in_loop_p
 822
 823    Predicate for dfs order traversal: true iff BB is inside the loop DATA.  */
 824
 825 static bool
 826 bb_in_loop_p (const_basic_block bb, const void *data)
 827 {
 828   /* DATA is the opaque loop pointer supplied alongside the predicate.  */
 829   const struct loop *const candidate = (const struct loop *) data;
 830
 831   return flow_bb_inside_loop_p (candidate, bb);
 832 }
 833
 834
 835 /* Function new_loop_vec_info.
 836    Create and initialize a new loop_vec_info struct for LOOP, as well as
 837    stmt_vec_info structs for all the stmts in LOOP.  Stmts in BBs of an
 838    inner loop keep their existing stmt_vec_info, re-pointed at the result.  */
 839
 840 static loop_vec_info
 841 new_loop_vec_info (struct loop *loop)
 842 {
 843   loop_vec_info res;
 844   basic_block *bbs;
 845   gimple_stmt_iterator si;
 846   unsigned int i, nbbs;
 847
 848   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 849   LOOP_VINFO_LOOP (res) = loop;
 850   /* This first BB array is temporary; it is freed after the walk below.  */
 851   bbs = get_loop_body (loop);
 852
 853   /* Create/Update stmt_info for all stmts in the loop.  */
 854   for (i = 0; i < loop->num_nodes; i++)
 855     {
 856       basic_block bb = bbs[i];
 857
 858       /* BBs in a nested inner-loop will have been already processed (because
 859          we will have called vect_analyze_loop_form for any nested inner-loop).
 860          Therefore, for stmts in an inner-loop we just want to update the
 861          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 862          loop_info of the outer-loop we are currently considering to vectorize
 863          (instead of the loop_info of the inner-loop).
 864          For stmts in other BBs we need to create a stmt_info from scratch.  */
 865       if (bb->loop_father != loop)
 866         {
 867           /* Inner-loop bb: re-point the stmt_infos of its phis and stmts.  */
 868           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 869           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 870             {
 871               gimple phi = gsi_stmt (si);
 872               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 873               loop_vec_info inner_loop_vinfo =
 874                 STMT_VINFO_LOOP_VINFO (stmt_info);
 875               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 876               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 877             }
 878           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 879            {
 880               gimple stmt = gsi_stmt (si);
 881               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 882               loop_vec_info inner_loop_vinfo =
 883                  STMT_VINFO_LOOP_VINFO (stmt_info);
 884               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 885               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 886            }
 887         }
 888       else
 889         {
 890           /* bb in current nest: reset uids, attach fresh stmt_infos.  */
 891           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 892             {
 893               gimple phi = gsi_stmt (si);
 894               gimple_set_uid (phi, 0);
 895               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 896             }
 897
 898           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 899             {
 900               gimple stmt = gsi_stmt (si);
 901               gimple_set_uid (stmt, 0);
 902               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 903             }
 904         }
 905     }
 906
 907   /* CHECKME: We want to visit all BBs before their successors (except for
 908      latch blocks, for which this assertion wouldn't hold).  In the simple
 909      case of the loop forms we allow, a dfs order of the BBs would be the
 910      same as reversed postorder traversal, so we are safe.  */
 911   /* Replace the temporary array by one enumerated in dfs order.  */
 912    free (bbs);
 913    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 914    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 915                               bbs, loop->num_nodes, loop);
 916    gcc_assert (nbbs == loop->num_nodes);
 917   /* The dfs-ordered array is kept in RES; set up the remaining fields.  */
 918   LOOP_VINFO_BBS (res) = bbs;
 919   LOOP_VINFO_NITERS (res) = NULL;
 920   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 921   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 922   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 923   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 924   LOOP_VINFO_VECT_FACTOR (res) = 0;
 925   LOOP_VINFO_LOOP_NEST (res).create (3);
 926   LOOP_VINFO_DATAREFS (res).create (10);
 927   LOOP_VINFO_DDRS (res).create (10 * 10);
 928   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 929   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 930              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 931   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 932              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 933   LOOP_VINFO_GROUPED_STORES (res).create (10);
 934   LOOP_VINFO_REDUCTIONS (res).create (10);
 935   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 936   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 937   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 938   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 939   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 940   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 941   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 942
 943   return res;
 944 }
 945
 946
 947 /* Function destroy_loop_vec_info.
 948    Free LOOP_VINFO (a NULL LOOP_VINFO is ignored) and clear loop->aux.  If
 949    CLEAN_STMTS, also free the stmt_vec_info structs of all the stmts in the
 950    loop and undo operand swaps that broke canonical form.  */
 951
 952 void
 953 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 954 {
 955   struct loop *loop;
 956   basic_block *bbs;
 957   int nbbs;
 958   gimple_stmt_iterator si;
 959   int j;
 960   vec<slp_instance> slp_instances;
 961   slp_instance instance;
 962   bool swapped;
 963
 964   if (!loop_vinfo)
 965     return;
 966
 967   loop = LOOP_VINFO_LOOP (loop_vinfo);
 968
 969   bbs = LOOP_VINFO_BBS (loop_vinfo);
 970   nbbs = clean_stmts ? loop->num_nodes : 0;
 971   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 972   /* NBBS is 0 when !CLEAN_STMTS, so the walk below then visits nothing.  */
 973   for (j = 0; j < nbbs; j++)
 974     {
 975       basic_block bb = bbs[j];
 976       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 977         free_stmt_vec_info (gsi_stmt (si));
 978
 979       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 980         {
 981           gimple stmt = gsi_stmt (si);
 982
 983           /* We may have broken canonical form by moving a constant
 984              into RHS1 of a commutative op.  Fix such occurrences.  */
 985           if (swapped && is_gimple_assign (stmt))
 986             {
 987               enum tree_code code = gimple_assign_rhs_code (stmt);
 988
 989               if ((code == PLUS_EXPR
 990                    || code == POINTER_PLUS_EXPR
 991                    || code == MULT_EXPR)
 992                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 993                 swap_ssa_operands (stmt,
 994                                    gimple_assign_rhs1_ptr (stmt),
 995                                    gimple_assign_rhs2_ptr (stmt));
 996             }
 997
 998           /* Free stmt_vec_info.  */
 999           free_stmt_vec_info (stmt);
1000           gsi_next (&si);
1001         }
1002     }
1003   /* Release the data held by LOOP_VINFO, then the struct itself.  */
1004   free (LOOP_VINFO_BBS (loop_vinfo));
1005   vect_destroy_datarefs (loop_vinfo, NULL);
1006   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1007   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1008   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1009   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1010   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1011   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1012     vect_free_slp_instance (instance);
1013
1014   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1015   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1016   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1017   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1018
1019   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1020     LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
1021
1022   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1023   /* LOOP was read from LOOP_VINFO above, before the struct is freed.  */
1024   free (loop_vinfo);
1025   loop->aux = NULL;
1026 }
1027
1028
1029 /* Function vect_analyze_loop_1.
1030
1031    Restricted variant of the analyses done by vect_analyze_loop, meant
1032    for an inner-loop nested in the loop that is now considered for
1033    (outer-loop) vectorization.  Only the loop form is checked, through
1034    vect_analyze_loop_form.  Return the loop_vec_info created by that
1035    check, or NULL when the form of LOOP is rejected.  */
1036
1037 static loop_vec_info
1038 vect_analyze_loop_1 (struct loop *loop)
1039 {
1040   loop_vec_info inner_vinfo;
1041
1042   if (dump_enabled_p ())
1043     dump_printf_loc (MSG_NOTE, vect_location,
1044                      "===== analyze_loop_nest_1 =====\n");
1045
1046   /* Check the CFG characteristics of the loop (nesting, entry/exit,
1047      etc.).  */
1048   inner_vinfo = vect_analyze_loop_form (loop);
1049   if (inner_vinfo)
1050     return inner_vinfo;
1051
1052   /* The form was rejected; report it when dumping is enabled.  */
1053   if (dump_enabled_p ())
1054     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                      "bad inner-loop form.\n");
1056
1057   return NULL;
1058 }
1059
1060
1061 /* Function vect_analyze_loop_form.
1062    Verify that certain CFG restrictions hold, including:
1063    - the loop has a pre-header
1064    - the loop has a single entry and exit
1065    - the loop exit condition is simple enough, and the number of iterations
1066      can be analyzed (a countable loop).
1067    Return a new loop_vec_info (also set as loop->aux) or NULL on failure.  */
1068
1069 loop_vec_info
1070 vect_analyze_loop_form (struct loop *loop)
1071 {
1072   loop_vec_info loop_vinfo;
1073   gimple loop_cond;
1074   tree number_of_iterations = NULL;
1075   loop_vec_info inner_loop_vinfo = NULL;
1076
1077   if (dump_enabled_p ())
1078     dump_printf_loc (MSG_NOTE, vect_location,
1079                      "=== vect_analyze_loop_form ===\n");
1080
1081   /* Different restrictions apply when we are considering an inner-most loop,
1082      vs. an outer (nested) loop.
1083      (FORNOW. May want to relax some of these restrictions in the future).  */
1084
1085   if (!loop->inner)
1086     {
1087       /* Inner-most loop.  We currently require that the number of BBs is
1088          exactly 2 (the header and latch).  Vectorizable inner-most loops
1089          look like this:
1090
1091                         (pre-header)
1092                            |
1093                           header <--------+
1094                            | |            |
1095                            | +--> latch --+
1096                            |
1097                         (exit-bb)  */
1098
1099       if (loop->num_nodes != 2)
1100         {
1101           if (dump_enabled_p ())
1102             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103                              "not vectorized: control flow in loop.\n");
1104           return NULL;
1105         }
1106
1107       if (empty_block_p (loop->header))
1108         {
1109           if (dump_enabled_p ())
1110             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1111                              "not vectorized: empty loop.\n");
1112           return NULL;
1113         }
1114     }
1115   else
1116     {
1117       struct loop *innerloop = loop->inner;
1118       edge entryedge;
1119
1120       /* Nested loop. We currently require that the loop is doubly-nested,
1121          contains a single inner loop, and the number of BBs is exactly 5.
1122          Vectorizable outer-loops look like this:
1123
1124                         (pre-header)
1125                            |
1126                           header <---+
1127                            |         |
1128                           inner-loop |
1129                            |         |
1130                           tail ------+
1131                            |
1132                         (exit-bb)
1133
1134          The inner-loop has the properties expected of inner-most loops
1135          as described above.  */
1136
1137       if ((loop->inner)->inner || (loop->inner)->next)
1138         {
1139           if (dump_enabled_p ())
1140             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141                              "not vectorized: multiple nested loops.\n");
1142           return NULL;
1143         }
1144
1145       /* Analyze the inner-loop.  */
1146       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1147       if (!inner_loop_vinfo)
1148         {
1149           if (dump_enabled_p ())
1150             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151                              "not vectorized: Bad inner loop.\n");
1152           return NULL;
1153         }
1154       /* The inner-loop iteration count must be invariant in LOOP.  */
1155       if (!expr_invariant_in_loop_p (loop,
1156                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1157         {
1158           if (dump_enabled_p ())
1159             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1160                              "not vectorized: inner-loop count not"
1161                              " invariant.\n");
1162           destroy_loop_vec_info (inner_loop_vinfo, true);
1163           return NULL;
1164         }
1165
1166       if (loop->num_nodes != 5)
1167         {
1168           if (dump_enabled_p ())
1169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170                              "not vectorized: control flow in loop.\n");
1171           destroy_loop_vec_info (inner_loop_vinfo, true);
1172           return NULL;
1173         }
1174       /* The inner-loop entry edge is the header pred not from its latch.  */
1175       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1176       entryedge = EDGE_PRED (innerloop->header, 0);
1177       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1178         entryedge = EDGE_PRED (innerloop->header, 1);
1179       /* Entered from LOOP's header; exits to the pred of LOOP's latch.  */
1180       if (entryedge->src != loop->header
1181           || !single_exit (innerloop)
1182           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1183         {
1184           if (dump_enabled_p ())
1185             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186                              "not vectorized: unsupported outerloop form.\n");
1187           destroy_loop_vec_info (inner_loop_vinfo, true);
1188           return NULL;
1189         }
1190
1191       if (dump_enabled_p ())
1192         dump_printf_loc (MSG_NOTE, vect_location,
1193                          "Considering outer-loop vectorization.\n");
1194     }
1195   /* The remaining checks apply to inner-most and outer loops alike.  */
1196   if (!single_exit (loop)
1197       || EDGE_COUNT (loop->header->preds) != 2)
1198     {
1199       if (dump_enabled_p ())
1200         {
1201           if (!single_exit (loop))
1202             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1203                              "not vectorized: multiple exits.\n");
1204           else if (EDGE_COUNT (loop->header->preds) != 2)
1205             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206                              "not vectorized: too many incoming edges.\n");
1207         }
1208       if (inner_loop_vinfo)
1209         destroy_loop_vec_info (inner_loop_vinfo, true);
1210       return NULL;
1211     }
1212
1213   /* We assume that the loop exit condition is at the end of the loop. i.e,
1214      that the loop is represented as a do-while (with a proper if-guard
1215      before the loop if needed), where the loop header contains all the
1216      executable statements, and the latch is empty.  */
1217   if (!empty_block_p (loop->latch)
1218       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1219     {
1220       if (dump_enabled_p ())
1221         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222                          "not vectorized: latch block not empty.\n");
1223       if (inner_loop_vinfo)
1224         destroy_loop_vec_info (inner_loop_vinfo, true);
1225       return NULL;
1226     }
1227   /* Make sure there exists a single-predecessor exit bb, splitting the
1228      exit edge when needed; give up if that edge is abnormal.  */
1229   if (!single_pred_p (single_exit (loop)->dest))
1230     {
1231       edge e = single_exit (loop);
1232       if (!(e->flags & EDGE_ABNORMAL))
1233         {
1234           split_loop_exit_edge (e);
1235           if (dump_enabled_p ())
1236             dump_printf (MSG_NOTE, "split exit edge.\n");
1237         }
1238       else
1239         {
1240           if (dump_enabled_p ())
1241             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242                              "not vectorized: abnormal loop exit edge.\n");
1243           if (inner_loop_vinfo)
1244             destroy_loop_vec_info (inner_loop_vinfo, true);
1245           return NULL;
1246         }
1247     }
1248   /* Get the iteration count and the loop exit condition.  */
1249   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1250   if (!loop_cond)
1251     {
1252       if (dump_enabled_p ())
1253         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254                          "not vectorized: complicated exit condition.\n");
1255       if (inner_loop_vinfo)
1256         destroy_loop_vec_info (inner_loop_vinfo, true);
1257       return NULL;
1258     }
1259
1260   if (!number_of_iterations
1261       || chrec_contains_undetermined (number_of_iterations))
1262     {
1263       if (dump_enabled_p ())
1264         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1265                          "not vectorized: number of iterations cannot be "
1266                          "computed.\n");
1267       if (inner_loop_vinfo)
1268         destroy_loop_vec_info (inner_loop_vinfo, true);
1269       return NULL;
1270     }
1271
1272   if (integer_zerop (number_of_iterations))
1273     {
1274       if (dump_enabled_p ())
1275         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276                          "not vectorized: number of iterations = 0.\n");
1277       if (inner_loop_vinfo)
1278         destroy_loop_vec_info (inner_loop_vinfo, true);
1279       return NULL;
1280     }
1281   /* All checks passed: create the loop_vec_info and record the count.  */
1282   loop_vinfo = new_loop_vec_info (loop);
1283   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1284   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1285
1286   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1287     {
1288       if (dump_enabled_p ())
1289         {
1290           dump_printf_loc (MSG_NOTE, vect_location,
1291                            "Symbolic number of iterations is ");
1292           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1293           dump_printf (MSG_NOTE, "\n");
1294         }
1295     }
1296   /* Tag the exit condition stmt as loop-exit control.  */
1297   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1298   /* The inner-loop stmt_infos now point at LOOP_VINFO, so keep them.
1299      CHECKME: May want to keep inner_loop_vinfo around in the future.  */
1300   if (inner_loop_vinfo)
1301     destroy_loop_vec_info (inner_loop_vinfo, false);
1302
1303   gcc_assert (!loop->aux);
1304   loop->aux = loop_vinfo;
1305   return loop_vinfo;
1306 }
1307
1308
1309 /* Function vect_analyze_loop_operations.
1310
1311    Scan the loop stmts and make sure they are all vectorizable.  */
1312
1313 static bool
1314 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1315 {
1316   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1317   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1318   int nbbs = loop->num_nodes;
1319   gimple_stmt_iterator si;
1320   unsigned int vectorization_factor = 0;
1321   int i;
1322   gimple phi;
1323   stmt_vec_info stmt_info;
1324   bool need_to_vectorize = false;
1325   int min_profitable_iters;
1326   int min_scalar_loop_bound;
1327   unsigned int th;
1328   bool only_slp_in_loop = true, ok;
1329   HOST_WIDE_INT max_niter;
1330   HOST_WIDE_INT estimated_niter;
1331   int min_profitable_estimate;
1332
1333   if (dump_enabled_p ())
1334     dump_printf_loc (MSG_NOTE, vect_location,
1335                      "=== vect_analyze_loop_operations ===\n");
1336
1337   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1338   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1339   if (slp)
1340     {
1341       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1342          vectorization factor of the loop is the unrolling factor required by
1343          the SLP instances.  If that unrolling factor is 1, we say, that we
1344          perform pure SLP on loop - cross iteration parallelism is not
1345          exploited.  */
1346       for (i = 0; i < nbbs; i++)
1347         {
1348           basic_block bb = bbs[i];
1349           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1350             {
1351               gimple stmt = gsi_stmt (si);
1352               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1353               gcc_assert (stmt_info);
1354               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1355                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1356                   && !PURE_SLP_STMT (stmt_info))
1357                 /* STMT needs both SLP and loop-based vectorization.  */
1358                 only_slp_in_loop = false;
1359             }
1360         }
1361
1362       if (only_slp_in_loop)
1363         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1364       else
1365         vectorization_factor = least_common_multiple (vectorization_factor,
1366                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1367
1368       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1369       if (dump_enabled_p ())
1370         dump_printf_loc (MSG_NOTE, vect_location,
1371                          "Updating vectorization factor to %d\n",
1372                          vectorization_factor);
1373     }
1374
1375   for (i = 0; i < nbbs; i++)
1376     {
1377       basic_block bb = bbs[i];
1378
1379       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1380         {
1381           phi = gsi_stmt (si);
1382           ok = true;
1383
1384           stmt_info = vinfo_for_stmt (phi);
1385           if (dump_enabled_p ())
1386             {
1387               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1388               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1389               dump_printf (MSG_NOTE, "\n");
1390             }
1391
1392           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1393              (i.e., a phi in the tail of the outer-loop).  */
1394           if (! is_loop_header_bb_p (bb))
1395             {
1396               /* FORNOW: we currently don't support the case that these phis
1397                  are not used in the outerloop (unless it is double reduction,
1398                  i.e., this phi is vect_reduction_def), cause this case
1399                  requires to actually do something here.  */
1400               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1401                    || STMT_VINFO_LIVE_P (stmt_info))
1402                   && STMT_VINFO_DEF_TYPE (stmt_info)
1403                      != vect_double_reduction_def)
1404                 {
1405                   if (dump_enabled_p ())
1406                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407                                      "Unsupported loop-closed phi in "
1408                                      "outer-loop.\n");
1409                   return false;
1410                 }
1411
1412               /* If PHI is used in the outer loop, we check that its operand
1413                  is defined in the inner loop.  */
1414               if (STMT_VINFO_RELEVANT_P (stmt_info))
1415                 {
1416                   tree phi_op;
1417                   gimple op_def_stmt;
1418
1419                   if (gimple_phi_num_args (phi) != 1)
1420                     return false;
1421
1422                   phi_op = PHI_ARG_DEF (phi, 0);
1423                   if (TREE_CODE (phi_op) != SSA_NAME)
1424                     return false;
1425
1426                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1427                   if (gimple_nop_p (op_def_stmt)
1428                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1429                       || !vinfo_for_stmt (op_def_stmt))
1430                     return false;
1431
1432                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1433                         != vect_used_in_outer
1434                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1435                            != vect_used_in_outer_by_reduction)
1436                     return false;
1437                 }
1438
1439               continue;
1440             }
1441
1442           gcc_assert (stmt_info);
1443
1444           if (STMT_VINFO_LIVE_P (stmt_info))
1445             {
1446               /* FORNOW: not yet supported.  */
1447               if (dump_enabled_p ())
1448                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449                                  "not vectorized: value used after loop.\n");
1450               return false;
1451             }
1452
1453           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1454               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1455             {
1456               /* A scalar-dependence cycle that we don't support.  */
1457               if (dump_enabled_p ())
1458                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                                  "not vectorized: scalar dependence cycle.\n");
1460               return false;
1461             }
1462
1463           if (STMT_VINFO_RELEVANT_P (stmt_info))
1464             {
1465               need_to_vectorize = true;
1466               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1467                 ok = vectorizable_induction (phi, NULL, NULL);
1468             }
1469
1470           if (!ok)
1471             {
1472               if (dump_enabled_p ())
1473                 {
1474                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1475                                    "not vectorized: relevant phi not "
1476                                    "supported: ");
1477                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1478                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1479                 }
1480               return false;
1481             }
1482         }
1483
1484       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1485         {
1486           gimple stmt = gsi_stmt (si);
1487           if (!gimple_clobber_p (stmt)
1488               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1489             return false;
1490         }
1491     } /* bbs */
1492
1493   /* All operations in the loop are either irrelevant (deal with loop
1494      control, or dead), or only used outside the loop and can be moved
1495      out of the loop (e.g. invariants, inductions).  The loop can be
1496      optimized away by scalar optimizations.  We're better off not
1497      touching this loop.  */
1498   if (!need_to_vectorize)
1499     {
1500       if (dump_enabled_p ())
1501         dump_printf_loc (MSG_NOTE, vect_location,
1502                          "All the computation can be taken out of the loop.\n");
1503       if (dump_enabled_p ())
1504         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505                          "not vectorized: redundant loop. no profit to "
1506                          "vectorize.\n");
1507       return false;
1508     }
1509
1510   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1511     dump_printf_loc (MSG_NOTE, vect_location,
1512                      "vectorization_factor = %d, niters = "
1513                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1514                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1515
1516   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1517        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1518       || ((max_niter = max_stmt_executions_int (loop)) != -1
1519           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1520     {
1521       if (dump_enabled_p ())
1522         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1523                          "not vectorized: iteration count too small.\n");
1524       if (dump_enabled_p ())
1525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1526                          "not vectorized: iteration count smaller than "
1527                          "vectorization factor.\n");
1528       return false;
1529     }
1530
1531   /* Analyze cost.  Decide if worth while to vectorize.  */
1532
1533   /* Once VF is set, SLP costs should be updated since the number of created
1534      vector stmts depends on VF.  */
1535   vect_update_slp_costs_according_to_vf (loop_vinfo);
1536
1537   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1538                                       &min_profitable_estimate);
1539   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1540
1541   if (min_profitable_iters < 0)
1542     {
1543       if (dump_enabled_p ())
1544         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545                          "not vectorized: vectorization not profitable.\n");
1546       if (dump_enabled_p ())
1547         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1548                          "not vectorized: vector version will never be "
1549                          "profitable.\n");
1550       return false;
1551     }
1552
1553   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1554                             * vectorization_factor) - 1);
1555
1556
1557   /* Use the cost model only if it is more conservative than user specified
1558      threshold.  */
1559
1560   th = (unsigned) min_scalar_loop_bound;
1561   if (min_profitable_iters
1562       && (!min_scalar_loop_bound
1563           || min_profitable_iters > min_scalar_loop_bound))
1564     th = (unsigned) min_profitable_iters;
1565
1566   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1567       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1568     {
1569       if (dump_enabled_p ())
1570         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571                          "not vectorized: vectorization not profitable.\n");
1572       if (dump_enabled_p ())
1573         dump_printf_loc (MSG_NOTE, vect_location,
1574                          "not vectorized: iteration count smaller than user "
1575                          "specified loop bound parameter or minimum profitable "
1576                          "iterations (whichever is more conservative).\n");
1577       return false;
1578     }
1579
1580   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1581       && ((unsigned HOST_WIDE_INT) estimated_niter
1582           <= MAX (th, (unsigned)min_profitable_estimate)))
1583     {
1584       if (dump_enabled_p ())
1585         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1586                          "not vectorized: estimated iteration count too "
1587                          "small.\n");
1588       if (dump_enabled_p ())
1589         dump_printf_loc (MSG_NOTE, vect_location,
1590                          "not vectorized: estimated iteration count smaller "
1591                          "than specified loop bound parameter or minimum "
1592                          "profitable iterations (whichever is more "
1593                          "conservative).\n");
1594       return false;
1595     }
1596
1597   return true;
1598 }
1599
1600
1601 /* Function vect_analyze_loop_2.
1602
1603    Run the data-reference, dependence, vectorization-factor, alignment,
1604    SLP, operation and epilogue analyses on LOOP_VINFO, which records their
1605    results.  Return false as soon as one of them fails, true otherwise.  */
1606 static bool
1607 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1608 {
1609   bool ok, slp = false;
1610   int max_vf = MAX_VECTORIZATION_FACTOR;  /* Handed to dependence analysis.  */
1611   int min_vf = 2;  /* Handed to data-ref analysis.  */
1612
1613   /* Find all data references in the loop (which correspond to vdefs/vuses)
1614      and analyze their evolution in the loop.  Also adjust the minimal
1615      vectorization factor according to the loads and stores.
1616
1617      FORNOW: Handle only simple, array references, which
1618      alignment can be forced, and aligned pointer-references.  */
1619
1620   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1621   if (!ok)
1622     {
1623       if (dump_enabled_p ())
1624         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1625                          "bad data references.\n");
1626       return false;
1627     }
1628
1629   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1630      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1631
1632   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1633   if (!ok)
1634     {
1635       if (dump_enabled_p ())
1636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637                          "bad data access.\n");
1638       return false;
1639     }
1640
1641   /* Classify all cross-iteration scalar data-flow cycles.
1642      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1643
1644   vect_analyze_scalar_cycles (loop_vinfo);
1645
1646   vect_pattern_recog (loop_vinfo, NULL);
1647
1648   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1649
1650   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1651   if (!ok)
1652     {
1653       if (dump_enabled_p ())
1654         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655                          "unexpected pattern.\n");
1656       return false;
1657     }
1658
1659   /* Analyze data dependences between the data-refs in the loop
1660      and adjust the maximum vectorization factor according to
1661      the dependences.
1662      FORNOW: fail at the first data dependence that we encounter.  */
1663
1664   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1665   if (!ok
1666       || max_vf < min_vf)  /* No VF can satisfy both bounds.  */
1667     {
1668       if (dump_enabled_p ())
1669             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670                              "bad data dependence.\n");
1671       return false;
1672     }
1673
1674   ok = vect_determine_vectorization_factor (loop_vinfo);
1675   if (!ok)
1676     {
1677       if (dump_enabled_p ())
1678         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679                          "can't determine vectorization factor.\n");
1680       return false;
1681     }
1682   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))  /* VF exceeds max_vf.  */
1683     {
1684       if (dump_enabled_p ())
1685         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                          "bad data dependence.\n");
1687       return false;
1688     }
1689
1690   /* Analyze the alignment of the data-refs in the loop.
1691      Fail if a data reference is found that cannot be vectorized.  */
1692
1693   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1694   if (!ok)
1695     {
1696       if (dump_enabled_p ())
1697         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698                          "bad data alignment.\n");
1699       return false;
1700     }
1701
1702   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1703      It is important to call pruning after vect_analyze_data_ref_accesses,
1704      since we use grouping information gathered by interleaving analysis.  */
1705   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1706   if (!ok)
1707     {
1708       if (dump_enabled_p ())
1709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                          "too long list of versioning for alias "
1711                          "run-time tests.\n");
1712       return false;
1713     }
1714
1715   /* This pass will decide on using loop versioning and/or loop peeling in
1716      order to enhance the alignment of data references in the loop.  */
1717
1718   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1719   if (!ok)
1720     {
1721       if (dump_enabled_p ())
1722         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1723                          "bad data alignment.\n");
1724       return false;
1725     }
1726
1727   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1728   ok = vect_analyze_slp (loop_vinfo, NULL);
1729   if (ok)
1730     {
1731       /* Decide which possible SLP instances to SLP.  */
1732       slp = vect_make_slp_decision (loop_vinfo);
1733
1734       /* Find stmts that need to be both vectorized and SLPed.  */
1735       vect_detect_hybrid_slp (loop_vinfo);
1736     }
1737   else
1738     return false;  /* NOTE: the only failure path that dumps nothing here.  */
1739
1740   /* Scan all the operations in the loop and make sure they are
1741      vectorizable.  */
1742
1743   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1744   if (!ok)
1745     {
1746       if (dump_enabled_p ())
1747         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1748                          "bad operation or unsupported loop bound.\n");
1749       return false;
1750     }
1751
1752   /* Decide whether we need an epilogue loop for remaining scalar iterations;
1753      the ctz tests presumably check divisibility by a power-of-two VF.  */
1754   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1755       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1756     {
1757       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1758                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1759           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1760         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1761     }
1762   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1763            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1764                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
1765     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1766
1767   /* If an epilogue loop is required make sure we can create one.  */
1768   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1769       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1770     {
1771       if (dump_enabled_p ())
1772         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1773       if (!vect_can_advance_ivs_p (loop_vinfo)
1774           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1775                                            single_exit (LOOP_VINFO_LOOP
1776                                                          (loop_vinfo))))
1777         {
1778           if (dump_enabled_p ())
1779             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780                              "not vectorized: can't create required "
1781                              "epilog loop\n");
1782           return false;
1783         }
1784     }
1785
1786   return true;
1787 }
1788
1789 /* Function vect_analyze_loop.
1790
1791    Build a loop_vec_info for LOOP and run the vectorizer analyses on it,
1792    retrying with each remaining target vector size when analysis fails.
1793    Return the analyzed loop_vec_info, or NULL if LOOP is rejected.  */
1794 loop_vec_info
1795 vect_analyze_loop (struct loop *loop)
1796 {
1797   unsigned int sizes_left;
1798   struct loop *outer;
1799   loop_vec_info vinfo;
1800
1801   /* Size 0 requests autodetection of the first vector size we try.  */
1802   current_vector_size = 0;
1803   sizes_left = targetm.vectorize.autovectorize_vector_sizes ();
1804
1805   if (dump_enabled_p ())
1806     dump_printf_loc (MSG_NOTE, vect_location,
1807                      "===== analyze_loop_nest =====\n");
1808
1809   /* Nothing to do for a loop nested in an already vectorizable loop.  */
1810   outer = loop_outer (loop);
1811   if (outer != NULL
1812       && loop_vec_info_for_loop (outer) != NULL
1813       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (outer)))
1814     {
1815       if (dump_enabled_p ())
1816         dump_printf_loc (MSG_NOTE, vect_location,
1817                          "outer-loop already vectorized.\n");
1818       return NULL;
1819     }
1820
1821   for (;;)
1822     {
1823       /* The CFG shape of the loop (nesting, entry/exit) comes first.  */
1824       vinfo = vect_analyze_loop_form (loop);
1825       if (vinfo == NULL)
1826         {
1827           if (dump_enabled_p ())
1828             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1829                              "bad loop form.\n");
1830           return NULL;
1831         }
1832
1833       if (vect_analyze_loop_2 (vinfo))
1834         break;
1835
1836       /* Retire the size just tried and move on to the next biggest one.  */
1837       destroy_loop_vec_info (vinfo, true);
1838       sizes_left &= ~current_vector_size;
1839       if (current_vector_size == 0 || sizes_left == 0)
1840         return NULL;
1841
1842       current_vector_size = 1 << floor_log2 (sizes_left);
1843       if (dump_enabled_p ())
1844         dump_printf_loc (MSG_NOTE, vect_location,
1845                          "***** Re-trying analysis with vector size %d\n",
1846                          current_vector_size);
1847     }
1848
1849   LOOP_VINFO_VECTORIZABLE_P (vinfo) = 1;
1850   return vinfo;
1851 }
1852
1853
1854 /* Function reduction_code_for_scalar_code
1855
1856    Input:
1857    CODE - tree_code of a scalar reduction operation.
1858
1859    Output:
1860    REDUC_CODE - the tree-code used to reduce the vector of partial results
1861       into a single scalar result (which will also reside in a vector), or
1862       ERROR_MARK if CODE is a supported reduction that has no such tree-code.
1863       Not written when CODE is unsupported.
1864
1865    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1866
1867 static bool
1868 reduction_code_for_scalar_code (enum tree_code code,
1869                                 enum tree_code *reduc_code)
1870 {
1871   enum tree_code vec_reduc;
1872
1873   switch (code)
1874     {
1875     case MAX_EXPR:
1876       vec_reduc = REDUC_MAX_EXPR;
1877       break;
1878     case MIN_EXPR:
1879       vec_reduc = REDUC_MIN_EXPR;
1880       break;
1881     case PLUS_EXPR:
1882       vec_reduc = REDUC_PLUS_EXPR;
1883       break;
1884     case MULT_EXPR:
1885     case MINUS_EXPR:
1886     case BIT_IOR_EXPR:
1887     case BIT_XOR_EXPR:
1888     case BIT_AND_EXPR:
1889       vec_reduc = ERROR_MARK;
1890       break;
1891     default:
1892       return false;
1893     }
1894   *reduc_code = vec_reduc;
1895   return true;
1896 }
1897
1898
1899 /* Error reporting helper for vect_is_simple_reduction below.  Dump MSG with
1900    vect_location, then GIMPLE statement STMT and a newline, as kind MSG_TYPE.  */
1901
1902 static void
1903 report_vect_op (int msg_type, gimple stmt, const char *msg)
1904 {
1905   dump_printf_loc (msg_type, vect_location, "%s", msg);  /* MSG is not a format.  */
1906   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1907   dump_printf (msg_type, "\n");
1908 }
1909
1910
1911 /* Detect SLP reduction of the form:
1912
1913    #a1 = phi <a5, a0>
1914    a2 = operation (a1)
1915    a3 = operation (a2)
1916    a4 = operation (a3)
1917    a5 = operation (a4)
1918
1919    #a = phi <a5>
1920
1921    PHI is the reduction phi node (#a1 = phi <a5, a0> above) and FIRST_STMT
1922    is the first reduction stmt in the chain (a2 = operation (a1)).  Stmt
1923    operands may be swapped in place, even when detection fails later on.
1924    Return TRUE if a reduction chain of at least two stmts was detected; the
1925    chain is then pushed onto LOOP_VINFO_REDUCTION_CHAINS (LOOP_INFO).  */
1926
1927 static bool
1928 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1929 {
1930   struct loop *loop = (gimple_bb (phi))->loop_father;
1931   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1932   enum tree_code code;
1933   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1934   stmt_vec_info use_stmt_info, current_stmt_info;
1935   tree lhs;  /* Value whose uses are followed next.  */
1936   imm_use_iterator imm_iter;
1937   use_operand_p use_p;
1938   int nloop_uses, size = 0, n_out_of_loop_uses;  /* SIZE: stmts in chain.  */
1939   bool found = false;  /* Set once a use of LHS is PHI itself.  */
1940
1941   if (loop != vect_loop)  /* PHI's loop must be LOOP_INFO's loop.  */
1942     return false;
1943
1944   lhs = PHI_RESULT (phi);
1945   code = gimple_assign_rhs_code (first_stmt);  /* Required of every chain stmt.  */
1946   while (1)  /* Follow the single in-loop use of LHS until back at PHI.  */
1947     {
1948       nloop_uses = 0;
1949       n_out_of_loop_uses = 0;
1950       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1951         {
1952           gimple use_stmt = USE_STMT (use_p);
1953           if (is_gimple_debug (use_stmt))
1954             continue;
1955
1956           use_stmt = USE_STMT (use_p);  /* Redundant: same value as above.  */
1957
1958           /* Check if we got back to the reduction phi.  */
1959           if (use_stmt == phi)
1960             {
1961               loop_use_stmt = use_stmt;
1962               found = true;
1963               break;
1964             }
1965
1966           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1967             {
1968               if (vinfo_for_stmt (use_stmt)
1969                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1970                 {
1971                   loop_use_stmt = use_stmt;
1972                   nloop_uses++;
1973                 }
1974             }
1975            else
1976              n_out_of_loop_uses++;
1977
1978            /* There can be either a single use in the loop or two uses in
1979               phi nodes.  */
1980            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1981              return false;
1982         }
1983
1984       if (found)
1985         break;
1986
1987       /* We reached a statement with no loop uses.  */
1988       if (nloop_uses == 0)
1989         return false;
1990
1991       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1992       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1993         return false;
1994
1995       if (!is_gimple_assign (loop_use_stmt)
1996           || code != gimple_assign_rhs_code (loop_use_stmt)
1997           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1998         return false;
1999
2000       /* Insert LOOP_USE_STMT into reduction chain; not undone on failure.  */
2001       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2002       if (current_stmt)
2003         {
2004           current_stmt_info = vinfo_for_stmt (current_stmt);
2005           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2006           GROUP_FIRST_ELEMENT (use_stmt_info)
2007             = GROUP_FIRST_ELEMENT (current_stmt_info);
2008         }
2009       else
2010         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2011
2012       lhs = gimple_assign_lhs (loop_use_stmt);
2013       current_stmt = loop_use_stmt;
2014       size++;
2015    }
2016
2017   if (!found || loop_use_stmt != phi || size < 2)  /* Need >= 2 chain stmts.  */
2018     return false;
2019
2020   /* Swap the operands, if needed, to make the reduction operand be the second
2021      operand.  */
2022   lhs = PHI_RESULT (phi);
2023   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2024   while (next_stmt)
2025     {
2026       if (gimple_assign_rhs2 (next_stmt) == lhs)  /* Already second operand.  */
2027         {
2028           tree op = gimple_assign_rhs1 (next_stmt);
2029           gimple def_stmt = NULL;
2030
2031           if (TREE_CODE (op) == SSA_NAME)
2032             def_stmt = SSA_NAME_DEF_STMT (op);
2033
2034           /* Check that the other def is either defined in the loop
2035              ("vect_internal_def"), or it's an induction (defined by a
2036              loop-header phi-node).  */
2037           if (def_stmt
2038               && gimple_bb (def_stmt)
2039               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2040               && (is_gimple_assign (def_stmt)
2041                   || is_gimple_call (def_stmt)
2042                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2043                            == vect_induction_def
2044                   || (gimple_code (def_stmt) == GIMPLE_PHI
2045                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2046                                   == vect_internal_def
2047                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2048             {
2049               lhs = gimple_assign_lhs (next_stmt);
2050               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2051               continue;
2052             }
2053
2054           return false;
2055         }
2056       else
2057         {
2058           tree op = gimple_assign_rhs2 (next_stmt);
2059           gimple def_stmt = NULL;
2060
2061           if (TREE_CODE (op) == SSA_NAME)
2062             def_stmt = SSA_NAME_DEF_STMT (op);
2063
2064           /* Check that the other def is either defined in the loop
2065             ("vect_internal_def"), or it's an induction (defined by a
2066             loop-header phi-node).  */
2067           if (def_stmt
2068               && gimple_bb (def_stmt)
2069               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2070               && (is_gimple_assign (def_stmt)
2071                   || is_gimple_call (def_stmt)
2072                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2073                               == vect_induction_def
2074                   || (gimple_code (def_stmt) == GIMPLE_PHI
2075                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2076                                   == vect_internal_def
2077                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2078             {
2079               if (dump_enabled_p ())
2080                 {
2081                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2082                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2083                   dump_printf (MSG_NOTE, "\n");
2084                 }
2085
2086               swap_ssa_operands (next_stmt,
2087                                  gimple_assign_rhs1_ptr (next_stmt),
2088                                  gimple_assign_rhs2_ptr (next_stmt));
2089               update_stmt (next_stmt);
2090
2091               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2092                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2093             }
2094           else
2095             return false;
2096         }
2097
2098       lhs = gimple_assign_lhs (next_stmt);
2099       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2100     }
2101
2102   /* Save the chain for further analysis in SLP detection.  */
2103   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2104   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2105   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2106
2107   return true;
2108 }
2109
2110
2111 /* Function vect_is_simple_reduction_1
2112
2113    (1) Detect a cross-iteration def-use cycle that represents a simple
2114    reduction computation.  We look for the following pattern:
2115
2116    loop_header:
2117      a1 = phi < a0, a2 >
2118      a3 = ...
2119      a2 = operation (a3, a1)
2120
2121    or
2122
2123    a3 = ...
2124    loop_header:
2125      a1 = phi < a0, a2 >
2126      a2 = operation (a3, a1)
2127
2128    such that:
2129    1. operation is commutative and associative and it is safe to
2130       change the order of the computation (if CHECK_REDUCTION is true)
2131    2. no uses for a2 in the loop (a2 is used out of the loop)
2132    3. no uses of a1 in the loop besides the reduction operation
2133    4. no uses of a1 outside the loop.
2134
2135    Conditions 1,4 are tested here.
2136    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2137
2138    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2139    nested cycles, if CHECK_REDUCTION is false.
2140
2141    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2142    reductions:
2143
2144      a1 = phi < a0, a2 >
2145      inner loop (def of a3)
2146      a2 = phi < a3 >
2147
2148    If MODIFY is true it tries also to rework the code in-place to enable
2149    detection of more reduction patterns.  For the time being we rewrite
2150    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2151 */
2152
2153 static gimple
2154 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2155                             bool check_reduction, bool *double_reduc,
2156                             bool modify)
2157 {
2158   struct loop *loop = (gimple_bb (phi))->loop_father;
2159   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2160   edge latch_e = loop_latch_edge (loop);
2161   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2162   gimple def_stmt, def1 = NULL, def2 = NULL;
2163   enum tree_code orig_code, code;
2164   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2165   tree type;
2166   int nloop_uses;
2167   tree name;
2168   imm_use_iterator imm_iter;
2169   use_operand_p use_p;
2170   bool phi_def;
2171
2172   *double_reduc = false;
2173
2174   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2175      otherwise, we assume outer loop vectorization.  */
2176   gcc_assert ((check_reduction && loop == vect_loop)
2177               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2178
2179   name = PHI_RESULT (phi);
2180   nloop_uses = 0;
2181   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2182     {
2183       gimple use_stmt = USE_STMT (use_p);
2184       if (is_gimple_debug (use_stmt))
2185         continue;
2186
2187       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2188         {
2189           if (dump_enabled_p ())
2190             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191                              "intermediate value used outside loop.\n");
2192
2193           return NULL;
2194         }
2195
2196       if (vinfo_for_stmt (use_stmt)
2197           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2198         nloop_uses++;
2199       if (nloop_uses > 1)
2200         {
2201           if (dump_enabled_p ())
2202             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203                              "reduction used in loop.\n");
2204           return NULL;
2205         }
2206     }
2207
2208   if (TREE_CODE (loop_arg) != SSA_NAME)
2209     {
2210       if (dump_enabled_p ())
2211         {
2212           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2213                            "reduction: not ssa_name: ");
2214           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2215           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2216         }
2217       return NULL;
2218     }
2219
2220   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2221   if (!def_stmt)
2222     {
2223       if (dump_enabled_p ())
2224         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2225                          "reduction: no def_stmt.\n");
2226       return NULL;
2227     }
2228
2229   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2230     {
2231       if (dump_enabled_p ())
2232         {
2233           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2234           dump_printf (MSG_NOTE, "\n");
2235         }
2236       return NULL;
2237     }
2238
2239   if (is_gimple_assign (def_stmt))
2240     {
2241       name = gimple_assign_lhs (def_stmt);
2242       phi_def = false;
2243     }
2244   else
2245     {
2246       name = PHI_RESULT (def_stmt);
2247       phi_def = true;
2248     }
2249
2250   nloop_uses = 0;
2251   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2252     {
2253       gimple use_stmt = USE_STMT (use_p);
2254       if (is_gimple_debug (use_stmt))
2255         continue;
2256       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2257           && vinfo_for_stmt (use_stmt)
2258           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2259         nloop_uses++;
2260       if (nloop_uses > 1)
2261         {
2262           if (dump_enabled_p ())
2263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264                              "reduction used in loop.\n");
2265           return NULL;
2266         }
2267     }
2268
2269   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2270      defined in the inner loop.  */
2271   if (phi_def)
2272     {
2273       op1 = PHI_ARG_DEF (def_stmt, 0);
2274
2275       if (gimple_phi_num_args (def_stmt) != 1
2276           || TREE_CODE (op1) != SSA_NAME)
2277         {
2278           if (dump_enabled_p ())
2279             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280                              "unsupported phi node definition.\n");
2281
2282           return NULL;
2283         }
2284
2285       def1 = SSA_NAME_DEF_STMT (op1);
2286       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2287           && loop->inner
2288           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2289           && is_gimple_assign (def1))
2290         {
2291           if (dump_enabled_p ())
2292             report_vect_op (MSG_NOTE, def_stmt,
2293                             "detected double reduction: ");
2294
2295           *double_reduc = true;
2296           return def_stmt;
2297         }
2298
2299       return NULL;
2300     }
2301
2302   code = orig_code = gimple_assign_rhs_code (def_stmt);
2303
2304   /* We can handle "res -= x[i]", which is non-associative by
2305      simply rewriting this into "res += -x[i]".  Avoid changing
2306      gimple instruction for the first simple tests and only do this
2307      if we're allowed to change code at all.  */
2308   if (code == MINUS_EXPR
2309       && modify
2310       && (op1 = gimple_assign_rhs1 (def_stmt))
2311       && TREE_CODE (op1) == SSA_NAME
2312       && SSA_NAME_DEF_STMT (op1) == phi)
2313     code = PLUS_EXPR;
2314
2315   if (check_reduction
2316       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2317     {
2318       if (dump_enabled_p ())
2319         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2320                         "reduction: not commutative/associative: ");
2321       return NULL;
2322     }
2323
2324   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2325     {
2326       if (code != COND_EXPR)
2327         {
2328           if (dump_enabled_p ())
2329             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2330                             "reduction: not binary operation: ");
2331
2332           return NULL;
2333         }
2334
2335       op3 = gimple_assign_rhs1 (def_stmt);
2336       if (COMPARISON_CLASS_P (op3))
2337         {
2338           op4 = TREE_OPERAND (op3, 1);
2339           op3 = TREE_OPERAND (op3, 0);
2340         }
2341
2342       op1 = gimple_assign_rhs2 (def_stmt);
2343       op2 = gimple_assign_rhs3 (def_stmt);
2344
2345       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2346         {
2347           if (dump_enabled_p ())
2348             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2349                             "reduction: uses not ssa_names: ");
2350
2351           return NULL;
2352         }
2353     }
2354   else
2355     {
2356       op1 = gimple_assign_rhs1 (def_stmt);
2357       op2 = gimple_assign_rhs2 (def_stmt);
2358
2359       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2360         {
2361           if (dump_enabled_p ())
2362             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2363                             "reduction: uses not ssa_names: ");
2364
2365           return NULL;
2366         }
2367    }
2368
2369   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2370   if ((TREE_CODE (op1) == SSA_NAME
2371        && !types_compatible_p (type,TREE_TYPE (op1)))
2372       || (TREE_CODE (op2) == SSA_NAME
2373           && !types_compatible_p (type, TREE_TYPE (op2)))
2374       || (op3 && TREE_CODE (op3) == SSA_NAME
2375           && !types_compatible_p (type, TREE_TYPE (op3)))
2376       || (op4 && TREE_CODE (op4) == SSA_NAME
2377           && !types_compatible_p (type, TREE_TYPE (op4))))
2378     {
2379       if (dump_enabled_p ())
2380         {
2381           dump_printf_loc (MSG_NOTE, vect_location,
2382                            "reduction: multiple types: operation type: ");
2383           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2384           dump_printf (MSG_NOTE, ", operands types: ");
2385           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2386                              TREE_TYPE (op1));
2387           dump_printf (MSG_NOTE, ",");
2388           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2389                              TREE_TYPE (op2));
2390           if (op3)
2391             {
2392               dump_printf (MSG_NOTE, ",");
2393               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2394                                  TREE_TYPE (op3));
2395             }
2396
2397           if (op4)
2398             {
2399               dump_printf (MSG_NOTE, ",");
2400               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2401                                  TREE_TYPE (op4));
2402             }
2403           dump_printf (MSG_NOTE, "\n");
2404         }
2405
2406       return NULL;
2407     }
2408
2409   /* Check that it's ok to change the order of the computation.
2410      Generally, when vectorizing a reduction we change the order of the
2411      computation.  This may change the behavior of the program in some
2412      cases, so we need to check that this is ok.  One exception is when
2413      vectorizing an outer-loop: the inner-loop is executed sequentially,
2414      and therefore vectorizing reductions in the inner-loop during
2415      outer-loop vectorization is safe.  */
2416
2417   /* CHECKME: check for !flag_finite_math_only too?  */
2418   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2419       && check_reduction)
2420     {
2421       /* Changing the order of operations changes the semantics.  */
2422       if (dump_enabled_p ())
2423         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2424                         "reduction: unsafe fp math optimization: ");
2425       return NULL;
2426     }
2427   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2428            && check_reduction)
2429     {
2430       /* Changing the order of operations changes the semantics.  */
2431       if (dump_enabled_p ())
2432         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2433                         "reduction: unsafe int math optimization: ");
2434       return NULL;
2435     }
2436   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2437     {
2438       /* Changing the order of operations changes the semantics.  */
2439       if (dump_enabled_p ())
2440         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2441                         "reduction: unsafe fixed-point math optimization: ");
2442       return NULL;
2443     }
2444
2445   /* If we detected "res -= x[i]" earlier, rewrite it into
2446      "res += -x[i]" now.  If this turns out to be useless reassoc
2447      will clean it up again.  */
2448   if (orig_code == MINUS_EXPR)
2449     {
2450       tree rhs = gimple_assign_rhs2 (def_stmt);
2451       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2452       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2453                                                          rhs, NULL);
2454       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2455       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2456                                                           loop_info, NULL));
2457       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2458       gimple_assign_set_rhs2 (def_stmt, negrhs);
2459       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2460       update_stmt (def_stmt);
2461     }
2462
2463   /* Reduction is safe. We're dealing with one of the following:
2464      1) integer arithmetic and no trapv
2465      2) floating point arithmetic, and special flags permit this optimization
2466      3) nested cycle (i.e., outer loop vectorization).  */
2467   if (TREE_CODE (op1) == SSA_NAME)
2468     def1 = SSA_NAME_DEF_STMT (op1);
2469
2470   if (TREE_CODE (op2) == SSA_NAME)
2471     def2 = SSA_NAME_DEF_STMT (op2);
2472
2473   if (code != COND_EXPR
2474       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2475     {
2476       if (dump_enabled_p ())
2477         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2478       return NULL;
2479     }
2480
2481   /* Check that one def is the reduction def, defined by PHI,
2482      the other def is either defined in the loop ("vect_internal_def"),
2483      or it's an induction (defined by a loop-header phi-node).  */
2484
2485   if (def2 && def2 == phi
2486       && (code == COND_EXPR
2487           || !def1 || gimple_nop_p (def1)
2488           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2489           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2490               && (is_gimple_assign (def1)
2491                   || is_gimple_call (def1)
2492                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2493                       == vect_induction_def
2494                   || (gimple_code (def1) == GIMPLE_PHI
2495                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2496                           == vect_internal_def
2497                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2498     {
2499       if (dump_enabled_p ())
2500         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2501       return def_stmt;
2502     }
2503
2504   if (def1 && def1 == phi
2505       && (code == COND_EXPR
2506           || !def2 || gimple_nop_p (def2)
2507           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2508           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2509               && (is_gimple_assign (def2)
2510                   || is_gimple_call (def2)
2511                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2512                       == vect_induction_def
2513                   || (gimple_code (def2) == GIMPLE_PHI
2514                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2515                           == vect_internal_def
2516                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2517     {
2518       if (check_reduction)
2519         {
2520           /* Swap operands (just for simplicity - so that the rest of the code
2521              can assume that the reduction variable is always the last (second)
2522              argument).  */
2523           if (dump_enabled_p ())
2524             report_vect_op (MSG_NOTE, def_stmt,
2525                             "detected reduction: need to swap operands: ");
2526
2527           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2528                              gimple_assign_rhs2_ptr (def_stmt));
2529
2530           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2531             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2532         }
2533       else
2534         {
2535           if (dump_enabled_p ())
2536             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2537         }
2538
2539       return def_stmt;
2540     }
2541
2542   /* Try to find SLP reduction chain.  */
2543   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2544     {
2545       if (dump_enabled_p ())
2546         report_vect_op (MSG_NOTE, def_stmt,
2547                         "reduction: detected reduction chain: ");
2548
2549       return def_stmt;
2550     }
2551
2552   if (dump_enabled_p ())
2553     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2554                     "reduction: unknown pattern: ");
2555
2556   return NULL;
2557 }
2558
2559 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2560    in-place.  Arguments as there.  */
2561
2562 static gimple
2563 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2564                           bool check_reduction, bool *double_reduc)
2565 {
2566   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2567                                      double_reduc, false);
2568 }
2569
2570 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2571    in-place if it enables detection of more reductions.  Arguments
2572    as there.  */
2573
2574 gimple
2575 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2576                           bool check_reduction, bool *double_reduc)
2577 {
2578   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2579                                      double_reduc, true);
2580 }
2581
2582 /* Calculate the cost of one scalar iteration of the loop.  */
2583 int
2584 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2585 {
2586   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2587   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2588   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2589   int innerloop_iters, i, stmt_cost;
2590
2591   /* Count statements in scalar loop.  Using this as scalar cost for a single
2592      iteration for now.
2593
2594      TODO: Add outer loop support.
2595
2596      TODO: Consider assigning different costs to different scalar
2597      statements.  */
2598
2599   /* FORNOW.  */
2600   innerloop_iters = 1;
2601   if (loop->inner)
2602     innerloop_iters = 50; /* FIXME */
2603
2604   for (i = 0; i < nbbs; i++)
2605     {
2606       gimple_stmt_iterator si;
2607       basic_block bb = bbs[i];
2608
2609       if (bb->loop_father == loop->inner)
2610         factor = innerloop_iters;
2611       else
2612         factor = 1;
2613
2614       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2615         {
2616           gimple stmt = gsi_stmt (si);
2617           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2618
2619           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2620             continue;
2621
2622           /* Skip stmts that are not vectorized inside the loop.  */
2623           if (stmt_info
2624               && !STMT_VINFO_RELEVANT_P (stmt_info)
2625               && (!STMT_VINFO_LIVE_P (stmt_info)
2626                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2627               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2628             continue;
2629
2630           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2631             {
2632               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2633                stmt_cost = vect_get_stmt_cost (scalar_load);
2634              else
2635                stmt_cost = vect_get_stmt_cost (scalar_store);
2636             }
2637           else
2638             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2639
2640           scalar_single_iter_cost += stmt_cost * factor;
2641         }
2642     }
2643   return scalar_single_iter_cost;
2644 }
2645
2646 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2647 int
2648 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2649                              int *peel_iters_epilogue,
2650                              int scalar_single_iter_cost,
2651                              stmt_vector_for_cost *prologue_cost_vec,
2652                              stmt_vector_for_cost *epilogue_cost_vec)
2653 {
2654   int retval = 0;
2655   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2656
2657   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2658     {
2659       *peel_iters_epilogue = vf/2;
2660       if (dump_enabled_p ())
2661         dump_printf_loc (MSG_NOTE, vect_location,
2662                          "cost model: epilogue peel iters set to vf/2 "
2663                          "because loop iterations are unknown .\n");
2664
2665       /* If peeled iterations are known but number of scalar loop
2666          iterations are unknown, count a taken branch per peeled loop.  */
2667       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2668                                  NULL, 0, vect_prologue);
2669     }
2670   else
2671     {
2672       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2673       peel_iters_prologue = niters < peel_iters_prologue ?
2674                             niters : peel_iters_prologue;
2675       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2676       /* If we need to peel for gaps, but no peeling is required, we have to
2677          peel VF iterations.  */
2678       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2679         *peel_iters_epilogue = vf;
2680     }
2681
2682   if (peel_iters_prologue)
2683     retval += record_stmt_cost (prologue_cost_vec,
2684                                 peel_iters_prologue * scalar_single_iter_cost,
2685                                 scalar_stmt, NULL, 0, vect_prologue);
2686   if (*peel_iters_epilogue)
2687     retval += record_stmt_cost (epilogue_cost_vec,
2688                                 *peel_iters_epilogue * scalar_single_iter_cost,
2689                                 scalar_stmt, NULL, 0, vect_epilogue);
2690   return retval;
2691 }
2692
2693 /* Function vect_estimate_min_profitable_iters
2694
2695    Return the number of iterations required for the vector version of the
2696    loop to be profitable relative to the cost of the scalar version of the
2697    loop.  */
2698
2699 static void
2700 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2701                                     int *ret_min_profitable_niters,
2702                                     int *ret_min_profitable_estimate)
2703 {
2704   int min_profitable_iters;
2705   int min_profitable_estimate;
2706   int peel_iters_prologue;
2707   int peel_iters_epilogue;
2708   unsigned vec_inside_cost = 0;
2709   int vec_outside_cost = 0;
2710   unsigned vec_prologue_cost = 0;
2711   unsigned vec_epilogue_cost = 0;
2712   int scalar_single_iter_cost = 0;
2713   int scalar_outside_cost = 0;
2714   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2715   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2716   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2717
2718   /* Cost model disabled.  */
2719   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2720     {
2721       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2722       *ret_min_profitable_niters = 0;
2723       *ret_min_profitable_estimate = 0;
2724       return;
2725     }
2726
2727   /* Requires loop versioning tests to handle misalignment.  */
2728   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2729     {
2730       /*  FIXME: Make cost depend on complexity of individual check.  */
2731       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2732       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2733                             vect_prologue);
2734       dump_printf (MSG_NOTE,
2735                    "cost model: Adding cost of checks for loop "
2736                    "versioning to treat misalignment.\n");
2737     }
2738
2739   /* Requires loop versioning with alias checks.  */
2740   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2741     {
2742       /*  FIXME: Make cost depend on complexity of individual check.  */
2743       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2744       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2745                             vect_prologue);
2746       dump_printf (MSG_NOTE,
2747                    "cost model: Adding cost of checks for loop "
2748                    "versioning aliasing.\n");
2749     }
2750
2751   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2752       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2753     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2754                           vect_prologue);
2755
2756   /* Count statements in scalar loop.  Using this as scalar cost for a single
2757      iteration for now.
2758
2759      TODO: Add outer loop support.
2760
2761      TODO: Consider assigning different costs to different scalar
2762      statements.  */
2763
2764   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2765
2766   /* Add additional cost for the peeled instructions in prologue and epilogue
2767      loop.
2768
2769      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2770      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2771
2772      TODO: Build an expression that represents peel_iters for prologue and
2773      epilogue to be used in a run-time test.  */
2774
2775   if (npeel  < 0)
2776     {
2777       peel_iters_prologue = vf/2;
2778       dump_printf (MSG_NOTE, "cost model: "
2779                    "prologue peel iters set to vf/2.\n");
2780
2781       /* If peeling for alignment is unknown, loop bound of main loop becomes
2782          unknown.  */
2783       peel_iters_epilogue = vf/2;
2784       dump_printf (MSG_NOTE, "cost model: "
2785                    "epilogue peel iters set to vf/2 because "
2786                    "peeling for alignment is unknown.\n");
2787
2788       /* If peeled iterations are unknown, count a taken branch and a not taken
2789          branch per peeled loop. Even if scalar loop iterations are known,
2790          vector iterations are not known since peeled prologue iterations are
2791          not known. Hence guards remain the same.  */
2792       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2793                             NULL, 0, vect_prologue);
2794       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2795                             NULL, 0, vect_prologue);
2796       /* FORNOW: Don't attempt to pass individual scalar instructions to
2797          the model; just assume linear cost for scalar iterations.  */
2798       (void) add_stmt_cost (target_cost_data,
2799                             peel_iters_prologue * scalar_single_iter_cost,
2800                             scalar_stmt, NULL, 0, vect_prologue);
2801       (void) add_stmt_cost (target_cost_data,
2802                             peel_iters_epilogue * scalar_single_iter_cost,
2803                             scalar_stmt, NULL, 0, vect_epilogue);
2804     }
2805   else
2806     {
2807       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2808       stmt_info_for_cost *si;
2809       int j;
2810       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2811
2812       prologue_cost_vec.create (2);
2813       epilogue_cost_vec.create (2);
2814       peel_iters_prologue = npeel;
2815
2816       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2817                                           &peel_iters_epilogue,
2818                                           scalar_single_iter_cost,
2819                                           &prologue_cost_vec,
2820                                           &epilogue_cost_vec);
2821
2822       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2823         {
2824           struct _stmt_vec_info *stmt_info
2825             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2826           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2827                                 si->misalign, vect_prologue);
2828         }
2829
2830       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2831         {
2832           struct _stmt_vec_info *stmt_info
2833             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2834           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2835                                 si->misalign, vect_epilogue);
2836         }
2837
2838       prologue_cost_vec.release ();
2839       epilogue_cost_vec.release ();
2840     }
2841
2842   /* FORNOW: The scalar outside cost is incremented in one of the
2843      following ways:
2844
2845      1. The vectorizer checks for alignment and aliasing and generates
2846      a condition that allows dynamic vectorization.  A cost model
2847      check is ANDED with the versioning condition.  Hence scalar code
2848      path now has the added cost of the versioning check.
2849
2850        if (cost > th & versioning_check)
2851          jmp to vector code
2852
2853      Hence run-time scalar is incremented by not-taken branch cost.
2854
2855      2. The vectorizer then checks if a prologue is required.  If the
2856      cost model check was not done before during versioning, it has to
2857      be done before the prologue check.
2858
2859        if (cost <= th)
2860          prologue = scalar_iters
2861        if (prologue == 0)
2862          jmp to vector code
2863        else
2864          execute prologue
2865        if (prologue == num_iters)
2866          go to exit
2867
2868      Hence the run-time scalar cost is incremented by a taken branch,
2869      plus a not-taken branch, plus a taken branch cost.
2870
2871      3. The vectorizer then checks if an epilogue is required.  If the
2872      cost model check was not done before during prologue check, it
2873      has to be done with the epilogue check.
2874
2875        if (prologue == 0)
2876          jmp to vector code
2877        else
2878          execute prologue
2879        if (prologue == num_iters)
2880          go to exit
2881        vector code:
2882          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2883            jmp to epilogue
2884
2885      Hence the run-time scalar cost should be incremented by 2 taken
2886      branches.
2887
2888      TODO: The back end may reorder the BBS's differently and reverse
2889      conditions/branch directions.  Change the estimates below to
2890      something more reasonable.  */
2891
2892   /* If the number of iterations is known and we do not do versioning, we can
2893      decide whether to vectorize at compile time.  Hence the scalar version
2894      do not carry cost model guard costs.  */
2895   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2896       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2897       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2898     {
2899       /* Cost model check occurs at versioning.  */
2900       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2901           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2902         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2903       else
2904         {
2905           /* Cost model check occurs at prologue generation.  */
2906           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2907             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2908               + vect_get_stmt_cost (cond_branch_not_taken);
2909           /* Cost model check occurs at epilogue generation.  */
2910           else
2911             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2912         }
2913     }
2914
2915   /* Complete the target-specific cost calculations.  */
2916   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2917                &vec_inside_cost, &vec_epilogue_cost);
2918
2919   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2920
2921   /* Calculate number of iterations required to make the vector version
2922      profitable, relative to the loop bodies only.  The following condition
2923      must hold true:
2924      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2925      where
2926      SIC = scalar iteration cost, VIC = vector iteration cost,
2927      VOC = vector outside cost, VF = vectorization factor,
2928      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2929      SOC = scalar outside cost for run time cost model check.  */
2930
2931   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2932     {
2933       if (vec_outside_cost <= 0)
2934         min_profitable_iters = 1;
2935       else
2936         {
2937           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2938                                   - vec_inside_cost * peel_iters_prologue
2939                                   - vec_inside_cost * peel_iters_epilogue)
2940                                  / ((scalar_single_iter_cost * vf)
2941                                     - vec_inside_cost);
2942
2943           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2944               <= (((int) vec_inside_cost * min_profitable_iters)
2945                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2946             min_profitable_iters++;
2947         }
2948     }
2949   /* vector version will never be profitable.  */
2950   else
2951     {
2952       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
2953         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
2954                     "did not happen for a simd loop");
2955
2956       if (dump_enabled_p ())
2957         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2958                          "cost model: the vector iteration cost = %d "
2959                          "divided by the scalar iteration cost = %d "
2960                          "is greater or equal to the vectorization factor = %d"
2961                          ".\n",
2962                          vec_inside_cost, scalar_single_iter_cost, vf);
2963       *ret_min_profitable_niters = -1;
2964       *ret_min_profitable_estimate = -1;
2965       return;
2966     }
2967
2968   if (dump_enabled_p ())
2969     {
2970       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2971       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2972                    vec_inside_cost);
2973       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2974                    vec_prologue_cost);
2975       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2976                    vec_epilogue_cost);
2977       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2978                    scalar_single_iter_cost);
2979       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2980                    scalar_outside_cost);
2981       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2982                    vec_outside_cost);
2983       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2984                    peel_iters_prologue);
2985       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2986                    peel_iters_epilogue);
2987       dump_printf (MSG_NOTE,
2988                    "  Calculated minimum iters for profitability: %d\n",
2989                    min_profitable_iters);
2990       dump_printf (MSG_NOTE, "\n");
2991     }
2992
2993   min_profitable_iters =
2994         min_profitable_iters < vf ? vf : min_profitable_iters;
2995
2996   /* Because the condition we create is:
2997      if (niters <= min_profitable_iters)
2998        then skip the vectorized loop.  */
2999   min_profitable_iters--;
3000
3001   if (dump_enabled_p ())
3002     dump_printf_loc (MSG_NOTE, vect_location,
3003                      "  Runtime profitability threshold = %d\n",
3004                      min_profitable_iters);
3005
3006   *ret_min_profitable_niters = min_profitable_iters;
3007
3008   /* Calculate number of iterations required to make the vector version
3009      profitable, relative to the loop bodies only.
3010
3011      Non-vectorized variant is SIC * niters and it must win over vector
3012      variant on the expected loop trip count.  The following condition must hold true:
3013      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3014
3015   if (vec_outside_cost <= 0)
3016     min_profitable_estimate = 1;
3017   else
3018     {
3019       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3020                                  - vec_inside_cost * peel_iters_prologue
3021                                  - vec_inside_cost * peel_iters_epilogue)
3022                                  / ((scalar_single_iter_cost * vf)
3023                                    - vec_inside_cost);
3024     }
3025   min_profitable_estimate --;
3026   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3027   if (dump_enabled_p ())
3028     dump_printf_loc (MSG_NOTE, vect_location,
3029                      "  Static estimate profitability threshold = %d\n",
3030                       min_profitable_iters);
3031
3032   *ret_min_profitable_estimate = min_profitable_estimate;
3033 }
3034
3035
3036 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3037    functions. Design better to avoid maintenance issues.  */
3038
3039 /* Function vect_model_reduction_cost.
3040
3041    Models cost for a reduction operation, including the vector ops
3042    generated within the strip-mine loop, the initial definition before
3043    the loop, and the epilogue code that must be generated.  */
3044
3045 static bool
3046 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3047                            int ncopies)
3048 {
3049   int prologue_cost = 0, epilogue_cost = 0;
3050   enum tree_code code;
3051   optab optab;
3052   tree vectype;
3053   gimple stmt, orig_stmt;
3054   tree reduction_op;
3055   enum machine_mode mode;
3056   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3057   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3058   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3059
3060   /* Cost of reduction op inside loop.  */
3061   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3062                                         stmt_info, 0, vect_body);
3063   stmt = STMT_VINFO_STMT (stmt_info);
3064
3065   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3066     {
3067     case GIMPLE_SINGLE_RHS:
3068       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3069       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3070       break;
3071     case GIMPLE_UNARY_RHS:
3072       reduction_op = gimple_assign_rhs1 (stmt);
3073       break;
3074     case GIMPLE_BINARY_RHS:
3075       reduction_op = gimple_assign_rhs2 (stmt);
3076       break;
3077     case GIMPLE_TERNARY_RHS:
3078       reduction_op = gimple_assign_rhs3 (stmt);
3079       break;
3080     default:
3081       gcc_unreachable ();
3082     }
3083
3084   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3085   if (!vectype)
3086     {
3087       if (dump_enabled_p ())
3088         {
3089           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090                            "unsupported data-type ");
3091           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3092                              TREE_TYPE (reduction_op));
3093           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3094         }
3095       return false;
3096    }
3097
3098   mode = TYPE_MODE (vectype);
3099   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3100
3101   if (!orig_stmt)
3102     orig_stmt = STMT_VINFO_STMT (stmt_info);
3103
3104   code = gimple_assign_rhs_code (orig_stmt);
3105
3106   /* Add in cost for initial definition.  */
3107   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3108                                   stmt_info, 0, vect_prologue);
3109
3110   /* Determine cost of epilogue code.
3111
3112      We have a reduction operator that will reduce the vector in one statement.
3113      Also requires scalar extract.  */
3114
3115   if (!nested_in_vect_loop_p (loop, orig_stmt))
3116     {
3117       if (reduc_code != ERROR_MARK)
3118         {
3119           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3120                                           stmt_info, 0, vect_epilogue);
3121           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3122                                           stmt_info, 0, vect_epilogue);
3123         }
3124       else
3125         {
3126           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3127           tree bitsize =
3128             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3129           int element_bitsize = tree_to_uhwi (bitsize);
3130           int nelements = vec_size_in_bits / element_bitsize;
3131
3132           optab = optab_for_tree_code (code, vectype, optab_default);
3133
3134           /* We have a whole vector shift available.  */
3135           if (VECTOR_MODE_P (mode)
3136               && optab_handler (optab, mode) != CODE_FOR_nothing
3137               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3138             {
3139               /* Final reduction via vector shifts and the reduction operator.
3140                  Also requires scalar extract.  */
3141               epilogue_cost += add_stmt_cost (target_cost_data,
3142                                               exact_log2 (nelements) * 2,
3143                                               vector_stmt, stmt_info, 0,
3144                                               vect_epilogue);
3145               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3146                                               vec_to_scalar, stmt_info, 0,
3147                                               vect_epilogue);
3148             }
3149           else
3150             /* Use extracts and reduction op for final reduction.  For N
3151                elements, we have N extracts and N-1 reduction ops.  */
3152             epilogue_cost += add_stmt_cost (target_cost_data,
3153                                             nelements + nelements - 1,
3154                                             vector_stmt, stmt_info, 0,
3155                                             vect_epilogue);
3156         }
3157     }
3158
3159   if (dump_enabled_p ())
3160     dump_printf (MSG_NOTE,
3161                  "vect_model_reduction_cost: inside_cost = %d, "
3162                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3163                  prologue_cost, epilogue_cost);
3164
3165   return true;
3166 }
3167
3168
3169 /* Function vect_model_induction_cost.
3170
3171    Models cost for induction operations.  */
3172
3173 static void
3174 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3175 {
3176   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3177   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3178   unsigned inside_cost, prologue_cost;
3179
3180   /* loop cost for vec_loop.  */
3181   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3182                                stmt_info, 0, vect_body);
3183
3184   /* prologue cost for vec_init and vec_step.  */
3185   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3186                                  stmt_info, 0, vect_prologue);
3187
3188   if (dump_enabled_p ())
3189     dump_printf_loc (MSG_NOTE, vect_location,
3190                      "vect_model_induction_cost: inside_cost = %d, "
3191                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3192 }
3193
3194
3195 /* Function get_initial_def_for_induction
3196
3197    Input:
3198    STMT - a stmt that performs an induction operation in the loop.
3199    IV_PHI - the initial value of the induction variable
3200
3201    Output:
3202    Return a vector variable, initialized with the first VF values of
3203    the induction variable.  E.g., for an iv with IV_PHI='X' and
3204    evolution S, for a vector of 4 units, we want to return:
3205    [X, X + S, X + 2*S, X + 3*S].  */
3206
3207 static tree
3208 get_initial_def_for_induction (gimple iv_phi)
3209 {
3210   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3211   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3212   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3213   tree vectype;
3214   int nunits;
3215   edge pe = loop_preheader_edge (loop);
3216   struct loop *iv_loop;
3217   basic_block new_bb;
3218   tree new_vec, vec_init, vec_step, t;
3219   tree new_var;
3220   tree new_name;
3221   gimple init_stmt, induction_phi, new_stmt;
3222   tree induc_def, vec_def, vec_dest;
3223   tree init_expr, step_expr;
3224   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3225   int i;
3226   int ncopies;
3227   tree expr;
3228   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3229   bool nested_in_vect_loop = false;
3230   gimple_seq stmts = NULL;
3231   imm_use_iterator imm_iter;
3232   use_operand_p use_p;
3233   gimple exit_phi;
3234   edge latch_e;
3235   tree loop_arg;
3236   gimple_stmt_iterator si;
3237   basic_block bb = gimple_bb (iv_phi);
3238   tree stepvectype;
3239   tree resvectype;
3240
3241   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3242   if (nested_in_vect_loop_p (loop, iv_phi))
3243     {
3244       nested_in_vect_loop = true;
3245       iv_loop = loop->inner;
3246     }
3247   else
3248     iv_loop = loop;
3249   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3250
3251   latch_e = loop_latch_edge (iv_loop);
3252   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3253
3254   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3255   gcc_assert (step_expr != NULL_TREE);
3256
3257   pe = loop_preheader_edge (iv_loop);
3258   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3259                                      loop_preheader_edge (iv_loop));
3260
3261   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3262   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3263   gcc_assert (vectype);
3264   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3265   ncopies = vf / nunits;
3266
3267   gcc_assert (phi_info);
3268   gcc_assert (ncopies >= 1);
3269
3270   /* Convert the step to the desired type.  */
3271   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3272                                                   step_expr),
3273                                     &stmts, true, NULL_TREE);
3274   if (stmts)
3275     {
3276       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3277       gcc_assert (!new_bb);
3278     }
3279
3280   /* Find the first insertion point in the BB.  */
3281   si = gsi_after_labels (bb);
3282
3283   /* Create the vector that holds the initial_value of the induction.  */
3284   if (nested_in_vect_loop)
3285     {
3286       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3287          been created during vectorization of previous stmts.  We obtain it
3288          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3289       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3290       /* If the initial value is not of proper type, convert it.  */
3291       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3292         {
3293           new_stmt = gimple_build_assign_with_ops
3294               (VIEW_CONVERT_EXPR,
3295                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3296                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3297           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3298           gimple_assign_set_lhs (new_stmt, vec_init);
3299           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3300                                                  new_stmt);
3301           gcc_assert (!new_bb);
3302           set_vinfo_for_stmt (new_stmt,
3303                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3304         }
3305     }
3306   else
3307     {
3308       vec<constructor_elt, va_gc> *v;
3309
3310       /* iv_loop is the loop to be vectorized. Create:
3311          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3312       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3313                                        vect_scalar_var, "var_");
3314       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3315                                                      init_expr),
3316                                        &stmts, false, new_var);
3317       if (stmts)
3318         {
3319           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3320           gcc_assert (!new_bb);
3321         }
3322
3323       vec_alloc (v, nunits);
3324       bool constant_p = is_gimple_min_invariant (new_name);
3325       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3326       for (i = 1; i < nunits; i++)
3327         {
3328           /* Create: new_name_i = new_name + step_expr  */
3329           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3330                                   new_name, step_expr);
3331           if (!is_gimple_min_invariant (new_name))
3332             {
3333               init_stmt = gimple_build_assign (new_var, new_name);
3334               new_name = make_ssa_name (new_var, init_stmt);
3335               gimple_assign_set_lhs (init_stmt, new_name);
3336               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3337               gcc_assert (!new_bb);
3338               if (dump_enabled_p ())
3339                 {
3340                   dump_printf_loc (MSG_NOTE, vect_location,
3341                                    "created new init_stmt: ");
3342                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3343                   dump_printf (MSG_NOTE, "\n");
3344                 }
3345               constant_p = false;
3346             }
3347           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3348         }
3349       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3350       if (constant_p)
3351         new_vec = build_vector_from_ctor (vectype, v);
3352       else
3353         new_vec = build_constructor (vectype, v);
3354       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3355     }
3356
3357
3358   /* Create the vector that holds the step of the induction.  */
3359   if (nested_in_vect_loop)
3360     /* iv_loop is nested in the loop to be vectorized. Generate:
3361        vec_step = [S, S, S, S]  */
3362     new_name = step_expr;
3363   else
3364     {
3365       /* iv_loop is the loop to be vectorized. Generate:
3366           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3367       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3368         {
3369           expr = build_int_cst (integer_type_node, vf);
3370           expr = fold_convert (TREE_TYPE (step_expr), expr);
3371         }
3372       else
3373         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3374       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3375                               expr, step_expr);
3376       if (TREE_CODE (step_expr) == SSA_NAME)
3377         new_name = vect_init_vector (iv_phi, new_name,
3378                                      TREE_TYPE (step_expr), NULL);
3379     }
3380
3381   t = unshare_expr (new_name);
3382   gcc_assert (CONSTANT_CLASS_P (new_name)
3383               || TREE_CODE (new_name) == SSA_NAME);
3384   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3385   gcc_assert (stepvectype);
3386   new_vec = build_vector_from_val (stepvectype, t);
3387   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3388
3389
3390   /* Create the following def-use cycle:
3391      loop prolog:
3392          vec_init = ...
3393          vec_step = ...
3394      loop:
3395          vec_iv = PHI <vec_init, vec_loop>
3396          ...
3397          STMT
3398          ...
3399          vec_loop = vec_iv + vec_step;  */
3400
3401   /* Create the induction-phi that defines the induction-operand.  */
3402   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3403   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3404   set_vinfo_for_stmt (induction_phi,
3405                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3406   induc_def = PHI_RESULT (induction_phi);
3407
3408   /* Create the iv update inside the loop  */
3409   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3410                                            induc_def, vec_step);
3411   vec_def = make_ssa_name (vec_dest, new_stmt);
3412   gimple_assign_set_lhs (new_stmt, vec_def);
3413   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3414   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3415                                                    NULL));
3416
3417   /* Set the arguments of the phi node:  */
3418   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3419   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3420                UNKNOWN_LOCATION);
3421
3422
3423   /* In case that vectorization factor (VF) is bigger than the number
3424      of elements that we can fit in a vectype (nunits), we have to generate
3425      more than one vector stmt - i.e - we need to "unroll" the
3426      vector stmt by a factor VF/nunits.  For more details see documentation
3427      in vectorizable_operation.  */
3428
3429   if (ncopies > 1)
3430     {
3431       stmt_vec_info prev_stmt_vinfo;
3432       /* FORNOW. This restriction should be relaxed.  */
3433       gcc_assert (!nested_in_vect_loop);
3434
3435       /* Create the vector that holds the step of the induction.  */
3436       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3437         {
3438           expr = build_int_cst (integer_type_node, nunits);
3439           expr = fold_convert (TREE_TYPE (step_expr), expr);
3440         }
3441       else
3442         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3443       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3444                               expr, step_expr);
3445       if (TREE_CODE (step_expr) == SSA_NAME)
3446         new_name = vect_init_vector (iv_phi, new_name,
3447                                      TREE_TYPE (step_expr), NULL);
3448       t = unshare_expr (new_name);
3449       gcc_assert (CONSTANT_CLASS_P (new_name)
3450                   || TREE_CODE (new_name) == SSA_NAME);
3451       new_vec = build_vector_from_val (stepvectype, t);
3452       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3453
3454       vec_def = induc_def;
3455       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3456       for (i = 1; i < ncopies; i++)
3457         {
3458           /* vec_i = vec_prev + vec_step  */
3459           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3460                                                    vec_def, vec_step);
3461           vec_def = make_ssa_name (vec_dest, new_stmt);
3462           gimple_assign_set_lhs (new_stmt, vec_def);
3463
3464           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3465           if (!useless_type_conversion_p (resvectype, vectype))
3466             {
3467               new_stmt = gimple_build_assign_with_ops
3468                   (VIEW_CONVERT_EXPR,
3469                    vect_get_new_vect_var (resvectype, vect_simple_var,
3470                                           "vec_iv_"),
3471                    build1 (VIEW_CONVERT_EXPR, resvectype,
3472                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3473               gimple_assign_set_lhs (new_stmt,
3474                                      make_ssa_name
3475                                        (gimple_assign_lhs (new_stmt), new_stmt));
3476               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3477             }
3478           set_vinfo_for_stmt (new_stmt,
3479                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3480           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3481           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3482         }
3483     }
3484
3485   if (nested_in_vect_loop)
3486     {
3487       /* Find the loop-closed exit-phi of the induction, and record
3488          the final vector of induction results:  */
3489       exit_phi = NULL;
3490       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3491         {
3492           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3493             {
3494               exit_phi = USE_STMT (use_p);
3495               break;
3496             }
3497         }
3498       if (exit_phi)
3499         {
3500           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3501           /* FORNOW. Currently not supporting the case that an inner-loop induction
3502              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3503           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3504                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3505
3506           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3507           if (dump_enabled_p ())
3508             {
3509               dump_printf_loc (MSG_NOTE, vect_location,
3510                                "vector of inductions after inner-loop:");
3511               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3512               dump_printf (MSG_NOTE, "\n");
3513             }
3514         }
3515     }
3516
3517
3518   if (dump_enabled_p ())
3519     {
3520       dump_printf_loc (MSG_NOTE, vect_location,
3521                        "transform induction: created def-use cycle: ");
3522       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3523       dump_printf (MSG_NOTE, "\n");
3524       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3525                         SSA_NAME_DEF_STMT (vec_def), 0);
3526       dump_printf (MSG_NOTE, "\n");
3527     }
3528
3529   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3530   if (!useless_type_conversion_p (resvectype, vectype))
3531     {
3532       new_stmt = gimple_build_assign_with_ops
3533          (VIEW_CONVERT_EXPR,
3534           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3535           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3536       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3537       gimple_assign_set_lhs (new_stmt, induc_def);
3538       si = gsi_after_labels (bb);
3539       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3540       set_vinfo_for_stmt (new_stmt,
3541                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3542       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3543         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3544     }
3545
3546   return induc_def;
3547 }
3548
3549
3550 /* Function get_initial_def_for_reduction
3551
3552    Input:
3553    STMT - a stmt that performs a reduction operation in the loop.
3554    INIT_VAL - the initial value of the reduction variable
3555
3556    Output:
3557    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3558         of the reduction (used for adjusting the epilog - see below).
3559    Return a vector variable, initialized according to the operation that STMT
3560         performs. This vector will be used as the initial value of the
3561         vector of partial results.
3562
3563    Option1 (adjust in epilog): Initialize the vector as follows:
3564      add/bit or/xor:    [0,0,...,0,0]
3565      mult/bit and:      [1,1,...,1,1]
3566      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3567    and when necessary (e.g. add/mult case) let the caller know
3568    that it needs to adjust the result by init_val.
3569
3570    Option2: Initialize the vector as follows:
3571      add/bit or/xor:    [init_val,0,0,...,0]
3572      mult/bit and:      [init_val,1,1,...,1]
3573      min/max/cond_expr: [init_val,init_val,...,init_val]
3574    and no adjustments are needed.
3575
3576    For example, for the following code:
3577
3578    s = init_val;
3579    for (i=0;i<n;i++)
3580      s = s + a[i];
3581
3582    STMT is 's = s + a[i]', and the reduction variable is 's'.
3583    For a vector of 4 units, we want to return either [0,0,0,init_val],
3584    or [0,0,0,0] and let the caller know that it needs to adjust
3585    the result at the end by 'init_val'.
3586
3587    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3588    initialization vector is simpler (same element in all entries), if
3589    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3590
3591    A cost model should help decide between these two schemes.  */
3592
3593 tree
3594 get_initial_def_for_reduction (gimple stmt, tree init_val,
3595                                tree *adjustment_def)
3596 {
3597   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3598   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3599   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3600   tree scalar_type = TREE_TYPE (init_val);
3601   tree vectype = get_vectype_for_scalar_type (scalar_type);
3602   int nunits;
3603   enum tree_code code = gimple_assign_rhs_code (stmt);
3604   tree def_for_init;
3605   tree init_def;
3606   tree *elts;
3607   int i;
3608   bool nested_in_vect_loop = false;
3609   tree init_value;
3610   REAL_VALUE_TYPE real_init_val = dconst0;
3611   int int_init_val = 0;
3612   gimple def_stmt = NULL;
3613
3614   gcc_assert (vectype);
3615   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3616
3617   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3618               || SCALAR_FLOAT_TYPE_P (scalar_type));
3619
3620   if (nested_in_vect_loop_p (loop, stmt))
3621     nested_in_vect_loop = true;
3622   else
3623     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3624
3625   /* In case of double reduction we only create a vector variable to be put
3626      in the reduction phi node.  The actual statement creation is done in
3627      vect_create_epilog_for_reduction.  */
3628   if (adjustment_def && nested_in_vect_loop
3629       && TREE_CODE (init_val) == SSA_NAME
3630       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3631       && gimple_code (def_stmt) == GIMPLE_PHI
3632       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3633       && vinfo_for_stmt (def_stmt)
3634       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3635           == vect_double_reduction_def)
3636     {
3637       *adjustment_def = NULL;
3638       return vect_create_destination_var (init_val, vectype);
3639     }
3640
3641   if (TREE_CONSTANT (init_val))
3642     {
3643       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3644         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3645       else
3646         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3647     }
3648   else
3649     init_value = init_val;
3650
3651   switch (code)
3652     {
3653       case WIDEN_SUM_EXPR:
3654       case DOT_PROD_EXPR:
3655       case PLUS_EXPR:
3656       case MINUS_EXPR:
3657       case BIT_IOR_EXPR:
3658       case BIT_XOR_EXPR:
3659       case MULT_EXPR:
3660       case BIT_AND_EXPR:
3661         /* ADJUSMENT_DEF is NULL when called from
3662            vect_create_epilog_for_reduction to vectorize double reduction.  */
3663         if (adjustment_def)
3664           {
3665             if (nested_in_vect_loop)
3666               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3667                                                               NULL);
3668             else
3669               *adjustment_def = init_val;
3670           }
3671
3672         if (code == MULT_EXPR)
3673           {
3674             real_init_val = dconst1;
3675             int_init_val = 1;
3676           }
3677
3678         if (code == BIT_AND_EXPR)
3679           int_init_val = -1;
3680
3681         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3682           def_for_init = build_real (scalar_type, real_init_val);
3683         else
3684           def_for_init = build_int_cst (scalar_type, int_init_val);
3685
3686         /* Create a vector of '0' or '1' except the first element.  */
3687         elts = XALLOCAVEC (tree, nunits);
3688         for (i = nunits - 2; i >= 0; --i)
3689           elts[i + 1] = def_for_init;
3690
3691         /* Option1: the first element is '0' or '1' as well.  */
3692         if (adjustment_def)
3693           {
3694             elts[0] = def_for_init;
3695             init_def = build_vector (vectype, elts);
3696             break;
3697           }
3698
3699         /* Option2: the first element is INIT_VAL.  */
3700         elts[0] = init_val;
3701         if (TREE_CONSTANT (init_val))
3702           init_def = build_vector (vectype, elts);
3703         else
3704           {
3705             vec<constructor_elt, va_gc> *v;
3706             vec_alloc (v, nunits);
3707             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3708             for (i = 1; i < nunits; ++i)
3709               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3710             init_def = build_constructor (vectype, v);
3711           }
3712
3713         break;
3714
3715       case MIN_EXPR:
3716       case MAX_EXPR:
3717       case COND_EXPR:
3718         if (adjustment_def)
3719           {
3720             *adjustment_def = NULL_TREE;
3721             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3722             break;
3723           }
3724
3725         init_def = build_vector_from_val (vectype, init_value);
3726         break;
3727
3728       default:
3729         gcc_unreachable ();
3730     }
3731
3732   return init_def;
3733 }
3734
3735
3736 /* Function vect_create_epilog_for_reduction
3737
3738    Create code at the loop-epilog to finalize the result of a reduction
3739    computation.
3740
3741    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
3742      reduction statements.
3743    STMT is the scalar reduction stmt that is being vectorized.
3744    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3745      number of elements that we can fit in a vectype (nunits).  In this case
3746      we have to generate more than one vector stmt - i.e - we need to "unroll"
3747      the vector stmt by a factor VF/nunits.  For more details see documentation
3748      in vectorizable_operation.
3749    REDUC_CODE is the tree-code for the epilog reduction.
3750    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3751      computation.
3752    REDUC_INDEX is the index of the operand in the right hand side of the
3753      statement that is defined by REDUCTION_PHI.
3754    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3755    SLP_NODE is an SLP node containing a group of reduction statements. The
3756      first one in this group is STMT.
3757
3758    This function:
3759    1. Creates the reduction def-use cycles: sets the arguments for
3760       REDUCTION_PHIS:
3761       The loop-entry argument is the vectorized initial-value of the reduction.
3762       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3763       sums.
3764    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3765       by applying the operation specified by REDUC_CODE if available, or by
3766       other means (whole-vector shifts or a scalar loop).
3767       The function also creates a new phi node at the loop exit to preserve
3768       loop-closed form, as illustrated below.
3769
3770      The flow at the entry to this function:
3771
3772         loop:
3773           vec_def = phi <null, null>            # REDUCTION_PHI
3774           VECT_DEF = vector_stmt                # vectorized form of STMT
3775           s_loop = scalar_stmt                  # (scalar) STMT
3776         loop_exit:
3777           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3778           use <s_out0>
3779           use <s_out0>
3780
3781      The above is transformed by this function into:
3782
3783         loop:
3784           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3785           VECT_DEF = vector_stmt                # vectorized form of STMT
3786           s_loop = scalar_stmt                  # (scalar) STMT
3787         loop_exit:
3788           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3789           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3790           v_out2 = reduce <v_out1>
3791           s_out3 = extract_field <v_out2, 0>
3792           s_out4 = adjust_result <s_out3>
3793           use <s_out4>
3794           use <s_out4>
3795 */
3796
3797 static void
3798 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3799                                   int ncopies, enum tree_code reduc_code,
3800                                   vec<gimple> reduction_phis,
3801                                   int reduc_index, bool double_reduc,
3802                                   slp_tree slp_node)
3803 {
3804   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3805   stmt_vec_info prev_phi_info;
3806   tree vectype;
3807   enum machine_mode mode;
3808   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3809   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3810   basic_block exit_bb;
3811   tree scalar_dest;
3812   tree scalar_type;
3813   gimple new_phi = NULL, phi;
3814   gimple_stmt_iterator exit_gsi;
3815   tree vec_dest;
3816   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3817   gimple epilog_stmt = NULL;
3818   enum tree_code code = gimple_assign_rhs_code (stmt);
3819   gimple exit_phi;
3820   tree bitsize, bitpos;
3821   tree adjustment_def = NULL;
3822   tree vec_initial_def = NULL;
3823   tree reduction_op, expr, def;
3824   tree orig_name, scalar_result;
3825   imm_use_iterator imm_iter, phi_imm_iter;
3826   use_operand_p use_p, phi_use_p;
3827   bool extract_scalar_result = false;
3828   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3829   bool nested_in_vect_loop = false;
3830   auto_vec<gimple> new_phis;
3831   auto_vec<gimple> inner_phis;
3832   enum vect_def_type dt = vect_unknown_def_type;
3833   int j, i;
3834   auto_vec<tree> scalar_results;
3835   unsigned int group_size = 1, k, ratio;
3836   auto_vec<tree> vec_initial_defs;
3837   auto_vec<gimple> phis;
3838   bool slp_reduc = false;
3839   tree new_phi_result;
3840   gimple inner_phi = NULL;
3841
3842   if (slp_node)
3843     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3844
3845   if (nested_in_vect_loop_p (loop, stmt))
3846     {
3847       outer_loop = loop;
3848       loop = loop->inner;
3849       nested_in_vect_loop = true;
3850       gcc_assert (!slp_node);
3851     }
3852
3853   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3854     {
3855     case GIMPLE_SINGLE_RHS:
3856       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3857                   == ternary_op);
3858       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3859       break;
3860     case GIMPLE_UNARY_RHS:
3861       reduction_op = gimple_assign_rhs1 (stmt);
3862       break;
3863     case GIMPLE_BINARY_RHS:
3864       reduction_op = reduc_index ?
3865                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3866       break;
3867     case GIMPLE_TERNARY_RHS:
3868       reduction_op = gimple_op (stmt, reduc_index + 1);
3869       break;
3870     default:
3871       gcc_unreachable ();
3872     }
3873
3874   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3875   gcc_assert (vectype);
3876   mode = TYPE_MODE (vectype);
3877
3878   /* 1. Create the reduction def-use cycle:
3879      Set the arguments of REDUCTION_PHIS, i.e., transform
3880
3881         loop:
3882           vec_def = phi <null, null>            # REDUCTION_PHI
3883           VECT_DEF = vector_stmt                # vectorized form of STMT
3884           ...
3885
3886      into:
3887
3888         loop:
3889           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3890           VECT_DEF = vector_stmt                # vectorized form of STMT
3891           ...
3892
3893      (in case of SLP, do it for all the phis). */
3894
3895   /* Get the loop-entry arguments.  */
3896   if (slp_node)
3897     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3898                        NULL, slp_node, reduc_index);
3899   else
3900     {
3901       vec_initial_defs.create (1);
3902      /* For the case of reduction, vect_get_vec_def_for_operand returns
3903         the scalar def before the loop, that defines the initial value
3904         of the reduction variable.  */
3905       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3906                                                       &adjustment_def);
3907       vec_initial_defs.quick_push (vec_initial_def);
3908     }
3909
3910   /* Set phi nodes arguments.  */
3911   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3912     {
3913       tree vec_init_def = vec_initial_defs[i];
3914       tree def = vect_defs[i];
3915       for (j = 0; j < ncopies; j++)
3916         {
3917           /* Set the loop-entry arg of the reduction-phi.  */
3918           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3919                        UNKNOWN_LOCATION);
3920
3921           /* Set the loop-latch arg for the reduction-phi.  */
3922           if (j > 0)
3923             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3924
3925           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3926
3927           if (dump_enabled_p ())
3928             {
3929               dump_printf_loc (MSG_NOTE, vect_location,
3930                                "transform reduction: created def-use cycle: ");
3931               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3932               dump_printf (MSG_NOTE, "\n");
3933               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3934               dump_printf (MSG_NOTE, "\n");
3935             }
3936
3937           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3938         }
3939     }
3940
3941   /* 2. Create epilog code.
3942         The reduction epilog code operates across the elements of the vector
3943         of partial results computed by the vectorized loop.
3944         The reduction epilog code consists of:
3945
3946         step 1: compute the scalar result in a vector (v_out2)
3947         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3948         step 3: adjust the scalar result (s_out3) if needed.
3949
3950         Step 1 can be accomplished using one of the following three schemes:
3951           (scheme 1) using reduc_code, if available.
3952           (scheme 2) using whole-vector shifts, if available.
3953           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3954                      combined.
3955
3956           The overall epilog code looks like this:
3957
3958           s_out0 = phi <s_loop>         # original EXIT_PHI
3959           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3960           v_out2 = reduce <v_out1>              # step 1
3961           s_out3 = extract_field <v_out2, 0>    # step 2
3962           s_out4 = adjust_result <s_out3>       # step 3
3963
3964           (step 3 is optional, and steps 1 and 2 may be combined).
3965           Lastly, the uses of s_out0 are replaced by s_out4.  */
3966
3967
3968   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3969          v_out1 = phi <VECT_DEF>
3970          Store them in NEW_PHIS.  */
3971
3972   exit_bb = single_exit (loop)->dest;
3973   prev_phi_info = NULL;
3974   new_phis.create (vect_defs.length ());
3975   FOR_EACH_VEC_ELT (vect_defs, i, def)
3976     {
3977       for (j = 0; j < ncopies; j++)
3978         {
3979           tree new_def = copy_ssa_name (def, NULL);
3980           phi = create_phi_node (new_def, exit_bb);
3981           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3982           if (j == 0)
3983             new_phis.quick_push (phi);
3984           else
3985             {
3986               def = vect_get_vec_def_for_stmt_copy (dt, def);
3987               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3988             }
3989
3990           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3991           prev_phi_info = vinfo_for_stmt (phi);
3992         }
3993     }
3994
3995   /* The epilogue is created for the outer-loop, i.e., for the loop being
3996      vectorized.  Create exit phis for the outer loop.  */
3997   if (double_reduc)
3998     {
3999       loop = outer_loop;
4000       exit_bb = single_exit (loop)->dest;
4001       inner_phis.create (vect_defs.length ());
4002       FOR_EACH_VEC_ELT (new_phis, i, phi)
4003         {
4004           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4005           gimple outer_phi = create_phi_node (new_result, exit_bb);
4006           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4007                            PHI_RESULT (phi));
4008           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4009                                                             loop_vinfo, NULL));
4010           inner_phis.quick_push (phi);
4011           new_phis[i] = outer_phi;
4012           prev_phi_info = vinfo_for_stmt (outer_phi);
4013           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4014             {
4015               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4016               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4017               outer_phi = create_phi_node (new_result, exit_bb);
4018               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4019                                PHI_RESULT (phi));
4020               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4021                                                         loop_vinfo, NULL));
4022               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4023               prev_phi_info = vinfo_for_stmt (outer_phi);
4024             }
4025         }
4026     }
4027
4028   exit_gsi = gsi_after_labels (exit_bb);
4029
4030   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4031          (i.e. when reduc_code is not available) and in the final adjustment
4032          code (if needed).  Also get the original scalar reduction variable as
4033          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4034          represents a reduction pattern), the tree-code and scalar-def are
4035          taken from the original stmt that the pattern-stmt (STMT) replaces.
4036          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4037          are taken from STMT.  */
4038
4039   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4040   if (!orig_stmt)
4041     {
4042       /* Regular reduction  */
4043       orig_stmt = stmt;
4044     }
4045   else
4046     {
4047       /* Reduction pattern  */
4048       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4049       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4050       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4051     }
4052
4053   code = gimple_assign_rhs_code (orig_stmt);
4054   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4055      partial results are added and not subtracted.  */
4056   if (code == MINUS_EXPR)
4057     code = PLUS_EXPR;
4058
4059   scalar_dest = gimple_assign_lhs (orig_stmt);
4060   scalar_type = TREE_TYPE (scalar_dest);
4061   scalar_results.create (group_size);
4062   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4063   bitsize = TYPE_SIZE (scalar_type);
4064
4065   /* In case this is a reduction in an inner-loop while vectorizing an outer
4066      loop - we don't need to extract a single scalar result at the end of the
4067      inner-loop (unless it is double reduction, i.e., the use of reduction is
4068      outside the outer-loop).  The final vector of partial results will be used
4069      in the vectorized outer-loop, or reduced to a scalar result at the end of
4070      the outer-loop.  */
4071   if (nested_in_vect_loop && !double_reduc)
4072     goto vect_finalize_reduction;
4073
4074   /* SLP reduction without reduction chain, e.g.,
4075      # a1 = phi <a2, a0>
4076      # b1 = phi <b2, b0>
4077      a2 = operation (a1)
4078      b2 = operation (b1)  */
4079   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4080
4081   /* In case of reduction chain, e.g.,
4082      # a1 = phi <a3, a0>
4083      a2 = operation (a1)
4084      a3 = operation (a2),
4085
4086      we may end up with more than one vector result.  Here we reduce them to
4087      one vector.  */
4088   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4089     {
4090       tree first_vect = PHI_RESULT (new_phis[0]);
4091       tree tmp;
4092       gimple new_vec_stmt = NULL;
4093
4094       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4095       for (k = 1; k < new_phis.length (); k++)
4096         {
4097           gimple next_phi = new_phis[k];
4098           tree second_vect = PHI_RESULT (next_phi);
4099
4100           tmp = build2 (code, vectype,  first_vect, second_vect);
4101           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4102           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4103           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4104           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4105         }
4106
4107       new_phi_result = first_vect;
4108       if (new_vec_stmt)
4109         {
4110           new_phis.truncate (0);
4111           new_phis.safe_push (new_vec_stmt);
4112         }
4113     }
4114   else
4115     new_phi_result = PHI_RESULT (new_phis[0]);
4116
4117   /* 2.3 Create the reduction code, using one of the three schemes described
4118          above. In SLP we simply need to extract all the elements from the
4119          vector (without reducing them), so we use scalar shifts.  */
4120   if (reduc_code != ERROR_MARK && !slp_reduc)
4121     {
4122       tree tmp;
4123
4124       /*** Case 1:  Create:
4125            v_out2 = reduc_expr <v_out1>  */
4126
4127       if (dump_enabled_p ())
4128         dump_printf_loc (MSG_NOTE, vect_location,
4129                          "Reduce using direct vector reduction.\n");
4130
4131       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4132       tmp = build1 (reduc_code, vectype, new_phi_result);
4133       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4134       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4135       gimple_assign_set_lhs (epilog_stmt, new_temp);
4136       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4137
4138       extract_scalar_result = true;
4139     }
4140   else
4141     {
4142       enum tree_code shift_code = ERROR_MARK;
4143       bool have_whole_vector_shift = true;
4144       int bit_offset;
4145       int element_bitsize = tree_to_uhwi (bitsize);
4146       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4147       tree vec_temp;
4148
4149       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4150         shift_code = VEC_RSHIFT_EXPR;
4151       else
4152         have_whole_vector_shift = false;
4153
4154       /* Regardless of whether we have a whole vector shift, if we're
4155          emulating the operation via tree-vect-generic, we don't want
4156          to use it.  Only the first round of the reduction is likely
4157          to still be profitable via emulation.  */
4158       /* ??? It might be better to emit a reduction tree code here, so that
4159          tree-vect-generic can expand the first round via bit tricks.  */
4160       if (!VECTOR_MODE_P (mode))
4161         have_whole_vector_shift = false;
4162       else
4163         {
4164           optab optab = optab_for_tree_code (code, vectype, optab_default);
4165           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4166             have_whole_vector_shift = false;
4167         }
4168
4169       if (have_whole_vector_shift && !slp_reduc)
4170         {
4171           /*** Case 2: Create:
4172              for (offset = VS/2; offset >= element_size; offset/=2)
4173                 {
4174                   Create:  va' = vec_shift <va, offset>
4175                   Create:  va = vop <va, va'>
4176                 }  */
4177
4178           if (dump_enabled_p ())
4179             dump_printf_loc (MSG_NOTE, vect_location,
4180                              "Reduce using vector shifts\n");
4181
4182           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4183           new_temp = new_phi_result;
4184           for (bit_offset = vec_size_in_bits/2;
4185                bit_offset >= element_bitsize;
4186                bit_offset /= 2)
4187             {
4188               tree bitpos = size_int (bit_offset);
4189
4190               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4191                                                vec_dest, new_temp, bitpos);
4192               new_name = make_ssa_name (vec_dest, epilog_stmt);
4193               gimple_assign_set_lhs (epilog_stmt, new_name);
4194               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4195
4196               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4197                                                           new_name, new_temp);
4198               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4199               gimple_assign_set_lhs (epilog_stmt, new_temp);
4200               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4201             }
4202
4203           extract_scalar_result = true;
4204         }
4205       else
4206         {
4207           tree rhs;
4208
4209           /*** Case 3: Create:
4210              s = extract_field <v_out2, 0>
4211              for (offset = element_size;
4212                   offset < vector_size;
4213                   offset += element_size;)
4214                {
4215                  Create:  s' = extract_field <v_out2, offset>
4216                  Create:  s = op <s, s'>  // For non SLP cases
4217                }  */
4218
4219           if (dump_enabled_p ())
4220             dump_printf_loc (MSG_NOTE, vect_location,
4221                              "Reduce using scalar code.\n");
4222
4223           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4224           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4225             {
4226               if (gimple_code (new_phi) == GIMPLE_PHI)
4227                 vec_temp = PHI_RESULT (new_phi);
4228               else
4229                 vec_temp = gimple_assign_lhs (new_phi);
4230               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4231                             bitsize_zero_node);
4232               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4233               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4234               gimple_assign_set_lhs (epilog_stmt, new_temp);
4235               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4236
4237               /* In SLP we don't need to apply reduction operation, so we just
4238                  collect s' values in SCALAR_RESULTS.  */
4239               if (slp_reduc)
4240                 scalar_results.safe_push (new_temp);
4241
4242               for (bit_offset = element_bitsize;
4243                    bit_offset < vec_size_in_bits;
4244                    bit_offset += element_bitsize)
4245                 {
4246                   tree bitpos = bitsize_int (bit_offset);
4247                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4248                                      bitsize, bitpos);
4249
4250                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4251                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4252                   gimple_assign_set_lhs (epilog_stmt, new_name);
4253                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4254
4255                   if (slp_reduc)
4256                     {
4257                       /* In SLP we don't need to apply reduction operation, so
4258                          we just collect s' values in SCALAR_RESULTS.  */
4259                       new_temp = new_name;
4260                       scalar_results.safe_push (new_name);
4261                     }
4262                   else
4263                     {
4264                       epilog_stmt = gimple_build_assign_with_ops (code,
4265                                           new_scalar_dest, new_name, new_temp);
4266                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4267                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4268                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4269                     }
4270                 }
4271             }
4272
4273           /* The only case where we need to reduce scalar results in SLP, is
4274              unrolling.  If the size of SCALAR_RESULTS is greater than
4275              GROUP_SIZE, we reduce them combining elements modulo
4276              GROUP_SIZE.  */
4277           if (slp_reduc)
4278             {
4279               tree res, first_res, new_res;
4280               gimple new_stmt;
4281
4282               /* Reduce multiple scalar results in case of SLP unrolling.  */
4283               for (j = group_size; scalar_results.iterate (j, &res);
4284                    j++)
4285                 {
4286                   first_res = scalar_results[j % group_size];
4287                   new_stmt = gimple_build_assign_with_ops (code,
4288                                               new_scalar_dest, first_res, res);
4289                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4290                   gimple_assign_set_lhs (new_stmt, new_res);
4291                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4292                   scalar_results[j % group_size] = new_res;
4293                 }
4294             }
4295           else
4296             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4297             scalar_results.safe_push (new_temp);
4298
4299           extract_scalar_result = false;
4300         }
4301     }
4302
4303   /* 2.4  Extract the final scalar result.  Create:
4304           s_out3 = extract_field <v_out2, bitpos>  */
4305
4306   if (extract_scalar_result)
4307     {
4308       tree rhs;
4309
4310       if (dump_enabled_p ())
4311         dump_printf_loc (MSG_NOTE, vect_location,
4312                          "extract scalar result\n");
4313
4314       if (BYTES_BIG_ENDIAN)
4315         bitpos = size_binop (MULT_EXPR,
4316                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4317                              TYPE_SIZE (scalar_type));
4318       else
4319         bitpos = bitsize_zero_node;
4320
4321       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4322       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4323       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4324       gimple_assign_set_lhs (epilog_stmt, new_temp);
4325       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4326       scalar_results.safe_push (new_temp);
4327     }
4328
4329 vect_finalize_reduction:
4330
4331   if (double_reduc)
4332     loop = loop->inner;
4333
4334   /* 2.5 Adjust the final result by the initial value of the reduction
4335          variable. (When such adjustment is not needed, then
4336          'adjustment_def' is zero).  For example, if code is PLUS we create:
4337          new_temp = loop_exit_def + adjustment_def  */
4338
4339   if (adjustment_def)
4340     {
4341       gcc_assert (!slp_reduc);
4342       if (nested_in_vect_loop)
4343         {
4344           new_phi = new_phis[0];
4345           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4346           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4347           new_dest = vect_create_destination_var (scalar_dest, vectype);
4348         }
4349       else
4350         {
4351           new_temp = scalar_results[0];
4352           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4353           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4354           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4355         }
4356
4357       epilog_stmt = gimple_build_assign (new_dest, expr);
4358       new_temp = make_ssa_name (new_dest, epilog_stmt);
4359       gimple_assign_set_lhs (epilog_stmt, new_temp);
4360       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4361       if (nested_in_vect_loop)
4362         {
4363           set_vinfo_for_stmt (epilog_stmt,
4364                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4365                                                  NULL));
4366           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4367                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4368
4369           if (!double_reduc)
4370             scalar_results.quick_push (new_temp);
4371           else
4372             scalar_results[0] = new_temp;
4373         }
4374       else
4375         scalar_results[0] = new_temp;
4376
4377       new_phis[0] = epilog_stmt;
4378     }
4379
4380   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4381           phis with new adjusted scalar results, i.e., replace use <s_out0>
4382           with use <s_out4>.
4383
4384      Transform:
4385         loop_exit:
4386           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4387           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4388           v_out2 = reduce <v_out1>
4389           s_out3 = extract_field <v_out2, 0>
4390           s_out4 = adjust_result <s_out3>
4391           use <s_out0>
4392           use <s_out0>
4393
4394      into:
4395
4396         loop_exit:
4397           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4398           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4399           v_out2 = reduce <v_out1>
4400           s_out3 = extract_field <v_out2, 0>
4401           s_out4 = adjust_result <s_out3>
4402           use <s_out4>
4403           use <s_out4> */
4404
4405
4406   /* In SLP reduction chain we reduce vector results into one vector if
4407      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4408      the last stmt in the reduction chain, since we are looking for the loop
4409      exit phi node.  */
4410   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4411     {
4412       scalar_dest = gimple_assign_lhs (
4413                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4414       group_size = 1;
4415     }
4416
4417   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4418      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4419      need to match SCALAR_RESULTS with corresponding statements.  The first
4420      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4421      the first vector stmt, etc.
4422      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4423   if (group_size > new_phis.length ())
4424     {
4425       ratio = group_size / new_phis.length ();
4426       gcc_assert (!(group_size % new_phis.length ()));
4427     }
4428   else
4429     ratio = 1;
4430
4431   for (k = 0; k < group_size; k++)
4432     {
4433       if (k % ratio == 0)
4434         {
4435           epilog_stmt = new_phis[k / ratio];
4436           reduction_phi = reduction_phis[k / ratio];
4437           if (double_reduc)
4438             inner_phi = inner_phis[k / ratio];
4439         }
4440
4441       if (slp_reduc)
4442         {
4443           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4444
4445           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4446           /* SLP statements can't participate in patterns.  */
4447           gcc_assert (!orig_stmt);
4448           scalar_dest = gimple_assign_lhs (current_stmt);
4449         }
4450
4451       phis.create (3);
4452       /* Find the loop-closed-use at the loop exit of the original scalar
4453          result.  (The reduction result is expected to have two immediate uses -
4454          one at the latch block, and one at the loop exit).  */
4455       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4456         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4457             && !is_gimple_debug (USE_STMT (use_p)))
4458           phis.safe_push (USE_STMT (use_p));
4459
4460       /* While we expect to have found an exit_phi because of loop-closed-ssa
4461          form we can end up without one if the scalar cycle is dead.  */
4462
4463       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4464         {
4465           if (outer_loop)
4466             {
4467               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4468               gimple vect_phi;
4469
4470               /* FORNOW. Currently not supporting the case that an inner-loop
4471                  reduction is not used in the outer-loop (but only outside the
4472                  outer-loop), unless it is double reduction.  */
4473               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4474                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4475                           || double_reduc);
4476
4477               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4478               if (!double_reduc
4479                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4480                       != vect_double_reduction_def)
4481                 continue;
4482
4483               /* Handle double reduction:
4484
4485                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4486                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4487                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4488                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4489
4490                  At that point the regular reduction (stmt2 and stmt3) is
4491                  already vectorized, as well as the exit phi node, stmt4.
4492                  Here we vectorize the phi node of double reduction, stmt1, and
4493                  update all relevant statements.  */
4494
4495               /* Go through all the uses of s2 to find double reduction phi
4496                  node, i.e., stmt1 above.  */
4497               orig_name = PHI_RESULT (exit_phi);
4498               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4499                 {
4500                   stmt_vec_info use_stmt_vinfo;
4501                   stmt_vec_info new_phi_vinfo;
4502                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4503                   basic_block bb = gimple_bb (use_stmt);
4504                   gimple use;
4505
4506                   /* Check that USE_STMT is really double reduction phi
4507                      node.  */
4508                   if (gimple_code (use_stmt) != GIMPLE_PHI
4509                       || gimple_phi_num_args (use_stmt) != 2
4510                       || bb->loop_father != outer_loop)
4511                     continue;
4512                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4513                   if (!use_stmt_vinfo
4514                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4515                           != vect_double_reduction_def)
4516                     continue;
4517
4518                   /* Create vector phi node for double reduction:
4519                      vs1 = phi <vs0, vs2>
4520                      vs1 was created previously in this function by a call to
4521                        vect_get_vec_def_for_operand and is stored in
4522                        vec_initial_def;
4523                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4524                      vs0 is created here.  */
4525
4526                   /* Create vector phi node.  */
4527                   vect_phi = create_phi_node (vec_initial_def, bb);
4528                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4529                                     loop_vec_info_for_loop (outer_loop), NULL);
4530                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4531
4532                   /* Create vs0 - initial def of the double reduction phi.  */
4533                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4534                                              loop_preheader_edge (outer_loop));
4535                   init_def = get_initial_def_for_reduction (stmt,
4536                                                           preheader_arg, NULL);
4537                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4538                                                     vectype, NULL);
4539
4540                   /* Update phi node arguments with vs0 and vs2.  */
4541                   add_phi_arg (vect_phi, vect_phi_init,
4542                                loop_preheader_edge (outer_loop),
4543                                UNKNOWN_LOCATION);
4544                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4545                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4546                   if (dump_enabled_p ())
4547                     {
4548                       dump_printf_loc (MSG_NOTE, vect_location,
4549                                        "created double reduction phi node: ");
4550                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4551                       dump_printf (MSG_NOTE, "\n");
4552                     }
4553
4554                   vect_phi_res = PHI_RESULT (vect_phi);
4555
4556                   /* Replace the use, i.e., set the correct vs1 in the regular
4557                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4558                      loop is redundant.  */
4559                   use = reduction_phi;
4560                   for (j = 0; j < ncopies; j++)
4561                     {
4562                       edge pr_edge = loop_preheader_edge (loop);
4563                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4564                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4565                     }
4566                 }
4567             }
4568         }
4569
4570       phis.release ();
4571       if (nested_in_vect_loop)
4572         {
4573           if (double_reduc)
4574             loop = outer_loop;
4575           else
4576             continue;
4577         }
4578
4579       phis.create (3);
4580       /* Find the loop-closed-use at the loop exit of the original scalar
4581          result.  (The reduction result is expected to have two immediate uses,
4582          one at the latch block, and one at the loop exit).  For double
4583          reductions we are looking for exit phis of the outer loop.  */
4584       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4585         {
4586           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4587             {
4588               if (!is_gimple_debug (USE_STMT (use_p)))
4589                 phis.safe_push (USE_STMT (use_p));
4590             }
4591           else
4592             {
4593               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4594                 {
4595                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4596
4597                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4598                     {
4599                       if (!flow_bb_inside_loop_p (loop,
4600                                              gimple_bb (USE_STMT (phi_use_p)))
4601                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4602                         phis.safe_push (USE_STMT (phi_use_p));
4603                     }
4604                 }
4605             }
4606         }
4607
4608       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4609         {
4610           /* Replace the uses:  */
4611           orig_name = PHI_RESULT (exit_phi);
4612           scalar_result = scalar_results[k];
4613           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4614             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4615               SET_USE (use_p, scalar_result);
4616         }
4617
4618       phis.release ();
4619     }
4620 }
4621
4622
4623 /* Function vectorizable_reduction.
4624
4625    Check if STMT performs a reduction operation that can be vectorized.
4626    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4627    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4628    Return FALSE if not a vectorizable STMT, TRUE otherwise.
     
        When VEC_STMT is NULL only the analysis is done: on success STMT is
        marked with reduc_vec_info_type and its cost is modelled, but no code
        is generated.  SLP_NODE, if non-NULL, is the SLP node being handled;
        the vector stmts created for it are then recorded in
        SLP_TREE_VEC_STMTS (SLP_NODE).
4629
4630    This function also handles reduction idioms (patterns) that have been
4631    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4632    of this form:
4633      X = pattern_expr (arg0, arg1, ..., X)
4634    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4635    sequence that had been detected and replaced by the pattern-stmt (STMT).
4636
4637    In some cases of reduction patterns, the type of the reduction variable X is
4638    different than the type of the other arguments of STMT.
4639    In such cases, the vectype that is used when transforming STMT into a vector
4640    stmt is different than the vectype that is used to determine the
4641    vectorization factor, because it consists of a different number of elements
4642    than the actual number of elements that are being operated upon in parallel.
4643
4644    For example, consider an accumulation of shorts into an int accumulator.
4645    On some targets it's possible to vectorize this pattern operating on 8
4646    shorts at a time (hence, the vectype for purposes of determining the
4647    vectorization factor should be V8HI); on the other hand, the vectype that
4648    is used to create the vector form is actually V4SI (the type of the result).
4649
4650    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4651    indicates what is the actual level of parallelism (V8HI in the example), so
4652    that the right vectorization factor would be derived.  This vectype
4653    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4654    be used to create the vectorized stmt.  The right vectype for the vectorized
4655    stmt is obtained from the type of the result X:
4656         get_vectype_for_scalar_type (TREE_TYPE (X))
4657
4658    This means that, contrary to "regular" reductions (or "regular" stmts in
4659    general), the following equation:
4660       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4661    does *NOT* necessarily hold for reduction patterns.  */
4662
4663 bool
4664 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4665                         gimple *vec_stmt, slp_tree slp_node)
4666 {
4667   tree vec_dest;
4668   tree scalar_dest;
4669   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4670   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4671   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4672   tree vectype_in = NULL_TREE;
4673   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4674   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4675   enum tree_code code, orig_code, epilog_reduc_code;
4676   enum machine_mode vec_mode;
4677   int op_type;
4678   optab optab, reduc_optab;
4679   tree new_temp = NULL_TREE;
4680   tree def;
4681   gimple def_stmt;
4682   enum vect_def_type dt;
4683   gimple new_phi = NULL;
4684   tree scalar_type;
4685   bool is_simple_use;
4686   gimple orig_stmt;
4687   stmt_vec_info orig_stmt_info;
4688   tree expr = NULL_TREE;
4689   int i;
4690   int ncopies;
4691   int epilog_copies;
4692   stmt_vec_info prev_stmt_info, prev_phi_info;
4693   bool single_defuse_cycle = false;
4694   tree reduc_def = NULL_TREE;
4695   gimple new_stmt = NULL;
4696   int j;
4697   tree ops[3];
4698   bool nested_cycle = false, found_nested_cycle_def = false;
4699   gimple reduc_def_stmt = NULL;
4700   /* The default is that the reduction variable is the last in statement.  */
4701   int reduc_index = 2;
4702   bool double_reduc = false, dummy;
4703   basic_block def_bb;
4704   struct loop * def_stmt_loop, *outer_loop = NULL;
4705   tree def_arg;
4706   gimple def_arg_stmt;
4707   auto_vec<tree> vec_oprnds0;
4708   auto_vec<tree> vec_oprnds1;
4709   auto_vec<tree> vect_defs;
4710   auto_vec<gimple> phis;
4711   int vec_num;
4712   tree def0, def1, tem, op0, op1 = NULL_TREE;
4713
4714   /* In case of reduction chain we switch to the first stmt in the chain, but
4715      we don't update STMT_INFO, since only the last stmt is marked as reduction
4716      and has reduction properties.  */
4717   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4718     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4719
       /* STMT is nested in the loop being vectorized: work on the inner loop
          from here on and remember the vectorized loop in OUTER_LOOP.  */
4720   if (nested_in_vect_loop_p (loop, stmt))
4721     {
4722       outer_loop = loop;
4723       loop = loop->inner;
4724       nested_cycle = true;
4725     }
4726
4727   /* 1. Is vectorizable reduction?  */
4728   /* Not supportable if the reduction variable is used in the loop, unless
4729      it's a reduction chain.  */
4730   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4731       && !GROUP_FIRST_ELEMENT (stmt_info))
4732     return false;
4733
4734   /* Reductions that are not used even in an enclosing outer-loop,
4735      are expected to be "live" (used out of the loop).  */
4736   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4737       && !STMT_VINFO_LIVE_P (stmt_info))
4738     return false;
4739
4740   /* Make sure it was already recognized as a reduction computation.  */
4741   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4742       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4743     return false;
4744
4745   /* 2. Has this been recognized as a reduction pattern?
4746
4747      Check if STMT represents a pattern that has been recognized
4748      in earlier analysis stages.  For stmts that represent a pattern,
4749      the STMT_VINFO_RELATED_STMT field records the last stmt in
4750      the original sequence that constitutes the pattern.  */
4751
4752   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4753   if (orig_stmt)
4754     {
           /* ORIG_STMT_INFO is only used for these consistency checks.  */
4755       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4756       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4757       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4758     }
4759
4760   /* 3. Check the operands of the operation.  The first operands are defined
4761         inside the loop body. The last operand is the reduction variable,
4762         which is defined by the loop-header-phi.  */
4763
4764   gcc_assert (is_gimple_assign (stmt));
4765
4766   /* Flatten RHS.  Collect the operands in OPS, the operation in CODE and
          its number of operands in OP_TYPE.  Unary operations and single
          RHSs that do not have exactly three operands are rejected.  */
4767   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4768     {
4769     case GIMPLE_SINGLE_RHS:
4770       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4771       if (op_type == ternary_op)
4772         {
4773           tree rhs = gimple_assign_rhs1 (stmt);
4774           ops[0] = TREE_OPERAND (rhs, 0);
4775           ops[1] = TREE_OPERAND (rhs, 1);
4776           ops[2] = TREE_OPERAND (rhs, 2);
4777           code = TREE_CODE (rhs);
4778         }
4779       else
4780         return false;
4781       break;
4782
4783     case GIMPLE_BINARY_RHS:
4784       code = gimple_assign_rhs_code (stmt);
4785       op_type = TREE_CODE_LENGTH (code);
4786       gcc_assert (op_type == binary_op);
4787       ops[0] = gimple_assign_rhs1 (stmt);
4788       ops[1] = gimple_assign_rhs2 (stmt);
4789       break;
4790
4791     case GIMPLE_TERNARY_RHS:
4792       code = gimple_assign_rhs_code (stmt);
4793       op_type = TREE_CODE_LENGTH (code);
4794       gcc_assert (op_type == ternary_op);
4795       ops[0] = gimple_assign_rhs1 (stmt);
4796       ops[1] = gimple_assign_rhs2 (stmt);
4797       ops[2] = gimple_assign_rhs3 (stmt);
4798       break;
4799
4800     case GIMPLE_UNARY_RHS:
4801       return false;
4802
4803     default:
4804       gcc_unreachable ();
4805     }
4806
       /* COND_EXPR reductions are not handled for SLP.  */
4807   if (code == COND_EXPR && slp_node)
4808     return false;
4809
       /* Only pointer, integral and scalar floating-point results are
          handled.  */
4810   scalar_dest = gimple_assign_lhs (stmt);
4811   scalar_type = TREE_TYPE (scalar_dest);
4812   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4813       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4814     return false;
4815
4816   /* Do not try to vectorize bit-precision reductions.  */
4817   if ((TYPE_PRECISION (scalar_type)
4818        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4819     return false;
4820
4821   /* All uses but the last are expected to be defined in the loop.
4822      The last use is the reduction variable.  In case of nested cycle this
4823      assumption is not true: we use reduc_index to record the index of the
4824      reduction variable.  */
4825   for (i = 0; i < op_type - 1; i++)
4826     {
4827       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4828       if (i == 0 && code == COND_EXPR)
4829         continue;
4830
4831       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4832                                             &def_stmt, &def, &dt, &tem);
           /* VECTYPE_IN becomes the first non-NULL vector type reported for
              an operand.  */
4833       if (!vectype_in)
4834         vectype_in = tem;
4835       gcc_assert (is_simple_use);
4836
4837       if (dt != vect_internal_def
4838           && dt != vect_external_def
4839           && dt != vect_constant_def
4840           && dt != vect_induction_def
4841           && !(dt == vect_nested_cycle && nested_cycle))
4842         return false;
4843
4844       if (dt == vect_nested_cycle)
4845         {
4846           found_nested_cycle_def = true;
4847           reduc_def_stmt = def_stmt;
4848           reduc_index = i;
4849         }
4850     }
4851
       /* I is now OP_TYPE - 1, the index of the last operand.  */
4852   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4853                                         &def_stmt, &def, &dt, &tem);
4854   if (!vectype_in)
4855     vectype_in = tem;
4856   gcc_assert (is_simple_use);
4857   if (!(dt == vect_reduction_def
4858         || dt == vect_nested_cycle
4859         || ((dt == vect_internal_def || dt == vect_external_def
4860              || dt == vect_constant_def || dt == vect_induction_def)
4861             && nested_cycle && found_nested_cycle_def)))
4862     {
4863       /* For pattern recognized stmts, orig_stmt might be a reduction,
4864          but some helper statements for the pattern might not, or
4865          might be COND_EXPRs with reduction uses in the condition.  */
4866       gcc_assert (orig_stmt);
4867       return false;
4868     }
4869   if (!found_nested_cycle_def)
4870     reduc_def_stmt = def_stmt;
4871
       /* Cross-check against the reduction detection: the PHI that defines the
          reduction operand must lead back to ORIG_STMT (for patterns) or to
          STMT.  */
4872   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4873   if (orig_stmt)
4874     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4875                                                        reduc_def_stmt,
4876                                                        !nested_cycle,
4877                                                        &dummy));
4878   else
4879     {
4880       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4881                                              !nested_cycle, &dummy);
4882       /* We changed STMT to be the first stmt in reduction chain, hence we
4883          check that in this case the first element in the chain is STMT.  */
4884       gcc_assert (stmt == tmp
4885                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4886     }
4887
       /* A reduction PHI that is itself marked live is not supported.  */
4888   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4889     return false;
4890
       /* NCOPIES is the number of vector stmts of type VECTYPE_IN needed to
          cover the vectorization factor; for SLP a single copy is used
          here.  */
4891   if (slp_node || PURE_SLP_STMT (stmt_info))
4892     ncopies = 1;
4893   else
4894     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4895                / TYPE_VECTOR_SUBPARTS (vectype_in));
4896
4897   gcc_assert (ncopies >= 1);
4898
4899   vec_mode = TYPE_MODE (vectype_in);
4900
4901   if (code == COND_EXPR)
4902     {
           /* NULL is passed in the VEC_STMT position, so presumably this only
              checks whether the condition can be vectorized.  */
4903       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4904         {
4905           if (dump_enabled_p ())
4906             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4907                              "unsupported condition in reduction\n");
4908
4909             return false;
4910         }
4911     }
4912   else
4913     {
4914       /* 4. Supportable by target?  */
4915
4916       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4917           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4918         {
4919           /* Shifts and rotates are only supported by vectorizable_shifts,
4920              not vectorizable_reduction.  */
4921           if (dump_enabled_p ())
4922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4923                              "unsupported shift or rotation.\n");
4924           return false;
4925         }
4926
4927       /* 4.1. check support for the operation in the loop  */
4928       optab = optab_for_tree_code (code, vectype_in, optab_default);
4929       if (!optab)
4930         {
4931           if (dump_enabled_p ())
4932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4933                              "no optab.\n");
4934
4935           return false;
4936         }
4937
4938       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4939         {
4940           if (dump_enabled_p ())
4941             dump_printf (MSG_NOTE, "op not supported by target.\n");
4942
               /* Without a handler the operation is only accepted when the
                  vector mode is exactly one word wide and the vectorization
                  factor reaches the minimum worthwhile for CODE.  */
4943           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4944               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4945                   < vect_min_worthwhile_factor (code))
4946             return false;
4947
4948           if (dump_enabled_p ())
4949             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4950         }
4951
4952       /* Worthwhile without SIMD support?  */
4953       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4954           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4955              < vect_min_worthwhile_factor (code))
4956         {
4957           if (dump_enabled_p ())
4958             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4959                              "not worthwhile without SIMD support.\n");
4960
4961           return false;
4962         }
4963     }
4964
4965   /* 4.2. Check support for the epilog operation.
4966
4967           If STMT represents a reduction pattern, then the type of the
4968           reduction variable may be different than the type of the rest
4969           of the arguments.  For example, consider the case of accumulation
4970           of shorts into an int accumulator; The original code:
4971                         S1: int_a = (int) short_a;
4972           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4973
4974           was replaced with:
4975                         STMT: int_acc = widen_sum <short_a, int_acc>
4976
4977           This means that:
4978           1. The tree-code that is used to create the vector operation in the
4979              epilog code (that reduces the partial results) is not the
4980              tree-code of STMT, but is rather the tree-code of the original
4981              stmt from the pattern that STMT is replacing.  I.e, in the example
4982              above we want to use 'widen_sum' in the loop, but 'plus' in the
4983              epilog.
4984           2. The type (mode) we use to check available target support
4985              for the vector operation to be created in the *epilog*, is
4986              determined by the type of the reduction variable (in the example
4987              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4988              However the type (mode) we use to check available target support
4989              for the vector operation to be created *inside the loop*, is
4990              determined by the type of the other arguments to STMT (in the
4991              example we'd check this: optab_handler (widen_sum_optab,
4992              vect_short_mode)).
4993
4994           This is contrary to "regular" reductions, in which the types of all
4995           the arguments are the same as the type of the reduction variable.
4996           For "regular" reductions we can therefore use the same vector type
4997           (and also the same tree-code) when generating the epilog code and
4998           when generating the code inside the loop.  */
4999
5000   if (orig_stmt)
5001     {
5002       /* This is a reduction pattern: get the vectype from the type of the
5003          reduction variable, and get the tree-code from orig_stmt.  */
5004       orig_code = gimple_assign_rhs_code (orig_stmt);
5005       gcc_assert (vectype_out);
5006       vec_mode = TYPE_MODE (vectype_out);
5007     }
5008   else
5009     {
5010       /* Regular reduction: the same vectype and tree-code as used for
5011          the vector code inside the loop can be used for the epilog code. */
5012       orig_code = code;
5013     }
5014
       /* In a nested cycle, look at the value that enters the inner-loop PHI
          from its preheader: if it is defined by a PHI of the outer loop that
          was classified as vect_double_reduction_def, this is a double
          reduction.  */
5015   if (nested_cycle)
5016     {
5017       def_bb = gimple_bb (reduc_def_stmt);
5018       def_stmt_loop = def_bb->loop_father;
5019       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5020                                        loop_preheader_edge (def_stmt_loop));
5021       if (TREE_CODE (def_arg) == SSA_NAME
5022           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5023           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5024           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5025           && vinfo_for_stmt (def_arg_stmt)
5026           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5027               == vect_double_reduction_def)
5028         double_reduc = true;
5029     }
5030
       /* EPILOG_REDUC_CODE stays (or is reset to) ERROR_MARK when no usable
          reduction operation is found for the epilog; that alone does not
          make the reduction unvectorizable.  */
5031   epilog_reduc_code = ERROR_MARK;
5032   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5033     {
5034       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5035                                          optab_default);
5036       if (!reduc_optab)
5037         {
5038           if (dump_enabled_p ())
5039             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5040                              "no optab for reduction.\n");
5041
5042           epilog_reduc_code = ERROR_MARK;
5043         }
5044
5045       if (reduc_optab
5046           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5047         {
5048           if (dump_enabled_p ())
5049             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5050                              "reduc op not supported by target.\n");
5051
5052           epilog_reduc_code = ERROR_MARK;
5053         }
5054     }
5055   else
5056     {
           /* Only a nested cycle that is not a double reduction is accepted
              without a reduction code for the scalar code.  */
5057       if (!nested_cycle || double_reduc)
5058         {
5059           if (dump_enabled_p ())
5060             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5061                              "no reduc code for scalar code.\n");
5062
5063           return false;
5064         }
5065     }
5066
5067   if (double_reduc && ncopies > 1)
5068     {
5069       if (dump_enabled_p ())
5070         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5071                          "multiple types in double reduction\n");
5072
5073       return false;
5074     }
5075
5076   /* In case of widening multiplication by a constant, we update the type
5077      of the constant to be the type of the other operand.  We check that the
5078      constant fits the type in the pattern recognition pass.  */
5079   if (code == DOT_PROD_EXPR
5080       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5081     {
5082       if (TREE_CODE (ops[0]) == INTEGER_CST)
5083         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5084       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5085         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5086       else
5087         {
5088           if (dump_enabled_p ())
5089             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5090                              "invalid types in dot-prod\n");
5091
5092           return false;
5093         }
5094     }
5095
5096   if (!vec_stmt) /* transformation not required.  */
5097     {
5098       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5099         return false;
5100       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5101       return true;
5102     }
5103
5104   /** Transform.  **/
5105
5106   if (dump_enabled_p ())
5107     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5108
5109   /* FORNOW: Multiple types are not supported for condition.  */
5110   if (code == COND_EXPR)
5111     gcc_assert (ncopies == 1);
5112
5113   /* Create the destination vector  */
5114   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5115
5116   /* In case the vectorization factor (VF) is bigger than the number
5117      of elements that we can fit in a vectype (nunits), we have to generate
5118      more than one vector stmt - i.e - we need to "unroll" the
5119      vector stmt by a factor VF/nunits.  For more details see documentation
5120      in vectorizable_operation.  */
5121
5122   /* If the reduction is used in an outer loop we need to generate
5123      VF intermediate results, like so (e.g. for ncopies=2):
5124         r0 = phi (init, r0)
5125         r1 = phi (init, r1)
5126         r0 = x0 + r0;
5127         r1 = x1 + r1;
5128     (i.e. we generate VF results in 2 registers).
5129     In this case we have a separate def-use cycle for each copy, and therefore
5130     for each copy we get the vector def for the reduction variable from the
5131     respective phi node created for this copy.
5132
5133     Otherwise (the reduction is unused in the loop nest), we can combine
5134     together intermediate results, like so (e.g. for ncopies=2):
5135         r = phi (init, r)
5136         r = x0 + r;
5137         r = x1 + r;
5138    (i.e. we generate VF/2 results in a single register).
5139    In this case for each copy we get the vector def for the reduction variable
5140    from the vectorized reduction operation generated in the previous iteration.
5141   */
5142
5143   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5144     {
5145       single_defuse_cycle = true;
5146       epilog_copies = 1;
5147     }
5148   else
5149     epilog_copies = ncopies;
5150
5151   prev_stmt_info = NULL;
5152   prev_phi_info = NULL;
5153   if (slp_node)
5154     {
5155       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5156       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5157                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5158     }
5159   else
5160     {
5161       vec_num = 1;
5162       vec_oprnds0.create (1);
5163       if (op_type == ternary_op)
5164         vec_oprnds1.create (1);
5165     }
5166
5167   phis.create (vec_num);
5168   vect_defs.create (vec_num);
       /* Without SLP, VECT_DEFS has a single slot that is overwritten with
          each newly created result below.  */
5169   if (!slp_node)
5170     vect_defs.quick_push (NULL_TREE);
5171
5172   for (j = 0; j < ncopies; j++)
5173     {
           /* New PHIs are needed for the first copy, and for every copy when
              each copy has its own def-use cycle.  */
5174       if (j == 0 || !single_defuse_cycle)
5175         {
5176           for (i = 0; i < vec_num; i++)
5177             {
5178               /* Create the reduction-phi that defines the reduction
5179                  operand.  */
5180               new_phi = create_phi_node (vec_dest, loop->header);
5181               set_vinfo_for_stmt (new_phi,
5182                                   new_stmt_vec_info (new_phi, loop_vinfo,
5183                                                      NULL));
                    /* Without SLP only the PHI of the first copy is recorded
                       in PHIS; the PHIs of later copies are linked to it
                       through STMT_VINFO_RELATED_STMT below.  */
5184                if (j == 0 || slp_node)
5185                  phis.quick_push (new_phi);
5186             }
5187         }
5188
5189       if (code == COND_EXPR)
5190         {
5191           gcc_assert (!slp_node);
5192           vectorizable_condition (stmt, gsi, vec_stmt,
5193                                   PHI_RESULT (phis[0]),
5194                                   reduc_index, NULL);
5195           /* Multiple types are not supported for condition.  */
5196           break;
5197         }
5198
5199       /* Handle uses.  */
5200       if (j == 0)
5201         {
               /* !REDUC_INDEX is 1 when the reduction operand is ops[0] and 0
                  otherwise, so it selects the first non-reduction operand.
                  For ternary operations OP1 is the remaining one.  */
5202           op0 = ops[!reduc_index];
5203           if (op_type == ternary_op)
5204             {
5205               if (reduc_index == 0)
5206                 op1 = ops[2];
5207               else
5208                 op1 = ops[1];
5209             }
5210
5211           if (slp_node)
5212             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5213                                slp_node, -1);
5214           else
5215             {
5216               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5217                                                             stmt, NULL);
5218               vec_oprnds0.quick_push (loop_vec_def0);
5219               if (op_type == ternary_op)
5220                {
5221                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5222                                                                NULL);
5223                  vec_oprnds1.quick_push (loop_vec_def1);
5224                }
5225             }
5226         }
5227       else
5228         {
5229           if (!slp_node)
5230             {
                   /* Note: DT and DUMMY shadow the function-level variables
                      of the same names.  */
5231               enum vect_def_type dt;
5232               gimple dummy_stmt;
5233               tree dummy;
5234
                   /* Get the vector defs of the non-reduction operands for
                      this copy from the defs used for the previous copy.  */
5235               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5236                                   &dummy_stmt, &dummy, &dt);
5237               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5238                                                               loop_vec_def0);
5239               vec_oprnds0[0] = loop_vec_def0;
5240               if (op_type == ternary_op)
5241                 {
5242                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5243                                       &dummy, &dt);
5244                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5245                                                                 loop_vec_def1);
5246                   vec_oprnds1[0] = loop_vec_def1;
5247                 }
5248             }
5249
               /* With a single def-use cycle the reduction operand of this
                  copy is the result of the previous copy.  */
5250           if (single_defuse_cycle)
5251             reduc_def = gimple_assign_lhs (new_stmt);
5252
               /* Link the PHI recorded for the previous copy to the current
                  one.  */
5253           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5254         }
5255
           /* Build one vector stmt per vector operand, placing REDUC_DEF at
              position REDUC_INDEX among the operands.  */
5256       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5257         {
5258           if (slp_node)
5259             reduc_def = PHI_RESULT (phis[i]);
5260           else
5261             {
5262               if (!single_defuse_cycle || j == 0)
5263                 reduc_def = PHI_RESULT (new_phi);
5264             }
5265
5266           def1 = ((op_type == ternary_op)
5267                   ? vec_oprnds1[i] : NULL);
5268           if (op_type == binary_op)
5269             {
5270               if (reduc_index == 0)
5271                 expr = build2 (code, vectype_out, reduc_def, def0);
5272               else
5273                 expr = build2 (code, vectype_out, def0, reduc_def);
5274             }
5275           else
5276             {
5277               if (reduc_index == 0)
5278                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5279               else
5280                 {
5281                   if (reduc_index == 1)
5282                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5283                   else
5284                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5285                 }
5286             }
5287
5288           new_stmt = gimple_build_assign (vec_dest, expr);
5289           new_temp = make_ssa_name (vec_dest, new_stmt);
5290           gimple_assign_set_lhs (new_stmt, new_temp);
5291           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5292
5293           if (slp_node)
5294             {
5295               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5296               vect_defs.quick_push (new_temp);
5297             }
5298           else
5299             vect_defs[0] = new_temp;
5300         }
5301
           /* The chaining of copies below is not done for SLP.  */
5302       if (slp_node)
5303         continue;
5304
           /* The first copy is recorded as the vectorized stmt of STMT_INFO and
              returned in *VEC_STMT; later copies are chained to their
              predecessor through STMT_VINFO_RELATED_STMT.  */
5305       if (j == 0)
5306         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5307       else
5308         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5309
5310       prev_stmt_info = vinfo_for_stmt (new_stmt);
5311       prev_phi_info = vinfo_for_stmt (new_phi);
5312     }
5313
5314   /* Finalize the reduction-phi (set its arguments) and create the
5315      epilog reduction code.  */
       /* With separate def-use cycles (or for COND_EXPR) the epilog is handed
          the result of the stmt stored in *VEC_STMT rather than the last
          result created above.  */
5316   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5317     {
5318       new_temp = gimple_assign_lhs (*vec_stmt);
5319       vect_defs[0] = new_temp;
5320     }
5321
5322   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5323                                     epilog_reduc_code, phis, reduc_index,
5324                                     double_reduc, slp_node);
5325
5326   return true;
5327 }
5328
5329 /* Function vect_min_worthwhile_factor.
5330
5331    CODE is the tree code of an operation in a loop that could be
5332    vectorized using generic vectors.  Return the smallest vectorization
5333    factor for which doing so is worthwhile; INT_MAX is returned for codes
5334    for which it is never worthwhile.  */
5335 int
5336 vect_min_worthwhile_factor (enum tree_code code)
5337 {
5338   int min_vf;
5339
5340   if (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
5341     /* Additive codes need a vectorization factor of at least four.  */
5342     min_vf = 4;
5343   else if (code == BIT_AND_EXPR || code == BIT_IOR_EXPR
5344            || code == BIT_XOR_EXPR || code == BIT_NOT_EXPR)
5345     /* Bitwise codes need a vectorization factor of at least two.  */
5346     min_vf = 2;
5347   else
5348     /* No vectorization factor is large enough for any other
5349        code.  */
5350     min_vf = INT_MAX;
5351
5352   return min_vf;
5353 }
5354
5355
5356 /* Function vectorizable_induction
5357
5358    Check if PHI performs an induction computation that can be vectorized.
5359    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5360    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5361    Return FALSE if not a vectorizable STMT, TRUE otherwise.
     
        When VEC_STMT is NULL only the analysis is done: PHI is marked with
        induc_vec_info_type and its cost is modelled.  GSI is unused.
     
        For a PHI nested in the loop being vectorized, only a single vector
        copy is supported, and a use of the latch value outside the inner loop
        must be relevant and not live.  Debug stmts are ignored when looking
        for that use, so that the decision does not depend on the presence of
        debug stmts.  */
5362
5363 bool
5364 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5365                         gimple *vec_stmt)
5366 {
5367   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5368   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5369   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5370   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5371   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* Number of vector stmts needed to cover the vectorization factor.  */
5372   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5373   tree vec_def;
5374
5375   gcc_assert (ncopies >= 1);
5376   /* FORNOW. These restrictions should be relaxed.  */
5377   if (nested_in_vect_loop_p (loop, phi))
5378     {
5379       imm_use_iterator imm_iter;
5380       use_operand_p use_p;
5381       gimple exit_phi;
5382       edge latch_e;
5383       tree loop_arg;
5384
5385       if (ncopies > 1)
5386         {
5387           if (dump_enabled_p ())
5388             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5389                              "multiple types in nested loop.\n");
5390           return false;
5391         }
5392
           /* Look for a use, outside the inner loop, of the value that PHI
              receives over the inner-loop latch edge.  */
5393       exit_phi = NULL;
5394       latch_e = loop_latch_edge (loop->inner);
5395       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5396       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5397         {
               gimple use_stmt = USE_STMT (use_p);
     
               /* A debug stmt is not a real use.  Picking one here would make
                  the result depend on whether debug stmts are present, so
                  skip them, as is done when collecting exit phis in
                  vect_create_epilog_for_reduction.  */
               if (is_gimple_debug (use_stmt))
                 continue;
     
5398           if (!flow_bb_inside_loop_p (loop->inner,
5399                                       gimple_bb (use_stmt)))
5400             {
5401               exit_phi = use_stmt;
5402               break;
5403             }
5404         }
5405       if (exit_phi)
5406         {
               /* The use found must be relevant and must not be live.  */
5407           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5408           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5409                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5410             {
5411               if (dump_enabled_p ())
5412                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5413                                  "inner-loop induction only used outside "
5414                                  "of the outer vectorized loop.\n");
5415               return false;
5416             }
5417         }
5418     }
5419
5420   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5421     return false;
5422
5423   /* FORNOW: SLP not supported.  */
5424   if (STMT_SLP_TYPE (stmt_info))
5425     return false;
5426
5427   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5428
5429   if (gimple_code (phi) != GIMPLE_PHI)
5430     return false;
5431
5432   if (!vec_stmt) /* transformation not required.  */
5433     {
5434       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5435       if (dump_enabled_p ())
5436         dump_printf_loc (MSG_NOTE, vect_location,
5437                          "=== vectorizable_induction ===\n");
5438       vect_model_induction_cost (stmt_info, ncopies);
5439       return true;
5440     }
5441
5442   /** Transform.  **/
5443
5444   if (dump_enabled_p ())
5445     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5446
       /* The stmt defining the initial vector def is returned as the
          vectorized stmt.  */
5447   vec_def = get_initial_def_for_induction (phi);
5448   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5449   return true;
5450 }
5451
5452 /* Function vectorizable_live_operation.
5453
5454    STMT computes a value that is used outside the loop.  Check if
5455    it can be supported.  */
5456
5457 bool
5458 vectorizable_live_operation (gimple stmt,
5459                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5460                              gimple *vec_stmt)
5461 {
5462   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5463   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5464   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5465   int i;
5466   int op_type;
5467   tree op;
5468   tree def;
5469   gimple def_stmt;
5470   enum vect_def_type dt;
5471   enum tree_code code;
5472   enum gimple_rhs_class rhs_class;
5473
5474   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5475
5476   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5477     return false;
5478
5479   if (!is_gimple_assign (stmt))
5480     {
5481       if (gimple_call_internal_p (stmt)
5482           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5483           && gimple_call_lhs (stmt)
5484           && loop->simduid
5485           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5486           && loop->simduid
5487              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5488         {
5489           edge e = single_exit (loop);
5490           basic_block merge_bb = e->dest;
5491           imm_use_iterator imm_iter;
5492           use_operand_p use_p;
5493           tree lhs = gimple_call_lhs (stmt);
5494
5495           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5496             {
5497               gimple use_stmt = USE_STMT (use_p);
5498               if (gimple_code (use_stmt) == GIMPLE_PHI
5499                   || gimple_bb (use_stmt) == merge_bb)
5500                 {
5501                   if (vec_stmt)
5502                     {
5503                       tree vfm1
5504                         = build_int_cst (unsigned_type_node,
5505                                          loop_vinfo->vectorization_factor - 1);
5506                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5507                     }
5508                   return true;
5509                 }
5510             }
5511         }
5512
5513       return false;
5514     }
5515
5516   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5517     return false;
5518
5519   /* FORNOW. CHECKME. */
5520   if (nested_in_vect_loop_p (loop, stmt))
5521     return false;
5522
5523   code = gimple_assign_rhs_code (stmt);
5524   op_type = TREE_CODE_LENGTH (code);
5525   rhs_class = get_gimple_rhs_class (code);
5526   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5527   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5528
5529   /* FORNOW: support only if all uses are invariant.  This means
5530      that the scalar operations can remain in place, unvectorized.
5531      The original last scalar value that they compute will be used.  */
5532
5533   for (i = 0; i < op_type; i++)
5534     {
5535       if (rhs_class == GIMPLE_SINGLE_RHS)
5536         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5537       else
5538         op = gimple_op (stmt, i + 1);
5539       if (op
5540           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5541                                   &dt))
5542         {
5543           if (dump_enabled_p ())
5544             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5545                              "use not simple.\n");
5546           return false;
5547         }
5548
5549       if (dt != vect_external_def && dt != vect_constant_def)
5550         return false;
5551     }
5552
5553   /* No transformation is required for the cases we currently support.  */
5554   return true;
5555 }
5556
5557 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5558
5559 static void
5560 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5561 {
5562   ssa_op_iter op_iter;
5563   imm_use_iterator imm_iter;
5564   def_operand_p def_p;
5565   gimple ustmt;
5566
5567   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5568     {
5569       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5570         {
5571           basic_block bb;
5572
5573           if (!is_gimple_debug (ustmt))
5574             continue;
5575
5576           bb = gimple_bb (ustmt);
5577
5578           if (!flow_bb_inside_loop_p (loop, bb))
5579             {
5580               if (gimple_debug_bind_p (ustmt))
5581                 {
5582                   if (dump_enabled_p ())
5583                     dump_printf_loc (MSG_NOTE, vect_location,
5584                                      "killing debug use\n");
5585
5586                   gimple_debug_bind_reset_value (ustmt);
5587                   update_stmt (ustmt);
5588                 }
5589               else
5590                 gcc_unreachable ();
5591             }
5592         }
5593     }
5594 }
5595
5596
5597 /* This function builds ni_name = number of iterations.  Statements
5598    are emitted on the loop preheader edge.  */
5599
5600 static tree
5601 vect_build_loop_niters (loop_vec_info loop_vinfo)
5602 {
5603   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5604   if (TREE_CODE (ni) == INTEGER_CST)
5605     return ni;
5606   else
5607     {
5608       tree ni_name, var;
5609       gimple_seq stmts = NULL;
5610       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5611
5612       var = create_tmp_var (TREE_TYPE (ni), "niters");
5613       ni_name = force_gimple_operand (ni, &stmts, false, var);
5614       if (stmts)
5615         gsi_insert_seq_on_edge_immediate (pe, stmts);
5616
5617       return ni_name;
5618     }
5619 }
5620
5621
5622 /* This function generates the following statements:
5623
5624    ni_name = number of iterations loop executes
5625    ratio = ni_name / vf
5626    ratio_mult_vf_name = ratio * vf
5627
5628    and places them on the loop preheader edge.  */
5629
5630 static void
5631 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5632                                  tree ni_name,
5633                                  tree *ratio_mult_vf_name_ptr,
5634                                  tree *ratio_name_ptr)
5635 {
5636   tree ni_minus_gap_name;
5637   tree var;
5638   tree ratio_name;
5639   tree ratio_mult_vf_name;
5640   tree ni = LOOP_VINFO_NITERS (loop_vinfo);
5641   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5642   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5643   tree log_vf;
5644
5645   log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
5646
5647   /* If epilogue loop is required because of data accesses with gaps, we
5648      subtract one iteration from the total number of iterations here for
5649      correct calculation of RATIO.  */
5650   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5651     {
5652       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5653                                        ni_name,
5654                                        build_one_cst (TREE_TYPE (ni_name)));
5655       if (!is_gimple_val (ni_minus_gap_name))
5656         {
5657           var = create_tmp_var (TREE_TYPE (ni), "ni_gap");
5658           gimple stmts = NULL;
5659           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5660                                                     true, var);
5661           gsi_insert_seq_on_edge_immediate (pe, stmts);
5662         }
5663     }
5664   else
5665     ni_minus_gap_name = ni_name;
5666
5667   /* Create: ratio = ni >> log2(vf) */
5668
5669   ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_minus_gap_name),
5670                             ni_minus_gap_name, log_vf);
5671   if (!is_gimple_val (ratio_name))
5672     {
5673       var = create_tmp_var (TREE_TYPE (ni), "bnd");
5674       gimple stmts = NULL;
5675       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5676       gsi_insert_seq_on_edge_immediate (pe, stmts);
5677     }
5678   *ratio_name_ptr = ratio_name;
5679
5680   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5681
5682   if (ratio_mult_vf_name_ptr)
5683     {
5684       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5685                                         ratio_name, log_vf);
5686       if (!is_gimple_val (ratio_mult_vf_name))
5687         {
5688           var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
5689           gimple stmts = NULL;
5690           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5691                                                      true, var);
5692           gsi_insert_seq_on_edge_immediate (pe, stmts);
5693         }
5694       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5695     }
5696
5697   return;
5698 }
5699
5700
5701 /* Function vect_transform_loop.
5702
5703    The analysis phase has determined that the loop is vectorizable.
5704    Vectorize the loop - created vectorized stmts to replace the scalar
5705    stmts in the loop, and update the loop exit condition.  */
5706
5707 void
5708 vect_transform_loop (loop_vec_info loop_vinfo)
5709 {
5710   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5711   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5712   int nbbs = loop->num_nodes;
5713   gimple_stmt_iterator si;
5714   int i;
5715   tree ratio = NULL;
5716   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5717   bool grouped_store;
5718   bool slp_scheduled = false;
5719   gimple stmt, pattern_stmt;
5720   gimple_seq pattern_def_seq = NULL;
5721   gimple_stmt_iterator pattern_def_si = gsi_none ();
5722   bool transform_pattern_stmt = false;
5723   bool check_profitability = false;
5724   int th;
5725   /* Record number of iterations before we started tampering with the profile. */
5726   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5727
5728   if (dump_enabled_p ())
5729     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5730
5731   /* If profile is inprecise, we have chance to fix it up.  */
5732   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5733     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5734
5735   /* Use the more conservative vectorization threshold.  If the number
5736      of iterations is constant assume the cost check has been performed
5737      by our caller.  If the threshold makes all loops profitable that
5738      run at least the vectorization factor number of times checking
5739      is pointless, too.  */
5740   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5741          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5742   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5743   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5744       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5745     {
5746       if (dump_enabled_p ())
5747         dump_printf_loc (MSG_NOTE, vect_location,
5748                          "Profitability threshold is %d loop iterations.\n",
5749                          th);
5750       check_profitability = true;
5751     }
5752
5753   /* Version the loop first, if required, so the profitability check
5754      comes first.  */
5755
5756   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5757       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5758     {
5759       vect_loop_versioning (loop_vinfo, th, check_profitability);
5760       check_profitability = false;
5761     }
5762
5763   tree ni_name = vect_build_loop_niters (loop_vinfo);
5764   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5765
5766   /* Peel the loop if there are data refs with unknown alignment.
5767      Only one data ref with unknown store is allowed.  */
5768
5769   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5770     {
5771       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5772                                      th, check_profitability);
5773       check_profitability = false;
5774       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5775          be re-computed.  */
5776       ni_name = NULL_TREE;
5777     }
5778
5779   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5780      compile time constant), or it is a constant that doesn't divide by the
5781      vectorization factor, then an epilog loop needs to be created.
5782      We therefore duplicate the loop: the original loop will be vectorized,
5783      and will compute the first (n/VF) iterations.  The second copy of the loop
5784      will remain scalar and will compute the remaining (n%VF) iterations.
5785      (VF is the vectorization factor).  */
5786
5787   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5788       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5789     {
5790       tree ratio_mult_vf;
5791       if (!ni_name)
5792         ni_name = vect_build_loop_niters (loop_vinfo);
5793       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5794                                        &ratio);
5795       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5796                                       th, check_profitability);
5797     }
5798   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5799     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5800                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5801   else
5802     {
5803       if (!ni_name)
5804         ni_name = vect_build_loop_niters (loop_vinfo);
5805       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5806     }
5807
5808   /* 1) Make sure the loop header has exactly two entries
5809      2) Make sure we have a preheader basic block.  */
5810
5811   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5812
5813   split_edge (loop_preheader_edge (loop));
5814
5815   /* FORNOW: the vectorizer supports only loops which body consist
5816      of one basic block (header + empty latch). When the vectorizer will
5817      support more involved loop forms, the order by which the BBs are
5818      traversed need to be reconsidered.  */
5819
5820   for (i = 0; i < nbbs; i++)
5821     {
5822       basic_block bb = bbs[i];
5823       stmt_vec_info stmt_info;
5824       gimple phi;
5825
5826       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5827         {
5828           phi = gsi_stmt (si);
5829           if (dump_enabled_p ())
5830             {
5831               dump_printf_loc (MSG_NOTE, vect_location,
5832                                "------>vectorizing phi: ");
5833               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5834               dump_printf (MSG_NOTE, "\n");
5835             }
5836           stmt_info = vinfo_for_stmt (phi);
5837           if (!stmt_info)
5838             continue;
5839
5840           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5841             vect_loop_kill_debug_uses (loop, phi);
5842
5843           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5844               && !STMT_VINFO_LIVE_P (stmt_info))
5845             continue;
5846
5847           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5848                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5849               && dump_enabled_p ())
5850             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5851
5852           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5853             {
5854               if (dump_enabled_p ())
5855                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5856               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5857             }
5858         }
5859
5860       pattern_stmt = NULL;
5861       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5862         {
5863           bool is_store;
5864
5865           if (transform_pattern_stmt)
5866             stmt = pattern_stmt;
5867           else
5868             {
5869               stmt = gsi_stmt (si);
5870               /* During vectorization remove existing clobber stmts.  */
5871               if (gimple_clobber_p (stmt))
5872                 {
5873                   unlink_stmt_vdef (stmt);
5874                   gsi_remove (&si, true);
5875                   release_defs (stmt);
5876                   continue;
5877                 }
5878             }
5879
5880           if (dump_enabled_p ())
5881             {
5882               dump_printf_loc (MSG_NOTE, vect_location,
5883                                "------>vectorizing statement: ");
5884               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5885               dump_printf (MSG_NOTE, "\n");
5886             }
5887
5888           stmt_info = vinfo_for_stmt (stmt);
5889
5890           /* vector stmts created in the outer-loop during vectorization of
5891              stmts in an inner-loop may not have a stmt_info, and do not
5892              need to be vectorized.  */
5893           if (!stmt_info)
5894             {
5895               gsi_next (&si);
5896               continue;
5897             }
5898
5899           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5900             vect_loop_kill_debug_uses (loop, stmt);
5901
5902           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5903               && !STMT_VINFO_LIVE_P (stmt_info))
5904             {
5905               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5906                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5907                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5908                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5909                 {
5910                   stmt = pattern_stmt;
5911                   stmt_info = vinfo_for_stmt (stmt);
5912                 }
5913               else
5914                 {
5915                   gsi_next (&si);
5916                   continue;
5917                 }
5918             }
5919           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5920                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5921                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5922                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5923             transform_pattern_stmt = true;
5924
5925           /* If pattern statement has def stmts, vectorize them too.  */
5926           if (is_pattern_stmt_p (stmt_info))
5927             {
5928               if (pattern_def_seq == NULL)
5929                 {
5930                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5931                   pattern_def_si = gsi_start (pattern_def_seq);
5932                 }
5933               else if (!gsi_end_p (pattern_def_si))
5934                 gsi_next (&pattern_def_si);
5935               if (pattern_def_seq != NULL)
5936                 {
5937                   gimple pattern_def_stmt = NULL;
5938                   stmt_vec_info pattern_def_stmt_info = NULL;
5939
5940                   while (!gsi_end_p (pattern_def_si))
5941                     {
5942                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5943                       pattern_def_stmt_info
5944                         = vinfo_for_stmt (pattern_def_stmt);
5945                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5946                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5947                         break;
5948                       gsi_next (&pattern_def_si);
5949                     }
5950
5951                   if (!gsi_end_p (pattern_def_si))
5952                     {
5953                       if (dump_enabled_p ())
5954                         {
5955                           dump_printf_loc (MSG_NOTE, vect_location,
5956                                            "==> vectorizing pattern def "
5957                                            "stmt: ");
5958                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5959                                             pattern_def_stmt, 0);
5960                           dump_printf (MSG_NOTE, "\n");
5961                         }
5962
5963                       stmt = pattern_def_stmt;
5964                       stmt_info = pattern_def_stmt_info;
5965                     }
5966                   else
5967                     {
5968                       pattern_def_si = gsi_none ();
5969                       transform_pattern_stmt = false;
5970                     }
5971                 }
5972               else
5973                 transform_pattern_stmt = false;
5974             }
5975
5976           if (STMT_VINFO_VECTYPE (stmt_info))
5977             {
5978               unsigned int nunits
5979                 = (unsigned int)
5980                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
5981               if (!STMT_SLP_TYPE (stmt_info)
5982                   && nunits != (unsigned int) vectorization_factor
5983                   && dump_enabled_p ())
5984                   /* For SLP VF is set according to unrolling factor, and not
5985                      to vector size, hence for SLP this print is not valid.  */
5986                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5987             }
5988
5989           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5990              reached.  */
5991           if (STMT_SLP_TYPE (stmt_info))
5992             {
5993               if (!slp_scheduled)
5994                 {
5995                   slp_scheduled = true;
5996
5997                   if (dump_enabled_p ())
5998                     dump_printf_loc (MSG_NOTE, vect_location,
5999                                      "=== scheduling SLP instances ===\n");
6000
6001                   vect_schedule_slp (loop_vinfo, NULL);
6002                 }
6003
6004               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6005               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6006                 {
6007                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6008                     {
6009                       pattern_def_seq = NULL;
6010                       gsi_next (&si);
6011                     }
6012                   continue;
6013                 }
6014             }
6015
6016           /* -------- vectorize statement ------------ */
6017           if (dump_enabled_p ())
6018             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6019
6020           grouped_store = false;
6021           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6022           if (is_store)
6023             {
6024               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6025                 {
6026                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6027                      interleaving chain was completed - free all the stores in
6028                      the chain.  */
6029                   gsi_next (&si);
6030                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6031                   continue;
6032                 }
6033               else
6034                 {
6035                   /* Free the attached stmt_vec_info and remove the stmt.  */
6036                   gimple store = gsi_stmt (si);
6037                   free_stmt_vec_info (store);
6038                   unlink_stmt_vdef (store);
6039                   gsi_remove (&si, true);
6040                   release_defs (store);
6041                   continue;
6042                 }
6043             }
6044
6045           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6046             {
6047               pattern_def_seq = NULL;
6048               gsi_next (&si);
6049             }
6050         }                       /* stmts in BB */
6051     }                           /* BBs in loop */
6052
6053   slpeel_make_loop_iterate_ntimes (loop, ratio);
6054
6055   /* Reduce loop iterations by the vectorization factor.  */
6056   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6057                       expected_iterations / vectorization_factor);
6058   loop->nb_iterations_upper_bound
6059     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
6060                                             FLOOR_DIV_EXPR);
6061   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6062       && loop->nb_iterations_upper_bound != double_int_zero)
6063     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
6064   if (loop->any_estimate)
6065     {
6066       loop->nb_iterations_estimate
6067         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
6068                                              FLOOR_DIV_EXPR);
6069        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6070            && loop->nb_iterations_estimate != double_int_zero)
6071          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
6072     }
6073
6074   if (dump_enabled_p ())
6075     {
6076       dump_printf_loc (MSG_NOTE, vect_location,
6077                        "LOOP VECTORIZED\n");
6078       if (loop->inner)
6079         dump_printf_loc (MSG_NOTE, vect_location,
6080                          "OUTER LOOP VECTORIZED\n");
6081       dump_printf (MSG_NOTE, "\n");
6082     }
6083 }