gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "stor-layout.h"
  30 #include "basic-block.h"
  31 #include "gimple-pretty-print.h"
  32 #include "gimple.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "gimple-ssa.h"
  37 #include "tree-phinodes.h"
  38 #include "ssa-iterators.h"
  39 #include "stringpool.h"
  40 #include "tree-ssanames.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-pass.h"
  45 #include "cfgloop.h"
  46 #include "expr.h"
  47 #include "recog.h"
  48 #include "optabs.h"
  49 #include "params.h"
  50 #include "diagnostic-core.h"
  51 #include "tree-chrec.h"
  52 #include "tree-scalar-evolution.h"
  53 #include "tree-vectorizer.h"
  54 #include "target.h"
  55
  56 /* Loop Vectorization Pass.
  57
  58    This pass tries to vectorize loops.
  59
  60    For example, the vectorizer transforms the following simple loop:
  61
  62         short a[N]; short b[N]; short c[N]; int i;
  63
  64         for (i=0; i<N; i++){
  65           a[i] = b[i] + c[i];
  66         }
  67
  68    as if it was manually vectorized by rewriting the source code into:
  69
  70         typedef int __attribute__((mode(V8HI))) v8hi;
  71         short a[N];  short b[N]; short c[N];   int i;
  72         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  73         v8hi va, vb, vc;
  74
  75         for (i=0; i<N/8; i++){
  76           vb = pb[i];
  77           vc = pc[i];
  78           va = vb + vc;
  79           pa[i] = va;
  80         }
  81
  82         The main entry to this pass is vectorize_loops(), in which
  83    the vectorizer applies a set of analyses on a given set of loops,
  84    followed by the actual vectorization transformation for the loops that
  85    had successfully passed the analysis phase.
  86         Throughout this pass we make a distinction between two types of
  87    data: scalars (which are represented by SSA_NAMES), and memory references
  88    ("data-refs").  These two types of data require different handling both
  89    during analysis and transformation. The types of data-refs that the
   90    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  91    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  92    accesses are required to have a simple (consecutive) access pattern.
  93
  94    Analysis phase:
  95    ===============
  96         The driver for the analysis phase is vect_analyze_loop().
  97    It applies a set of analyses, some of which rely on the scalar evolution
  98    analyzer (scev) developed by Sebastian Pop.
  99
 100         During the analysis phase the vectorizer records some information
 101    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 102    loop, as well as general information about the loop as a whole, which is
 103    recorded in a "loop_vec_info" struct attached to each loop.
 104
 105    Transformation phase:
 106    =====================
 107         The loop transformation phase scans all the stmts in the loop, and
 108    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 109    the loop that needs to be vectorized.  It inserts the vector code sequence
 110    just before the scalar stmt S, and records a pointer to the vector code
 111    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 112    attached to S).  This pointer will be used for the vectorization of following
 113    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 114    otherwise, we rely on dead code elimination for removing it.
 115
 116         For example, say stmt S1 was vectorized into stmt VS1:
 117
 118    VS1: vb = px[i];
 119    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 120    S2:  a = b;
 121
 122    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 123    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 124    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 125    resulting sequence would be:
 126
 127    VS1: vb = px[i];
 128    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 129    VS2: va = vb;
 130    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 131
  132         Operands that are not SSA_NAMEs are data-refs that appear in
 133    load/store operations (like 'x[i]' in S1), and are handled differently.
 134
 135    Target modeling:
 136    =================
 137         Currently the only target specific information that is used is the
 138    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 139    Targets that can support different sizes of vectors, for now will need
 140    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 141    flexibility will be added in the future.
 142
  143         Since we only vectorize operations whose vector form can be
  144    expressed using existing tree codes, to verify that an operation is
  145    supported, the vectorizer checks the relevant optab at the relevant
  146    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 147    the value found is CODE_FOR_nothing, then there's no target support, and
 148    we can't vectorize the stmt.
 149
 150    For additional information on this project see:
 151    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 152 */
 153
 154 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 155
  156 /* Function vect_determine_vectorization_factor
  157
  158    Determine the vectorization factor (VF).  VF is the number of data elements
  159    that are operated upon in parallel in a single iteration of the vectorized
  160    loop.  For example, when vectorizing a loop that operates on 4byte elements,
  161    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
  162    elements can fit in a single vector register.
  163
  164    VF is the largest number of vector subparts among the analyzed PHIs and
  165    stmts; each gets a STMT_VINFO_VECTYPE if it had none.  Return false if no
  166    vectype is found, a stmt has no lhs or is already a vector stmt, a stmt mixes
  167    vector sizes, or VF <= 1; else set LOOP_VINFO_VECT_FACTOR and return true.
  168
  169    VF is also the factor by which the loop iterations are strip-mined, e.g.:
  170    original loop:
  171         for (i=0; i<N; i++){
  172           a[i] = b[i] + c[i];
  173         }
  174
  175    vectorized loop:
  176         for (i=0; i<N; i+=VF){
  177           a[i:VF] = b[i:VF] + c[i:VF];
  178         }
  179 */
  180
  181 static bool
  182 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  183 {
  184   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  185   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  186   int nbbs = loop->num_nodes;
  187   gimple_stmt_iterator si;
  188   unsigned int vectorization_factor = 0;
  189   tree scalar_type;
  190   gimple phi;
  191   tree vectype;
  192   unsigned int nunits;
  193   stmt_vec_info stmt_info;
  194   int i;
  195   HOST_WIDE_INT dummy;
        /* PATTERN_STMT, PATTERN_DEF_SEQ, PATTERN_DEF_SI and ANALYZE_PATTERN_STMT
           carry state from one iteration of the stmt walk below to the next.  */
  196   gimple stmt, pattern_stmt = NULL;
  197   gimple_seq pattern_def_seq = NULL;
  198   gimple_stmt_iterator pattern_def_si = gsi_none ();
  199   bool analyze_pattern_stmt = false;
  200
  201   if (dump_enabled_p ())
  202     dump_printf_loc (MSG_NOTE, vect_location,
  203                      "=== vect_determine_vectorization_factor ===\n");
  204
  205   for (i = 0; i < nbbs; i++)
  206     {
  207       basic_block bb = bbs[i];
  208
            /* Give each relevant PHI a vectype and account for it in VF.  */
  209       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
  210         {
  211           phi = gsi_stmt (si);
  212           stmt_info = vinfo_for_stmt (phi);
  213           if (dump_enabled_p ())
  214             {
  215               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
  216               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
  217               dump_printf (MSG_NOTE, "\n");
  218             }
  219
  220           gcc_assert (stmt_info);
  221
  222           if (STMT_VINFO_RELEVANT_P (stmt_info))
  223             {
  224               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  225               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  226
  227               if (dump_enabled_p ())
  228                 {
  229                   dump_printf_loc (MSG_NOTE, vect_location,
  230                                    "get vectype for scalar type:  ");
  231                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  232                   dump_printf (MSG_NOTE, "\n");
  233                 }
  234
  235               vectype = get_vectype_for_scalar_type (scalar_type);
  236               if (!vectype)
  237                 {
  238                   if (dump_enabled_p ())
  239                     {
  240                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  241                                        "not vectorized: unsupported "
  242                                        "data-type ");
  243                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  244                                          scalar_type);
  245                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  246                     }
  247                   return false;
  248                 }
  249               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  250
  251               if (dump_enabled_p ())
  252                 {
  253                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  254                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  255                   dump_printf (MSG_NOTE, "\n");
  256                 }
  257
  258               nunits = TYPE_VECTOR_SUBPARTS (vectype);
  259               if (dump_enabled_p ())
  260                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
  261                                  nunits);
  262
  263               if (!vectorization_factor
  264                   || (nunits > vectorization_factor))
  265                 vectorization_factor = nunits;
  266             }
  267         }
  268
            /* Walk the stmts of BB.  SI is advanced by hand: a stmt may be
               followed by passes over its pattern def stmts and its pattern
               stmt before the walk moves on.  */
  269       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
  270         {
  271           tree vf_vectype;
  272
  273           if (analyze_pattern_stmt)
  274             stmt = pattern_stmt;
  275           else
  276             stmt = gsi_stmt (si);
  277
  278           stmt_info = vinfo_for_stmt (stmt);
  279
  280           if (dump_enabled_p ())
  281             {
  282               dump_printf_loc (MSG_NOTE, vect_location,
  283                                "==> examining statement: ");
  284               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  285               dump_printf (MSG_NOTE, "\n");
  286             }
  287
  288           gcc_assert (stmt_info);
  289
  290           /* Skip stmts which do not need to be vectorized.  */
  291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
  292                && !STMT_VINFO_LIVE_P (stmt_info))
  293               || gimple_clobber_p (stmt))
  294             {
                    /* STMT itself is skipped; a relevant or live pattern stmt
                       related to it is analyzed in its place.  */
  295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  299                 {
  300                   stmt = pattern_stmt;
  301                   stmt_info = vinfo_for_stmt (pattern_stmt);
  302                   if (dump_enabled_p ())
  303                     {
  304                       dump_printf_loc (MSG_NOTE, vect_location,
  305                                        "==> examining pattern statement: ");
  306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  307                       dump_printf (MSG_NOTE, "\n");
  308                     }
  309                 }
  310               else
  311                 {
  312                   if (dump_enabled_p ())
  313                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
  314                   gsi_next (&si);
  315                   continue;
  316                 }
  317             }
  318           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  319                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  320                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  321                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                  /* STMT is analyzed now; its pattern stmt is picked up in a
                     following iteration, before SI is advanced.  */
  322             analyze_pattern_stmt = true;
  323
  324           /* If a pattern statement has def stmts, analyze them too.  */
  325           if (is_pattern_stmt_p (stmt_info))
  326             {
  327               if (pattern_def_seq == NULL)
  328                 {
  329                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  330                   pattern_def_si = gsi_start (pattern_def_seq);
  331                 }
  332               else if (!gsi_end_p (pattern_def_si))
  333                 gsi_next (&pattern_def_si);
  334               if (pattern_def_seq != NULL)
  335                 {
  336                   gimple pattern_def_stmt = NULL;
  337                   stmt_vec_info pattern_def_stmt_info = NULL;
  338
  339                   while (!gsi_end_p (pattern_def_si))
  340                     {
  341                       pattern_def_stmt = gsi_stmt (pattern_def_si);
  342                       pattern_def_stmt_info
  343                         = vinfo_for_stmt (pattern_def_stmt);
  344                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
  345                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
  346                         break;
  347                       gsi_next (&pattern_def_si);
  348                     }
  349
  350                   if (!gsi_end_p (pattern_def_si))
  351                     {
  352                       if (dump_enabled_p ())
  353                         {
  354                           dump_printf_loc (MSG_NOTE, vect_location,
  355                                            "==> examining pattern def stmt: ");
  356                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
  357                                             pattern_def_stmt, 0);
  358                           dump_printf (MSG_NOTE, "\n");
  359                         }
  360
  361                       stmt = pattern_def_stmt;
  362                       stmt_info = pattern_def_stmt_info;
  363                     }
  364                   else
  365                     {
  366                       pattern_def_si = gsi_none ();
  367                       analyze_pattern_stmt = false;
  368                     }
  369                 }
  370               else
  371                 analyze_pattern_stmt = false;
  372             }
  373
                /* Stmts without an lhs are not handled.  */
  374           if (gimple_get_lhs (stmt) == NULL_TREE)
  375             {
  376               if (dump_enabled_p ())
  377                 {
  378                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  379                                    "not vectorized: irregular stmt.");
  380                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  381                                     0);
  382                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  383                 }
  384               return false;
  385             }
  386
                /* Stmts whose type already has a vector mode are rejected.  */
  387           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
  388             {
  389               if (dump_enabled_p ())
  390                 {
  391                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  392                                    "not vectorized: vector stmt in loop:");
  393                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
  394                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  395                 }
  396               return false;
  397             }
  398
  399           if (STMT_VINFO_VECTYPE (stmt_info))
  400             {
  401               /* The only case when a vectype had been already set is for stmts
  402                  that contain a dataref, or for "pattern-stmts" (stmts
  403                  generated by the vectorizer to represent/replace a certain
  404                  idiom).  */
  405               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
  406                           || is_pattern_stmt_p (stmt_info)
  407                           || !gsi_end_p (pattern_def_si));
  408               vectype = STMT_VINFO_VECTYPE (stmt_info);
  409             }
  410           else
  411             {
  412               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
  413               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
  414               if (dump_enabled_p ())
  415                 {
  416                   dump_printf_loc (MSG_NOTE, vect_location,
  417                                    "get vectype for scalar type:  ");
  418                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  419                   dump_printf (MSG_NOTE, "\n");
  420                 }
  421               vectype = get_vectype_for_scalar_type (scalar_type);
  422               if (!vectype)
  423                 {
  424                   if (dump_enabled_p ())
  425                     {
  426                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  427                                        "not vectorized: unsupported "
  428                                        "data-type ");
  429                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  430                                          scalar_type);
  431                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  432                     }
  433                   return false;
  434                 }
  435
  436               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  437
  438               if (dump_enabled_p ())
  439                 {
  440                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  441                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  442                   dump_printf (MSG_NOTE, "\n");
  443                 }
  444             }
  445
  446           /* The vectorization factor is according to the smallest
  447              scalar type (or the largest vector size, but we only
  448              support one vector size per loop).  */
  449           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
  450                                                        &dummy);
  451           if (dump_enabled_p ())
  452             {
  453               dump_printf_loc (MSG_NOTE, vect_location,
  454                                "get vectype for scalar type:  ");
  455               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  456               dump_printf (MSG_NOTE, "\n");
  457             }
  458           vf_vectype = get_vectype_for_scalar_type (scalar_type);
  459           if (!vf_vectype)
  460             {
  461               if (dump_enabled_p ())
  462                 {
  463                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  464                                    "not vectorized: unsupported data-type ");
  465                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  466                                      scalar_type);
  467                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  468                 }
  469               return false;
  470             }
  471
                /* The vectype of the stmt and the vectype that determines VF
                   must have the same mode size.  */
  472           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
  473                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
  474             {
  475               if (dump_enabled_p ())
  476                 {
  477                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  478                                    "not vectorized: different sized vector "
  479                                    "types in statement, ");
  480                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  481                                      vectype);
  482                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  483                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  484                                      vf_vectype);
  485                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  486                 }
  487               return false;
  488             }
  489
  490           if (dump_enabled_p ())
  491             {
  492               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  493               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
  494               dump_printf (MSG_NOTE, "\n");
  495             }
  496
  497           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
  498           if (dump_enabled_p ())
  499             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
  500           if (!vectorization_factor
  501               || (nunits > vectorization_factor))
  502             vectorization_factor = nunits;
  503
                /* Advance to the next stmt only when no pattern stmt and no
                   pattern def stmt is still pending for the current one.  */
  504           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  505             {
  506               pattern_def_seq = NULL;
  507               gsi_next (&si);
  508             }
  509         }
  510     }
  511
  512   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  513   if (dump_enabled_p ())
  514     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
  515                      vectorization_factor);
        /* VF is still 0 here if no PHI or stmt contributed to it.  */
  516   if (vectorization_factor <= 1)
  517     {
  518       if (dump_enabled_p ())
  519         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  520                          "not vectorized: unsupported data-type\n");
  521       return false;
  522     }
  523   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  524
  525   return true;
  526 }
 527
 528
 529 /* Function vect_is_simple_iv_evolution.
 530
 531    FORNOW: A simple evolution of an induction variables in the loop is
 532    considered a polynomial evolution.  */
 533
 534 static bool
 535 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 536                              tree * step)
 537 {
 538   tree init_expr;
 539   tree step_expr;
 540   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 541   basic_block bb;
 542
 543   /* When there is no evolution in this loop, the evolution function
 544      is not "simple".  */
 545   if (evolution_part == NULL_TREE)
 546     return false;
 547
 548   /* When the evolution is a polynomial of degree >= 2
 549      the evolution function is not "simple".  */
 550   if (tree_is_chrec (evolution_part))
 551     return false;
 552
 553   step_expr = evolution_part;
 554   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 555
 556   if (dump_enabled_p ())
 557     {
 558       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 559       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 560       dump_printf (MSG_NOTE, ",  init: ");
 561       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 562       dump_printf (MSG_NOTE, "\n");
 563     }
 564
 565   *init = init_expr;
 566   *step = step_expr;
 567
 568   if (TREE_CODE (step_expr) != INTEGER_CST
 569       && (TREE_CODE (step_expr) != SSA_NAME
 570           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 571               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 572           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 573               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 574                   || !flag_associative_math)))
 575       && (TREE_CODE (step_expr) != REAL_CST
 576           || !flag_associative_math))
 577     {
 578       if (dump_enabled_p ())
 579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 580                          "step unknown.\n");
 581       return false;
 582     }
 583
 584   return true;
 585 }
 586
 587 /* Function vect_analyze_scalar_cycles_1.
 588
 589    Examine the cross iteration def-use cycles of scalar variables
 590    in LOOP.  LOOP_VINFO represents the loop that is now being
 591    considered for vectorization (can be LOOP, or an outer-loop
 592    enclosing LOOP).  */
 593
 594 static void
 595 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 596 {
 597   basic_block bb = loop->header;
 598   tree init, step;
 599   stack_vec<gimple, 64> worklist;
 600   gimple_stmt_iterator gsi;
 601   bool double_reduc;
 602
 603   if (dump_enabled_p ())
 604     dump_printf_loc (MSG_NOTE, vect_location,
 605                      "=== vect_analyze_scalar_cycles ===\n");
 606
 607   /* First - identify all inductions.  Reduction detection assumes that all the
 608      inductions have been identified, therefore, this order must not be
 609      changed.  */
 610   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 611     {
 612       gimple phi = gsi_stmt (gsi);
 613       tree access_fn = NULL;
 614       tree def = PHI_RESULT (phi);
 615       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 616
 617       if (dump_enabled_p ())
 618         {
 619           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 620           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 621           dump_printf (MSG_NOTE, "\n");
 622         }
 623
 624       /* Skip virtual phi's.  The data dependences that are associated with
 625          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 626       if (virtual_operand_p (def))
 627         continue;
 628
 629       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 630
 631       /* Analyze the evolution function.  */
 632       access_fn = analyze_scalar_evolution (loop, def);
 633       if (access_fn)
 634         {
 635           STRIP_NOPS (access_fn);
 636           if (dump_enabled_p ())
 637             {
 638               dump_printf_loc (MSG_NOTE, vect_location,
 639                                "Access function of PHI: ");
 640               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 641               dump_printf (MSG_NOTE, "\n");
 642             }
 643           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 644             = evolution_part_in_loop_num (access_fn, loop->num);
 645         }
 646
 647       if (!access_fn
 648           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 649           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 650               && TREE_CODE (step) != INTEGER_CST))
 651         {
 652           worklist.safe_push (phi);
 653           continue;
 654         }
 655
 656       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 657
 658       if (dump_enabled_p ())
 659         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 660       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 661     }
 662
 663
 664   /* Second - identify all reductions and nested cycles.  */
 665   while (worklist.length () > 0)
 666     {
 667       gimple phi = worklist.pop ();
 668       tree def = PHI_RESULT (phi);
 669       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 670       gimple reduc_stmt;
 671       bool nested_cycle;
 672
 673       if (dump_enabled_p ())
 674         {
 675           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 676           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 677           dump_printf (MSG_NOTE, "\n");
 678         }
 679
 680       gcc_assert (!virtual_operand_p (def)
 681                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 682
 683       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 684       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 685                                                 &double_reduc);
 686       if (reduc_stmt)
 687         {
 688           if (double_reduc)
 689             {
 690               if (dump_enabled_p ())
 691                 dump_printf_loc (MSG_NOTE, vect_location,
 692                                  "Detected double reduction.\n");
 693
 694               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 695               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 696                                                     vect_double_reduction_def;
 697             }
 698           else
 699             {
 700               if (nested_cycle)
 701                 {
 702                   if (dump_enabled_p ())
 703                     dump_printf_loc (MSG_NOTE, vect_location,
 704                                      "Detected vectorizable nested cycle.\n");
 705
 706                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 707                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 708                                                              vect_nested_cycle;
 709                 }
 710               else
 711                 {
 712                   if (dump_enabled_p ())
 713                     dump_printf_loc (MSG_NOTE, vect_location,
 714                                      "Detected reduction.\n");
 715
 716                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 717                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 718                                                            vect_reduction_def;
 719                   /* Store the reduction cycles for possible vectorization in
 720                      loop-aware SLP.  */
 721                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 722                 }
 723             }
 724         }
 725       else
 726         if (dump_enabled_p ())
 727           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 728                            "Unknown def-use cycle pattern.\n");
 729     }
 730 }
 731
 732
 733 /* Function vect_analyze_scalar_cycles.
 734
 735    Examine the cross iteration def-use cycles of scalar variables, by
 736    analyzing the loop-header PHIs of scalar variables.  Classify each
 737    cycle as one of the following: invariant, induction, reduction, unknown.
 738    We do that for the loop represented by LOOP_VINFO, and also to its
 739    inner-loop, if exists.
 740    Examples for scalar cycles:
 741
 742    Example1: reduction:
 743
 744               loop1:
 745               for (i=0; i<N; i++)
 746                  sum += a[i];
 747
 748    Example2: induction:
 749
 750               loop2:
 751               for (i=0; i<N; i++)
 752                  a[i] = i;  */
 753
 754 static void
 755 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 756 {
 757   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 758
 759   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 760
 761   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 762      Reductions in such inner-loop therefore have different properties than
 763      the reductions in the nest that gets vectorized:
 764      1. When vectorized, they are executed in the same order as in the original
 765         scalar loop, so we can't change the order of computation when
 766         vectorizing them.
 767      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 768         current checks are too strict.  */
 769
 770   if (loop->inner)
 771     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 772 }
 773
 774
 775 /* Function vect_get_loop_niters.
 776
 777    Determine how many iterations the loop is executed and place it
 778    in NUMBER_OF_ITERATIONS.
 779
 780    Return the loop exit condition.  */
 781
 782 static gimple
 783 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 784 {
 785   tree niters;
 786
 787   if (dump_enabled_p ())
 788     dump_printf_loc (MSG_NOTE, vect_location,
 789                      "=== get_loop_niters ===\n");
 790
 791   niters = number_of_latch_executions (loop);
 792   /* We want the number of loop header executions which is the number
 793      of latch executions plus one.
 794      ???  For UINT_MAX latch executions this number overflows to zero
 795      for loops like do { n++; } while (n != 0);  */
 796   if (niters && !chrec_contains_undetermined (niters))
 797     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), niters,
 798                           build_int_cst (TREE_TYPE (niters), 1));
 799   *number_of_iterations = niters;
 800
 801   return get_loop_exit_condition (loop);
 802 }
 803
 804
 805 /* Function bb_in_loop_p
 806
 807    Used as predicate for dfs order traversal of the loop bbs.  */
 808
 809 static bool
 810 bb_in_loop_p (const_basic_block bb, const void *data)
 811 {
 812   const struct loop *const loop = (const struct loop *)data;
 813   if (flow_bb_inside_loop_p (loop, bb))
 814     return true;
 815   return false;
 816 }
 817
 818
 819 /* Function new_loop_vec_info.
 820
 821    Create and initialize a new loop_vec_info struct for LOOP, as well as
 822    stmt_vec_info structs for all the stmts in LOOP.  */
 823
 824 static loop_vec_info
 825 new_loop_vec_info (struct loop *loop)
 826 {
 827   loop_vec_info res;
 828   basic_block *bbs;
 829   gimple_stmt_iterator si;
 830   unsigned int i, nbbs;
 831
 832   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 833   LOOP_VINFO_LOOP (res) = loop;
 834
 835   bbs = get_loop_body (loop);
 836
 837   /* Create/Update stmt_info for all stmts in the loop.  */
 838   for (i = 0; i < loop->num_nodes; i++)
 839     {
 840       basic_block bb = bbs[i];
 841
 842       /* BBs in a nested inner-loop will have been already processed (because
 843          we will have called vect_analyze_loop_form for any nested inner-loop).
 844          Therefore, for stmts in an inner-loop we just want to update the
 845          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 846          loop_info of the outer-loop we are currently considering to vectorize
 847          (instead of the loop_info of the inner-loop).
 848          For stmts in other BBs we need to create a stmt_info from scratch.  */
 849       if (bb->loop_father != loop)
 850         {
 851           /* Inner-loop bb.  */
 852           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 853           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 854             {
 855               gimple phi = gsi_stmt (si);
 856               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 857               loop_vec_info inner_loop_vinfo =
 858                 STMT_VINFO_LOOP_VINFO (stmt_info);
 859               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 860               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 861             }
 862           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 863            {
 864               gimple stmt = gsi_stmt (si);
 865               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 866               loop_vec_info inner_loop_vinfo =
 867                  STMT_VINFO_LOOP_VINFO (stmt_info);
 868               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 869               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 870            }
 871         }
 872       else
 873         {
 874           /* bb in current nest.  */
 875           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 876             {
 877               gimple phi = gsi_stmt (si);
 878               gimple_set_uid (phi, 0);
 879               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 880             }
 881
 882           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 883             {
 884               gimple stmt = gsi_stmt (si);
 885               gimple_set_uid (stmt, 0);
 886               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 887             }
 888         }
 889     }
 890
 891   /* CHECKME: We want to visit all BBs before their successors (except for
 892      latch blocks, for which this assertion wouldn't hold).  In the simple
 893      case of the loop forms we allow, a dfs order of the BBs would the same
 894      as reversed postorder traversal, so we are safe.  */
 895
 896    free (bbs);
 897    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 898    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 899                               bbs, loop->num_nodes, loop);
 900    gcc_assert (nbbs == loop->num_nodes);
 901
 902   LOOP_VINFO_BBS (res) = bbs;
 903   LOOP_VINFO_NITERS (res) = NULL;
 904   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 905   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 906   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 907   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 908   LOOP_VINFO_VECT_FACTOR (res) = 0;
 909   LOOP_VINFO_LOOP_NEST (res).create (3);
 910   LOOP_VINFO_DATAREFS (res).create (10);
 911   LOOP_VINFO_DDRS (res).create (10 * 10);
 912   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 913   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 914              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 915   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 916              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 917   LOOP_VINFO_GROUPED_STORES (res).create (10);
 918   LOOP_VINFO_REDUCTIONS (res).create (10);
 919   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 920   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 921   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 922   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 923   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 924   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 925   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 926
 927   return res;
 928 }
 929
 930
 931 /* Function destroy_loop_vec_info.
 932
 933    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 934    stmts in the loop.  */
 935
 936 void
 937 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 938 {
 939   struct loop *loop;
 940   basic_block *bbs;
 941   int nbbs;
 942   gimple_stmt_iterator si;
 943   int j;
 944   vec<slp_instance> slp_instances;
 945   slp_instance instance;
 946   bool swapped;
 947
 948   if (!loop_vinfo)
 949     return;
 950
 951   loop = LOOP_VINFO_LOOP (loop_vinfo);
 952
 953   bbs = LOOP_VINFO_BBS (loop_vinfo);
 954   nbbs = clean_stmts ? loop->num_nodes : 0;
 955   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 956
 957   for (j = 0; j < nbbs; j++)
 958     {
 959       basic_block bb = bbs[j];
 960       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 961         free_stmt_vec_info (gsi_stmt (si));
 962
 963       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 964         {
 965           gimple stmt = gsi_stmt (si);
 966
 967           /* We may have broken canonical form by moving a constant
 968              into RHS1 of a commutative op.  Fix such occurrences.  */
 969           if (swapped && is_gimple_assign (stmt))
 970             {
 971               enum tree_code code = gimple_assign_rhs_code (stmt);
 972
 973               if ((code == PLUS_EXPR
 974                    || code == POINTER_PLUS_EXPR
 975                    || code == MULT_EXPR)
 976                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 977                 swap_ssa_operands (stmt,
 978                                    gimple_assign_rhs1_ptr (stmt),
 979                                    gimple_assign_rhs2_ptr (stmt));
 980             }
 981
 982           /* Free stmt_vec_info.  */
 983           free_stmt_vec_info (stmt);
 984           gsi_next (&si);
 985         }
 986     }
 987
 988   free (LOOP_VINFO_BBS (loop_vinfo));
 989   vect_destroy_datarefs (loop_vinfo, NULL);
 990   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 991   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 992   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 993   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 994   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 995   FOR_EACH_VEC_ELT (slp_instances, j, instance)
 996     vect_free_slp_instance (instance);
 997
 998   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 999   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1000   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1001   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1002
1003   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1004     LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
1005
1006   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1007
1008   free (loop_vinfo);
1009   loop->aux = NULL;
1010 }
1011
1012
1013 /* Function vect_analyze_loop_1.
1014
1015    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1016    for it. The different analyses will record information in the
1017    loop_vec_info struct.  This is a subset of the analyses applied in
1018    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1019    that is now considered for (outer-loop) vectorization.  */
1020
1021 static loop_vec_info
1022 vect_analyze_loop_1 (struct loop *loop)
1023 {
1024   loop_vec_info loop_vinfo;
1025
1026   if (dump_enabled_p ())
1027     dump_printf_loc (MSG_NOTE, vect_location,
1028                      "===== analyze_loop_nest_1 =====\n");
1029
1030   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1031
1032   loop_vinfo = vect_analyze_loop_form (loop);
1033   if (!loop_vinfo)
1034     {
1035       if (dump_enabled_p ())
1036         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1037                          "bad inner-loop form.\n");
1038       return NULL;
1039     }
1040
1041   return loop_vinfo;
1042 }
1043
1044
1045 /* Function vect_analyze_loop_form.
1046
1047    Verify that certain CFG restrictions hold, including:
1048    - the loop has a pre-header
1049    - the loop has a single entry and exit
1050    - the loop exit condition is simple enough, and the number of iterations
1051      can be analyzed (a countable loop).  */
1052
1053 loop_vec_info
1054 vect_analyze_loop_form (struct loop *loop)
1055 {
1056   loop_vec_info loop_vinfo;
1057   gimple loop_cond;
1058   tree number_of_iterations = NULL;
1059   loop_vec_info inner_loop_vinfo = NULL;
1060
1061   if (dump_enabled_p ())
1062     dump_printf_loc (MSG_NOTE, vect_location,
1063                      "=== vect_analyze_loop_form ===\n");
1064
1065   /* Different restrictions apply when we are considering an inner-most loop,
1066      vs. an outer (nested) loop.
1067      (FORNOW. May want to relax some of these restrictions in the future).  */
1068
1069   if (!loop->inner)
1070     {
1071       /* Inner-most loop.  We currently require that the number of BBs is
1072          exactly 2 (the header and latch).  Vectorizable inner-most loops
1073          look like this:
1074
1075                         (pre-header)
1076                            |
1077                           header <--------+
1078                            | |            |
1079                            | +--> latch --+
1080                            |
1081                         (exit-bb)  */
1082
1083       if (loop->num_nodes != 2)
1084         {
1085           if (dump_enabled_p ())
1086             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1087                              "not vectorized: control flow in loop.\n");
1088           return NULL;
1089         }
1090
1091       if (empty_block_p (loop->header))
1092         {
1093           if (dump_enabled_p ())
1094             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095                              "not vectorized: empty loop.\n");
1096           return NULL;
1097         }
1098     }
1099   else
1100     {
1101       struct loop *innerloop = loop->inner;
1102       edge entryedge;
1103
1104       /* Nested loop. We currently require that the loop is doubly-nested,
1105          contains a single inner loop, and the number of BBs is exactly 5.
1106          Vectorizable outer-loops look like this:
1107
1108                         (pre-header)
1109                            |
1110                           header <---+
1111                            |         |
1112                           inner-loop |
1113                            |         |
1114                           tail ------+
1115                            |
1116                         (exit-bb)
1117
1118          The inner-loop has the properties expected of inner-most loops
1119          as described above.  */
1120
1121       if ((loop->inner)->inner || (loop->inner)->next)
1122         {
1123           if (dump_enabled_p ())
1124             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1125                              "not vectorized: multiple nested loops.\n");
1126           return NULL;
1127         }
1128
1129       /* Analyze the inner-loop.  */
1130       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1131       if (!inner_loop_vinfo)
1132         {
1133           if (dump_enabled_p ())
1134             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1135                              "not vectorized: Bad inner loop.\n");
1136           return NULL;
1137         }
1138
1139       if (!expr_invariant_in_loop_p (loop,
1140                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1141         {
1142           if (dump_enabled_p ())
1143             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144                              "not vectorized: inner-loop count not"
1145                              " invariant.\n");
1146           destroy_loop_vec_info (inner_loop_vinfo, true);
1147           return NULL;
1148         }
1149
1150       if (loop->num_nodes != 5)
1151         {
1152           if (dump_enabled_p ())
1153             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1154                              "not vectorized: control flow in loop.\n");
1155           destroy_loop_vec_info (inner_loop_vinfo, true);
1156           return NULL;
1157         }
1158
1159       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1160       entryedge = EDGE_PRED (innerloop->header, 0);
1161       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1162         entryedge = EDGE_PRED (innerloop->header, 1);
1163
1164       if (entryedge->src != loop->header
1165           || !single_exit (innerloop)
1166           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1167         {
1168           if (dump_enabled_p ())
1169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170                              "not vectorized: unsupported outerloop form.\n");
1171           destroy_loop_vec_info (inner_loop_vinfo, true);
1172           return NULL;
1173         }
1174
1175       if (dump_enabled_p ())
1176         dump_printf_loc (MSG_NOTE, vect_location,
1177                          "Considering outer-loop vectorization.\n");
1178     }
1179
1180   if (!single_exit (loop)
1181       || EDGE_COUNT (loop->header->preds) != 2)
1182     {
1183       if (dump_enabled_p ())
1184         {
1185           if (!single_exit (loop))
1186             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187                              "not vectorized: multiple exits.\n");
1188           else if (EDGE_COUNT (loop->header->preds) != 2)
1189             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1190                              "not vectorized: too many incoming edges.\n");
1191         }
1192       if (inner_loop_vinfo)
1193         destroy_loop_vec_info (inner_loop_vinfo, true);
1194       return NULL;
1195     }
1196
1197   /* We assume that the loop exit condition is at the end of the loop. i.e,
1198      that the loop is represented as a do-while (with a proper if-guard
1199      before the loop if needed), where the loop header contains all the
1200      executable statements, and the latch is empty.  */
1201   if (!empty_block_p (loop->latch)
1202       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1203     {
1204       if (dump_enabled_p ())
1205         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206                          "not vectorized: latch block not empty.\n");
1207       if (inner_loop_vinfo)
1208         destroy_loop_vec_info (inner_loop_vinfo, true);
1209       return NULL;
1210     }
1211
1212   /* Make sure there exists a single-predecessor exit bb:  */
1213   if (!single_pred_p (single_exit (loop)->dest))
1214     {
1215       edge e = single_exit (loop);
1216       if (!(e->flags & EDGE_ABNORMAL))
1217         {
1218           split_loop_exit_edge (e);
1219           if (dump_enabled_p ())
1220             dump_printf (MSG_NOTE, "split exit edge.\n");
1221         }
1222       else
1223         {
1224           if (dump_enabled_p ())
1225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1226                              "not vectorized: abnormal loop exit edge.\n");
1227           if (inner_loop_vinfo)
1228             destroy_loop_vec_info (inner_loop_vinfo, true);
1229           return NULL;
1230         }
1231     }
1232
1233   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1234   if (!loop_cond)
1235     {
1236       if (dump_enabled_p ())
1237         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238                          "not vectorized: complicated exit condition.\n");
1239       if (inner_loop_vinfo)
1240         destroy_loop_vec_info (inner_loop_vinfo, true);
1241       return NULL;
1242     }
1243
1244   if (!number_of_iterations
1245       || chrec_contains_undetermined (number_of_iterations))
1246     {
1247       if (dump_enabled_p ())
1248         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                          "not vectorized: number of iterations cannot be "
1250                          "computed.\n");
1251       if (inner_loop_vinfo)
1252         destroy_loop_vec_info (inner_loop_vinfo, true);
1253       return NULL;
1254     }
1255
1256   if (integer_zerop (number_of_iterations))
1257     {
1258       if (dump_enabled_p ())
1259         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1260                          "not vectorized: number of iterations = 0.\n");
1261       if (inner_loop_vinfo)
1262         destroy_loop_vec_info (inner_loop_vinfo, true);
1263       return NULL;
1264     }
1265
1266   loop_vinfo = new_loop_vec_info (loop);
1267   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1268   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1269
1270   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1271     {
1272       if (dump_enabled_p ())
1273         {
1274           dump_printf_loc (MSG_NOTE, vect_location,
1275                            "Symbolic number of iterations is ");
1276           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1277           dump_printf (MSG_NOTE, "\n");
1278         }
1279     }
1280
1281   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1282
1283   /* CHECKME: May want to keep it around it in the future.  */
1284   if (inner_loop_vinfo)
1285     destroy_loop_vec_info (inner_loop_vinfo, false);
1286
1287   gcc_assert (!loop->aux);
1288   loop->aux = loop_vinfo;
1289   return loop_vinfo;
1290 }
1291
1292
1293 /* Function vect_analyze_loop_operations.
1294
1295    Scan the loop stmts and make sure they are all vectorizable.  */
1296
1297 static bool
1298 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1299 {
1300   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1301   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1302   int nbbs = loop->num_nodes;
1303   gimple_stmt_iterator si;
1304   unsigned int vectorization_factor = 0;
1305   int i;
1306   gimple phi;
1307   stmt_vec_info stmt_info;
1308   bool need_to_vectorize = false;
1309   int min_profitable_iters;
1310   int min_scalar_loop_bound;
1311   unsigned int th;
1312   bool only_slp_in_loop = true, ok;
1313   HOST_WIDE_INT max_niter;
1314   HOST_WIDE_INT estimated_niter;
1315   int min_profitable_estimate;
1316
1317   if (dump_enabled_p ())
1318     dump_printf_loc (MSG_NOTE, vect_location,
1319                      "=== vect_analyze_loop_operations ===\n");
1320
1321   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1322   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1323   if (slp)
1324     {
1325       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1326          vectorization factor of the loop is the unrolling factor required by
1327          the SLP instances.  If that unrolling factor is 1, we say, that we
1328          perform pure SLP on loop - cross iteration parallelism is not
1329          exploited.  */
1330       for (i = 0; i < nbbs; i++)
1331         {
1332           basic_block bb = bbs[i];
1333           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1334             {
1335               gimple stmt = gsi_stmt (si);
1336               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1337               gcc_assert (stmt_info);
1338               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1339                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1340                   && !PURE_SLP_STMT (stmt_info))
1341                 /* STMT needs both SLP and loop-based vectorization.  */
1342                 only_slp_in_loop = false;
1343             }
1344         }
1345
1346       if (only_slp_in_loop)
1347         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1348       else
1349         vectorization_factor = least_common_multiple (vectorization_factor,
1350                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1351
1352       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1353       if (dump_enabled_p ())
1354         dump_printf_loc (MSG_NOTE, vect_location,
1355                          "Updating vectorization factor to %d\n",
1356                          vectorization_factor);
1357     }
1358
1359   for (i = 0; i < nbbs; i++)
1360     {
1361       basic_block bb = bbs[i];
1362
1363       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1364         {
1365           phi = gsi_stmt (si);
1366           ok = true;
1367
1368           stmt_info = vinfo_for_stmt (phi);
1369           if (dump_enabled_p ())
1370             {
1371               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1372               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1373               dump_printf (MSG_NOTE, "\n");
1374             }
1375
1376           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1377              (i.e., a phi in the tail of the outer-loop).  */
1378           if (! is_loop_header_bb_p (bb))
1379             {
1380               /* FORNOW: we currently don't support the case that these phis
1381                  are not used in the outerloop (unless it is double reduction,
1382                  i.e., this phi is vect_reduction_def), cause this case
1383                  requires to actually do something here.  */
1384               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1385                    || STMT_VINFO_LIVE_P (stmt_info))
1386                   && STMT_VINFO_DEF_TYPE (stmt_info)
1387                      != vect_double_reduction_def)
1388                 {
1389                   if (dump_enabled_p ())
1390                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391                                      "Unsupported loop-closed phi in "
1392                                      "outer-loop.\n");
1393                   return false;
1394                 }
1395
1396               /* If PHI is used in the outer loop, we check that its operand
1397                  is defined in the inner loop.  */
1398               if (STMT_VINFO_RELEVANT_P (stmt_info))
1399                 {
1400                   tree phi_op;
1401                   gimple op_def_stmt;
1402
1403                   if (gimple_phi_num_args (phi) != 1)
1404                     return false;
1405
1406                   phi_op = PHI_ARG_DEF (phi, 0);
1407                   if (TREE_CODE (phi_op) != SSA_NAME)
1408                     return false;
1409
1410                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1411                   if (gimple_nop_p (op_def_stmt)
1412                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1413                       || !vinfo_for_stmt (op_def_stmt))
1414                     return false;
1415
1416                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1417                         != vect_used_in_outer
1418                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1419                            != vect_used_in_outer_by_reduction)
1420                     return false;
1421                 }
1422
1423               continue;
1424             }
1425
1426           gcc_assert (stmt_info);
1427
1428           if (STMT_VINFO_LIVE_P (stmt_info))
1429             {
1430               /* FORNOW: not yet supported.  */
1431               if (dump_enabled_p ())
1432                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1433                                  "not vectorized: value used after loop.\n");
1434               return false;
1435             }
1436
1437           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1438               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1439             {
1440               /* A scalar-dependence cycle that we don't support.  */
1441               if (dump_enabled_p ())
1442                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443                                  "not vectorized: scalar dependence cycle.\n");
1444               return false;
1445             }
1446
1447           if (STMT_VINFO_RELEVANT_P (stmt_info))
1448             {
1449               need_to_vectorize = true;
1450               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1451                 ok = vectorizable_induction (phi, NULL, NULL);
1452             }
1453
1454           if (!ok)
1455             {
1456               if (dump_enabled_p ())
1457                 {
1458                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                                    "not vectorized: relevant phi not "
1460                                    "supported: ");
1461                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1462                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1463                 }
1464               return false;
1465             }
1466         }
1467
1468       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1469         {
1470           gimple stmt = gsi_stmt (si);
1471           if (!gimple_clobber_p (stmt)
1472               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1473             return false;
1474         }
1475     } /* bbs */
1476
1477   /* All operations in the loop are either irrelevant (deal with loop
1478      control, or dead), or only used outside the loop and can be moved
1479      out of the loop (e.g. invariants, inductions).  The loop can be
1480      optimized away by scalar optimizations.  We're better off not
1481      touching this loop.  */
1482   if (!need_to_vectorize)
1483     {
1484       if (dump_enabled_p ())
1485         dump_printf_loc (MSG_NOTE, vect_location,
1486                          "All the computation can be taken out of the loop.\n");
1487       if (dump_enabled_p ())
1488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                          "not vectorized: redundant loop. no profit to "
1490                          "vectorize.\n");
1491       return false;
1492     }
1493
1494   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1495     dump_printf_loc (MSG_NOTE, vect_location,
1496                      "vectorization_factor = %d, niters = "
1497                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1498                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1499
1500   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1501        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1502       || ((max_niter = max_stmt_executions_int (loop)) != -1
1503           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1504     {
1505       if (dump_enabled_p ())
1506         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507                          "not vectorized: iteration count too small.\n");
1508       if (dump_enabled_p ())
1509         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1510                          "not vectorized: iteration count smaller than "
1511                          "vectorization factor.\n");
1512       return false;
1513     }
1514
1515   /* Analyze cost.  Decide if worth while to vectorize.  */
1516
1517   /* Once VF is set, SLP costs should be updated since the number of created
1518      vector stmts depends on VF.  */
1519   vect_update_slp_costs_according_to_vf (loop_vinfo);
1520
1521   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1522                                       &min_profitable_estimate);
1523   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1524
1525   if (min_profitable_iters < 0)
1526     {
1527       if (dump_enabled_p ())
1528         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1529                          "not vectorized: vectorization not profitable.\n");
1530       if (dump_enabled_p ())
1531         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1532                          "not vectorized: vector version will never be "
1533                          "profitable.\n");
1534       return false;
1535     }
1536
1537   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1538                             * vectorization_factor) - 1);
1539
1540
1541   /* Use the cost model only if it is more conservative than user specified
1542      threshold.  */
1543
1544   th = (unsigned) min_scalar_loop_bound;
1545   if (min_profitable_iters
1546       && (!min_scalar_loop_bound
1547           || min_profitable_iters > min_scalar_loop_bound))
1548     th = (unsigned) min_profitable_iters;
1549
1550   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1551       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1552     {
1553       if (dump_enabled_p ())
1554         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555                          "not vectorized: vectorization not profitable.\n");
1556       if (dump_enabled_p ())
1557         dump_printf_loc (MSG_NOTE, vect_location,
1558                          "not vectorized: iteration count smaller than user "
1559                          "specified loop bound parameter or minimum profitable "
1560                          "iterations (whichever is more conservative).\n");
1561       return false;
1562     }
1563
1564   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1565       && ((unsigned HOST_WIDE_INT) estimated_niter
1566           <= MAX (th, (unsigned)min_profitable_estimate)))
1567     {
1568       if (dump_enabled_p ())
1569         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1570                          "not vectorized: estimated iteration count too "
1571                          "small.\n");
1572       if (dump_enabled_p ())
1573         dump_printf_loc (MSG_NOTE, vect_location,
1574                          "not vectorized: estimated iteration count smaller "
1575                          "than specified loop bound parameter or minimum "
1576                          "profitable iterations (whichever is more "
1577                          "conservative).\n");
1578       return false;
1579     }
1580
1581   return true;
1582 }
1583
1584
1585 /* Function vect_analyze_loop_2.
1586
1587    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1588    different analyses will record information in the
1589    loop_vec_info struct.

        LOOP_VINFO is supplied by the caller; this function does not create
        it.  The analyses run in the fixed order below.  The function returns
        false as soon as one of the checked steps fails, writing a reason to
        the dump when dumping is enabled (the one exception is a failure of
        vect_analyze_slp, which returns without a message).  It returns true
        only if every checked step succeeds and, when an epilogue loop turns
        out to be needed, such a loop can be created.  */
1590 static bool
1591 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1592 {
1593   bool ok, slp = false;
       /* MAX_VF and MIN_VF are bounds on the vectorization factor.  Both are
          passed by address to the analyses below (MIN_VF to the data-ref
          analysis, MAX_VF to the dependence analysis), which may adjust
          them.  */
1594   int max_vf = MAX_VECTORIZATION_FACTOR;
1595   int min_vf = 2;
1596
1597   /* Find all data references in the loop (which correspond to vdefs/vuses)
1598      and analyze their evolution in the loop.  Also adjust the minimal
1599      vectorization factor according to the loads and stores.
1600
1601      FORNOW: Handle only simple, array references, which
1602      alignment can be forced, and aligned pointer-references.  */
1603
1604   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1605   if (!ok)
1606     {
1607       if (dump_enabled_p ())
1608         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1609                          "bad data references.\n");
1610       return false;
1611     }
1612
1613   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1614      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1615
1616   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1617   if (!ok)
1618     {
1619       if (dump_enabled_p ())
1620         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621                          "bad data access.\n");
1622       return false;
1623     }
1624
1625   /* Classify all cross-iteration scalar data-flow cycles.
1626      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1627
1628   vect_analyze_scalar_cycles (loop_vinfo);
1629
       /* Neither the cycle classification above nor the pattern recognition
          below reports a result that is checked here.  */
1630   vect_pattern_recog (loop_vinfo, NULL);
1631
1632   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1633
1634   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1635   if (!ok)
1636     {
1637       if (dump_enabled_p ())
1638         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639                          "unexpected pattern.\n");
1640       return false;
1641     }
1642
1643   /* Analyze data dependences between the data-refs in the loop
1644      and adjust the maximum vectorization factor according to
1645      the dependences.
1646      FORNOW: fail at the first data dependence that we encounter.
        Also give up when the resulting upper bound MAX_VF has dropped below
        the lower bound MIN_VF.  */
1647
1648   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1649   if (!ok
1650       || max_vf < min_vf)
1651     {
1652       if (dump_enabled_p ())
1653             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654                              "bad data dependence.\n");
1655       return false;
1656     }
1657
1658   ok = vect_determine_vectorization_factor (loop_vinfo);
1659   if (!ok)
1660     {
1661       if (dump_enabled_p ())
1662         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663                          "can't determine vectorization factor.\n");
1664       return false;
1665     }
       /* The factor that was just chosen must not exceed the bound allowed
          by the dependence analysis.  */
1666   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1667     {
1668       if (dump_enabled_p ())
1669         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670                          "bad data dependence.\n");
1671       return false;
1672     }
1673
1674   /* Analyze the alignment of the data-refs in the loop.
1675      Fail if a data reference is found that cannot be vectorized.  */
1676
1677   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1678   if (!ok)
1679     {
1680       if (dump_enabled_p ())
1681         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1682                          "bad data alignment.\n");
1683       return false;
1684     }
1685
1686   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1687      It is important to call pruning after vect_analyze_data_ref_accesses,
1688      since we use grouping information gathered by interleaving analysis.  */
1689   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1690   if (!ok)
1691     {
1692       if (dump_enabled_p ())
1693         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694                          "too long list of versioning for alias "
1695                          "run-time tests.\n");
1696       return false;
1697     }
1698
1699   /* This pass will decide on using loop versioning and/or loop peeling in
1700      order to enhance the alignment of data references in the loop.  */
1701
1702   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1703   if (!ok)
1704     {
1705       if (dump_enabled_p ())
1706         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707                          "bad data alignment.\n");
1708       return false;
1709     }
1710
1711   /* Check the SLP opportunities in the loop, analyze and build SLP trees.
        Note that a failure here returns false without a dump message.  */
1712   ok = vect_analyze_slp (loop_vinfo, NULL);
1713   if (ok)
1714     {
1715       /* Decide which possible SLP instances to SLP.  The decision is kept
              in SLP and forwarded to vect_analyze_loop_operations below.  */
1716       slp = vect_make_slp_decision (loop_vinfo);
1717
1718       /* Find stmts that need to be both vectorized and SLPed.  */
1719       vect_detect_hybrid_slp (loop_vinfo);
1720     }
1721   else
1722     return false;
1723
1724   /* Scan all the operations in the loop and make sure they are
1725      vectorizable.  */
1726
1727   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1728   if (!ok)
1729     {
1730       if (dump_enabled_p ())
1731         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1732                          "bad operation or unsupported loop bound.\n");
1733       return false;
1734     }
1735
1736   /* Decide whether we need to create an epilogue loop to handle
1737      remaining scalar iterations.

        Case 1: the iteration count is a known constant and the recorded
        alignment peeling amount is positive.  An epilogue is needed when the
        iterations left after peeling have fewer trailing zero bits than
        log2 (VF), i.e. they are not a multiple of VF (this reading assumes
        VF is a power of two, as the use of exact_log2 suggests -- confirm).

        Case 2: otherwise.  An epilogue is needed when the recorded alignment
        peeling amount is nonzero, or when the iteration-count expression has
        fewer trailing zero bits (per tree_ctz) than log2 (VF).  */
1738   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1739       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1740     {
1741       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1742                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1743           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1744         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1745     }
1746   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1747            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1748                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
1749     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1750
1751   /* If an epilogue loop is required make sure we can create one.
        It is required when either peeling for gaps or peeling for the
        iteration count has been recorded in LOOP_VINFO.  Creating it needs
        both advanceable induction variables and a loop that can be
        duplicated at its single exit.  */
1752   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1753       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1754     {
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1757       if (!vect_can_advance_ivs_p (loop_vinfo)
1758           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1759                                            single_exit (LOOP_VINFO_LOOP
1760                                                          (loop_vinfo))))
1761         {
1762           if (dump_enabled_p ())
1763             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1764                              "not vectorized: can't create required "
1765                              "epilog loop\n");
1766           return false;
1767         }
1768     }
1769
1770   return true;
1771 }
1772
1773 /* Function vect_analyze_loop.
1774
1775    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1776    for it.  The different analyses will record information in the
1777    loop_vec_info struct.

        Returns the loop_vec_info, with LOOP_VINFO_VECTORIZABLE_P set, as soon
        as one analysis attempt is accepted by vect_analyze_loop_2.  Returns
        NULL when the enclosing loop is already marked vectorizable, when the
        loop form is rejected, or when every attempt fails.  A failed
        attempt's loop_vec_info is destroyed before the next attempt.

        The analysis may be attempted several times, once per candidate
        vector size.  The candidates come from the target hook
        autovectorize_vector_sizes as a bit mask, and the size in use is kept
        in current_vector_size, which is defined outside this function.  */
1778 loop_vec_info
1779 vect_analyze_loop (struct loop *loop)
1780 {
1781   loop_vec_info loop_vinfo;
1782   unsigned int vector_sizes;
1783
1784   /* Autodetect first vector size we try.  */
1785   current_vector_size = 0;
1786   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1787
1788   if (dump_enabled_p ())
1789     dump_printf_loc (MSG_NOTE, vect_location,
1790                      "===== analyze_loop_nest =====\n");
1791
       /* Nothing to do for an inner loop whose immediately enclosing loop
          already has a loop_vec_info that is marked vectorizable.  */
1792   if (loop_outer (loop)
1793       && loop_vec_info_for_loop (loop_outer (loop))
1794       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1795     {
1796       if (dump_enabled_p ())
1797         dump_printf_loc (MSG_NOTE, vect_location,
1798                          "outer-loop already vectorized.\n");
1799       return NULL;
1800     }
1801
       /* One iteration per analysis attempt; every path out of the loop is a
          return.  */
1802   while (1)
1803     {
1804       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1805       loop_vinfo = vect_analyze_loop_form (loop);
1806       if (!loop_vinfo)
1807         {
1808           if (dump_enabled_p ())
1809             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1810                              "bad loop form.\n");
1811           return NULL;
1812         }
1813
1814       if (vect_analyze_loop_2 (loop_vinfo))
1815         {
1816           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1817
1818           return loop_vinfo;
1819         }
1820
           /* This attempt failed: discard its loop_vec_info.  */
1821       destroy_loop_vec_info (loop_vinfo, true);
1822
           /* Remove the size that was just tried from the candidate mask.
              Stop when no candidate remains, or when current_vector_size is
              still 0.
              NOTE(review): presumably 0 means no vector size was established
              during the failed attempt; where current_vector_size gets set
              is not visible here -- confirm.  */
1823       vector_sizes &= ~current_vector_size;
1824       if (vector_sizes == 0
1825           || current_vector_size == 0)
1826         return NULL;
1827
1828       /* Try the next biggest vector size, i.e. the highest bit still set
              in the candidate mask.  */
1829       current_vector_size = 1 << floor_log2 (vector_sizes);
1830       if (dump_enabled_p ())
1831         dump_printf_loc (MSG_NOTE, vect_location,
1832                          "***** Re-trying analysis with "
1833                          "vector size %d\n", current_vector_size);
1834     }
1835 }
1836
1837
1838 /* Function reduction_code_for_scalar_code
1839
1840    Input:
1841    CODE - tree_code of a reduction operation.
1842
1843    Output:
1844    REDUC_CODE - the corresponding tree-code to be used to reduce the
1845       vector of partial results into a single scalar result (which
1846       will also reside in a vector) or ERROR_MARK if the operation is
1847       a supported reduction operation, but does not have such tree-code.
1848
1849    Return FALSE if CODE currently cannot be vectorized as reduction.
        In that case *REDUC_CODE is not written.  Return TRUE otherwise.  */
1850
1851 static bool
1852 reduction_code_for_scalar_code (enum tree_code code,
1853                                 enum tree_code *reduc_code)
1854 {
1855   switch (code)
1856     {
           /* Operations that have a dedicated REDUC_* tree code.  */
1857       case MAX_EXPR:
1858         *reduc_code = REDUC_MAX_EXPR;
1859         return true;
1860
1861       case MIN_EXPR:
1862         *reduc_code = REDUC_MIN_EXPR;
1863         return true;
1864
1865       case PLUS_EXPR:
1866         *reduc_code = REDUC_PLUS_EXPR;
1867         return true;
1868
           /* Supported reductions without a dedicated reduction tree code:
              report success but hand back ERROR_MARK.  */
1869       case MULT_EXPR:
1870       case MINUS_EXPR:
1871       case BIT_IOR_EXPR:
1872       case BIT_XOR_EXPR:
1873       case BIT_AND_EXPR:
1874         *reduc_code = ERROR_MARK;
1875         return true;
1876
           /* Anything else is not handled as a reduction.  */
1877       default:
1878        return false;
1879     }
1880 }
1881
1882
1883 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1884    STMT is printed with a message MSG.  MSG_TYPE is passed unchanged as the
        dump kind to each dump call.  The output is MSG at vect_location,
        then STMT dumped with TDF_SLIM, then a newline.  This function does
        not itself check whether dumping is enabled.  */
1885
1886 static void
1887 report_vect_op (int msg_type, gimple stmt, const char *msg)
1888 {
1889   dump_printf_loc (msg_type, vect_location, "%s", msg);
1890   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1891   dump_printf (msg_type, "\n");
1892 }
1893
1894
1895 /* Detect SLP reduction of the form:
1896
1897    #a1 = phi <a5, a0>
1898    a2 = operation (a1)
1899    a3 = operation (a2)
1900    a4 = operation (a3)
1901    a5 = operation (a4)
1902
1903    #a = phi <a5>
1904
1905    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1906    FIRST_STMT is the first reduction stmt in the chain
1907    (a2 = operation (a1)).
        LOOP_INFO describes the loop being vectorized; detection is only
        attempted when PHI belongs to exactly that loop.

        All statements of the chain must be assignments using the same
        operation code as FIRST_STMT, and the chain must contain at least two
        statements.  On success the statements are linked through
        GROUP_FIRST_ELEMENT / GROUP_NEXT_ELEMENT, operands are swapped where
        needed so that the reduction operand is the second operand, the first
        statement is pushed onto LOOP_VINFO_REDUCTION_CHAINS and its
        GROUP_SIZE is set to the chain length.
1908
1909    Return TRUE if a reduction chain was detected.  */
1910
1911 static bool
1912 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1913 {
1914   struct loop *loop = (gimple_bb (phi))->loop_father;
1915   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1916   enum tree_code code;
1917   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1918   stmt_vec_info use_stmt_info, current_stmt_info;
1919   tree lhs;
1920   imm_use_iterator imm_iter;
1921   use_operand_p use_p;
1922   int nloop_uses, size = 0, n_out_of_loop_uses;
1923   bool found = false;
1924
1925   if (loop != vect_loop)
1926     return false;
1927
       /* Phase 1: starting from the phi result, follow the single in-loop use
          of each value until the walk arrives back at PHI.  CODE is the
          operation every chain statement has to share.  */
1928   lhs = PHI_RESULT (phi);
1929   code = gimple_assign_rhs_code (first_stmt);
1930   while (1)
1931     {
1932       nloop_uses = 0;
1933       n_out_of_loop_uses = 0;
1934       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1935         {
1936           gimple use_stmt = USE_STMT (use_p);
               /* Debug statements do not count as uses.  */
1937           if (is_gimple_debug (use_stmt))
1938             continue;
1939
1940           use_stmt = USE_STMT (use_p);
1941
1942           /* Check if we got back to the reduction phi.  */
1943           if (use_stmt == phi)
1944             {
1945               loop_use_stmt = use_stmt;
1946               found = true;
1947               break;
1948             }
1949
               /* In-loop uses are counted only when the statement has a
                  stmt_vec_info and is not marked STMT_VINFO_IN_PATTERN_P;
                  the last such statement is remembered as the next chain
                  candidate.  */
1950           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1951             {
1952               if (vinfo_for_stmt (use_stmt)
1953                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1954                 {
1955                   loop_use_stmt = use_stmt;
1956                   nloop_uses++;
1957                 }
1958             }
1959            else
1960              n_out_of_loop_uses++;
1961
1962            /* There can be either a single use in the loop or two uses in
1963               phi nodes.  Reject more than one counted in-loop use, and
                   reject a mix of in-loop and out-of-loop uses.  */
1964            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1965              return false;
1966         }
1967
1968       if (found)
1969         break;
1970
1971       /* We reached a statement with no loop uses.  */
1972       if (nloop_uses == 0)
1973         return false;
1974
1975       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1976       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1977         return false;
1978
           /* The next chain element must be an assignment inside the loop
              that uses the same operation code as FIRST_STMT.  */
1979       if (!is_gimple_assign (loop_use_stmt)
1980           || code != gimple_assign_rhs_code (loop_use_stmt)
1981           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1982         return false;
1983
1984       /* Insert USE_STMT into reduction chain.  The first statement found
              becomes its own GROUP_FIRST_ELEMENT; every later one is appended
              after CURRENT_STMT and inherits that first element.
              NOTE(review): these GROUP_* links are written before the chain
              is known to be valid, and the failure returns further down do
              not reset them here -- confirm that callers tolerate or clean
              up a partially built group.  */
1985       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1986       if (current_stmt)
1987         {
1988           current_stmt_info = vinfo_for_stmt (current_stmt);
1989           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1990           GROUP_FIRST_ELEMENT (use_stmt_info)
1991             = GROUP_FIRST_ELEMENT (current_stmt_info);
1992         }
1993       else
1994         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1995
           /* Continue the walk from the value this statement defines.  */
1996       lhs = gimple_assign_lhs (loop_use_stmt);
1997       current_stmt = loop_use_stmt;
1998       size++;
1999    }
2000
       /* A chain needs at least two statements.  SIZE >= 2 also guarantees
          that CURRENT_STMT is non-NULL below.  */
2001   if (!found || loop_use_stmt != phi || size < 2)
2002     return false;
2003
2004   /* Swap the operands, if needed, to make the reduction operand be the second
2005      operand.  Phase 2 walks the chain from its first element; LHS holds the
          value produced by the previous element (initially the phi result).  */
2006   lhs = PHI_RESULT (phi);
2007   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2008   while (next_stmt)
2009     {
           /* The reduction operand is already the second operand: only the
              other operand (rhs1) has to be validated.  */
2010       if (gimple_assign_rhs2 (next_stmt) == lhs)
2011         {
2012           tree op = gimple_assign_rhs1 (next_stmt);
2013           gimple def_stmt = NULL;
2014
2015           if (TREE_CODE (op) == SSA_NAME)
2016             def_stmt = SSA_NAME_DEF_STMT (op);
2017
2018           /* Check that the other def is either defined in the loop
2019              ("vect_internal_def"), or it's an induction (defined by a
2020              loop-header phi-node).  */
2021           if (def_stmt
2022               && gimple_bb (def_stmt)
2023               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2024               && (is_gimple_assign (def_stmt)
2025                   || is_gimple_call (def_stmt)
2026                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2027                            == vect_induction_def
2028                   || (gimple_code (def_stmt) == GIMPLE_PHI
2029                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2030                                   == vect_internal_def
2031                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2032             {
                   /* Operand accepted, no swap needed: advance to the next
                      chain element.  */
2033               lhs = gimple_assign_lhs (next_stmt);
2034               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2035               continue;
2036             }
2037
2038           return false;
2039         }
2040       else
2041         {
               /* The second operand is not the reduction operand (presumably
                  the reduction operand is rhs1; that is not checked here).
                  Validate rhs2 as the other operand and, if it is accepted,
                  swap the two operands.  */
2042           tree op = gimple_assign_rhs2 (next_stmt);
2043           gimple def_stmt = NULL;
2044
2045           if (TREE_CODE (op) == SSA_NAME)
2046             def_stmt = SSA_NAME_DEF_STMT (op);
2047
2048           /* Check that the other def is either defined in the loop
2049             ("vect_internal_def"), or it's an induction (defined by a
2050             loop-header phi-node).  */
2051           if (def_stmt
2052               && gimple_bb (def_stmt)
2053               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2054               && (is_gimple_assign (def_stmt)
2055                   || is_gimple_call (def_stmt)
2056                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2057                               == vect_induction_def
2058                   || (gimple_code (def_stmt) == GIMPLE_PHI
2059                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2060                                   == vect_internal_def
2061                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2062             {
2063               if (dump_enabled_p ())
2064                 {
2065                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2066                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2067                   dump_printf (MSG_NOTE, "\n");
2068                 }
2069
2070               swap_ssa_operands (next_stmt,
2071                                  gimple_assign_rhs1_ptr (next_stmt),
2072                                  gimple_assign_rhs2_ptr (next_stmt));
2073               update_stmt (next_stmt);
2074
                   /* Record on LOOP_INFO when the swap left a constant in
                      the first operand position.  The consumer of this flag
                      is not visible here.  */
2075               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2076                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2077             }
2078           else
2079             return false;
2080         }
2081
           /* Reached only after a swap: advance to the next chain element.  */
2082       lhs = gimple_assign_lhs (next_stmt);
2083       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2084     }
2085
2086   /* Save the chain for further analysis in SLP detection.  */
2087   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2088   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2089   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2090
2091   return true;
2092 }
2093
2094
2095 /* Function vect_is_simple_reduction_1
2096
2097    (1) Detect a cross-iteration def-use cycle that represents a simple
2098    reduction computation.  We look for the following pattern:
2099
2100    loop_header:
2101      a1 = phi < a0, a2 >
2102      a3 = ...
2103      a2 = operation (a3, a1)
2104
2105    or
2106
2107    a3 = ...
2108    loop_header:
2109      a1 = phi < a0, a2 >
2110      a2 = operation (a3, a1)
2111
2112    such that:
2113    1. operation is commutative and associative and it is safe to
2114       change the order of the computation (if CHECK_REDUCTION is true)
2115    2. no uses for a2 in the loop (a2 is used out of the loop)
2116    3. no uses of a1 in the loop besides the reduction operation
2117    4. no uses of a1 outside the loop.
2118
2119    Conditions 1,4 are tested here.
2120    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2121
2122    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2123    nested cycles, if CHECK_REDUCTION is false.
2124
2125    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2126    reductions:
2127
2128      a1 = phi < a0, a2 >
2129      inner loop (def of a3)
2130      a2 = phi < a3 >
2131
2132    If MODIFY is true it tries also to rework the code in-place to enable
2133    detection of more reduction patterns.  For the time being we rewrite
2134    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2135 */
2136
2137 static gimple
2138 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2139                             bool check_reduction, bool *double_reduc,
2140                             bool modify)
2141 {
2142   struct loop *loop = (gimple_bb (phi))->loop_father;
2143   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2144   edge latch_e = loop_latch_edge (loop);
2145   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2146   gimple def_stmt, def1 = NULL, def2 = NULL;
2147   enum tree_code orig_code, code;
2148   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2149   tree type;
2150   int nloop_uses;
2151   tree name;
2152   imm_use_iterator imm_iter;
2153   use_operand_p use_p;
2154   bool phi_def;
2155
2156   *double_reduc = false;
2157
2158   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2159      otherwise, we assume outer loop vectorization.  */
2160   gcc_assert ((check_reduction && loop == vect_loop)
2161               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2162
2163   name = PHI_RESULT (phi);
2164   nloop_uses = 0;
2165   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2166     {
2167       gimple use_stmt = USE_STMT (use_p);
2168       if (is_gimple_debug (use_stmt))
2169         continue;
2170
2171       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2172         {
2173           if (dump_enabled_p ())
2174             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2175                              "intermediate value used outside loop.\n");
2176
2177           return NULL;
2178         }
2179
2180       if (vinfo_for_stmt (use_stmt)
2181           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2182         nloop_uses++;
2183       if (nloop_uses > 1)
2184         {
2185           if (dump_enabled_p ())
2186             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2187                              "reduction used in loop.\n");
2188           return NULL;
2189         }
2190     }
2191
2192   if (TREE_CODE (loop_arg) != SSA_NAME)
2193     {
2194       if (dump_enabled_p ())
2195         {
2196           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2197                            "reduction: not ssa_name: ");
2198           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2199           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2200         }
2201       return NULL;
2202     }
2203
2204   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2205   if (!def_stmt)
2206     {
2207       if (dump_enabled_p ())
2208         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2209                          "reduction: no def_stmt.\n");
2210       return NULL;
2211     }
2212
2213   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2214     {
2215       if (dump_enabled_p ())
2216         {
2217           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2218           dump_printf (MSG_NOTE, "\n");
2219         }
2220       return NULL;
2221     }
2222
2223   if (is_gimple_assign (def_stmt))
2224     {
2225       name = gimple_assign_lhs (def_stmt);
2226       phi_def = false;
2227     }
2228   else
2229     {
2230       name = PHI_RESULT (def_stmt);
2231       phi_def = true;
2232     }
2233
2234   nloop_uses = 0;
2235   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2236     {
2237       gimple use_stmt = USE_STMT (use_p);
2238       if (is_gimple_debug (use_stmt))
2239         continue;
2240       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2241           && vinfo_for_stmt (use_stmt)
2242           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2243         nloop_uses++;
2244       if (nloop_uses > 1)
2245         {
2246           if (dump_enabled_p ())
2247             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248                              "reduction used in loop.\n");
2249           return NULL;
2250         }
2251     }
2252
2253   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2254      defined in the inner loop.  */
2255   if (phi_def)
2256     {
2257       op1 = PHI_ARG_DEF (def_stmt, 0);
2258
2259       if (gimple_phi_num_args (def_stmt) != 1
2260           || TREE_CODE (op1) != SSA_NAME)
2261         {
2262           if (dump_enabled_p ())
2263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264                              "unsupported phi node definition.\n");
2265
2266           return NULL;
2267         }
2268
2269       def1 = SSA_NAME_DEF_STMT (op1);
2270       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2271           && loop->inner
2272           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2273           && is_gimple_assign (def1))
2274         {
2275           if (dump_enabled_p ())
2276             report_vect_op (MSG_NOTE, def_stmt,
2277                             "detected double reduction: ");
2278
2279           *double_reduc = true;
2280           return def_stmt;
2281         }
2282
2283       return NULL;
2284     }
2285
2286   code = orig_code = gimple_assign_rhs_code (def_stmt);
2287
2288   /* We can handle "res -= x[i]", which is non-associative by
2289      simply rewriting this into "res += -x[i]".  Avoid changing
2290      gimple instruction for the first simple tests and only do this
2291      if we're allowed to change code at all.  */
2292   if (code == MINUS_EXPR
2293       && modify
2294       && (op1 = gimple_assign_rhs1 (def_stmt))
2295       && TREE_CODE (op1) == SSA_NAME
2296       && SSA_NAME_DEF_STMT (op1) == phi)
2297     code = PLUS_EXPR;
2298
2299   if (check_reduction
2300       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2301     {
2302       if (dump_enabled_p ())
2303         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2304                         "reduction: not commutative/associative: ");
2305       return NULL;
2306     }
2307
2308   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2309     {
2310       if (code != COND_EXPR)
2311         {
2312           if (dump_enabled_p ())
2313             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2314                             "reduction: not binary operation: ");
2315
2316           return NULL;
2317         }
2318
2319       op3 = gimple_assign_rhs1 (def_stmt);
2320       if (COMPARISON_CLASS_P (op3))
2321         {
2322           op4 = TREE_OPERAND (op3, 1);
2323           op3 = TREE_OPERAND (op3, 0);
2324         }
2325
2326       op1 = gimple_assign_rhs2 (def_stmt);
2327       op2 = gimple_assign_rhs3 (def_stmt);
2328
2329       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2330         {
2331           if (dump_enabled_p ())
2332             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2333                             "reduction: uses not ssa_names: ");
2334
2335           return NULL;
2336         }
2337     }
2338   else
2339     {
2340       op1 = gimple_assign_rhs1 (def_stmt);
2341       op2 = gimple_assign_rhs2 (def_stmt);
2342
2343       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2344         {
2345           if (dump_enabled_p ())
2346             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2347                             "reduction: uses not ssa_names: ");
2348
2349           return NULL;
2350         }
2351    }
2352
2353   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2354   if ((TREE_CODE (op1) == SSA_NAME
2355        && !types_compatible_p (type,TREE_TYPE (op1)))
2356       || (TREE_CODE (op2) == SSA_NAME
2357           && !types_compatible_p (type, TREE_TYPE (op2)))
2358       || (op3 && TREE_CODE (op3) == SSA_NAME
2359           && !types_compatible_p (type, TREE_TYPE (op3)))
2360       || (op4 && TREE_CODE (op4) == SSA_NAME
2361           && !types_compatible_p (type, TREE_TYPE (op4))))
2362     {
2363       if (dump_enabled_p ())
2364         {
2365           dump_printf_loc (MSG_NOTE, vect_location,
2366                            "reduction: multiple types: operation type: ");
2367           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2368           dump_printf (MSG_NOTE, ", operands types: ");
2369           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2370                              TREE_TYPE (op1));
2371           dump_printf (MSG_NOTE, ",");
2372           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2373                              TREE_TYPE (op2));
2374           if (op3)
2375             {
2376               dump_printf (MSG_NOTE, ",");
2377               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2378                                  TREE_TYPE (op3));
2379             }
2380
2381           if (op4)
2382             {
2383               dump_printf (MSG_NOTE, ",");
2384               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2385                                  TREE_TYPE (op4));
2386             }
2387           dump_printf (MSG_NOTE, "\n");
2388         }
2389
2390       return NULL;
2391     }
2392
2393   /* Check that it's ok to change the order of the computation.
2394      Generally, when vectorizing a reduction we change the order of the
2395      computation.  This may change the behavior of the program in some
2396      cases, so we need to check that this is ok.  One exception is when
2397      vectorizing an outer-loop: the inner-loop is executed sequentially,
2398      and therefore vectorizing reductions in the inner-loop during
2399      outer-loop vectorization is safe.  */
2400
2401   /* CHECKME: check for !flag_finite_math_only too?  */
2402   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2403       && check_reduction)
2404     {
2405       /* Changing the order of operations changes the semantics.  */
2406       if (dump_enabled_p ())
2407         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2408                         "reduction: unsafe fp math optimization: ");
2409       return NULL;
2410     }
2411   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2412            && check_reduction)
2413     {
2414       /* Changing the order of operations changes the semantics.  */
2415       if (dump_enabled_p ())
2416         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2417                         "reduction: unsafe int math optimization: ");
2418       return NULL;
2419     }
2420   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2421     {
2422       /* Changing the order of operations changes the semantics.  */
2423       if (dump_enabled_p ())
2424         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2425                         "reduction: unsafe fixed-point math optimization: ");
2426       return NULL;
2427     }
2428
2429   /* If we detected "res -= x[i]" earlier, rewrite it into
2430      "res += -x[i]" now.  If this turns out to be useless reassoc
2431      will clean it up again.  */
2432   if (orig_code == MINUS_EXPR)
2433     {
2434       tree rhs = gimple_assign_rhs2 (def_stmt);
2435       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2436       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2437                                                          rhs, NULL);
2438       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2439       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2440                                                           loop_info, NULL));
2441       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2442       gimple_assign_set_rhs2 (def_stmt, negrhs);
2443       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2444       update_stmt (def_stmt);
2445     }
2446
2447   /* Reduction is safe. We're dealing with one of the following:
2448      1) integer arithmetic and no trapv
2449      2) floating point arithmetic, and special flags permit this optimization
2450      3) nested cycle (i.e., outer loop vectorization).  */
2451   if (TREE_CODE (op1) == SSA_NAME)
2452     def1 = SSA_NAME_DEF_STMT (op1);
2453
2454   if (TREE_CODE (op2) == SSA_NAME)
2455     def2 = SSA_NAME_DEF_STMT (op2);
2456
2457   if (code != COND_EXPR
2458       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2459     {
2460       if (dump_enabled_p ())
2461         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2462       return NULL;
2463     }
2464
2465   /* Check that one def is the reduction def, defined by PHI,
2466      the other def is either defined in the loop ("vect_internal_def"),
2467      or it's an induction (defined by a loop-header phi-node).  */
2468
2469   if (def2 && def2 == phi
2470       && (code == COND_EXPR
2471           || !def1 || gimple_nop_p (def1)
2472           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2473           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2474               && (is_gimple_assign (def1)
2475                   || is_gimple_call (def1)
2476                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2477                       == vect_induction_def
2478                   || (gimple_code (def1) == GIMPLE_PHI
2479                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2480                           == vect_internal_def
2481                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2482     {
2483       if (dump_enabled_p ())
2484         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2485       return def_stmt;
2486     }
2487
2488   if (def1 && def1 == phi
2489       && (code == COND_EXPR
2490           || !def2 || gimple_nop_p (def2)
2491           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2492           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2493               && (is_gimple_assign (def2)
2494                   || is_gimple_call (def2)
2495                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2496                       == vect_induction_def
2497                   || (gimple_code (def2) == GIMPLE_PHI
2498                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2499                           == vect_internal_def
2500                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2501     {
2502       if (check_reduction)
2503         {
2504           /* Swap operands (just for simplicity - so that the rest of the code
2505              can assume that the reduction variable is always the last (second)
2506              argument).  */
2507           if (dump_enabled_p ())
2508             report_vect_op (MSG_NOTE, def_stmt,
2509                             "detected reduction: need to swap operands: ");
2510
2511           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2512                              gimple_assign_rhs2_ptr (def_stmt));
2513
2514           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2515             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2516         }
2517       else
2518         {
2519           if (dump_enabled_p ())
2520             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2521         }
2522
2523       return def_stmt;
2524     }
2525
2526   /* Try to find SLP reduction chain.  */
2527   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2528     {
2529       if (dump_enabled_p ())
2530         report_vect_op (MSG_NOTE, def_stmt,
2531                         "reduction: detected reduction chain: ");
2532
2533       return def_stmt;
2534     }
2535
2536   if (dump_enabled_p ())
2537     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2538                     "reduction: unknown pattern: ");
2539
2540   return NULL;
2541 }
2542
2543 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2544    in-place.  Arguments as there.  */
2545
2546 static gimple
2547 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2548                           bool check_reduction, bool *double_reduc)
2549 {
2550   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2551                                      double_reduc, false);
2552 }
2553
2554 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2555    in-place if it enables detection of more reductions.  Arguments
2556    as there.  */
2557
2558 gimple
2559 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2560                           bool check_reduction, bool *double_reduc)
2561 {
2562   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2563                                      double_reduc, true);
2564 }
2565
2566 /* Calculate the cost of one scalar iteration of the loop.  */
2567 int
2568 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2569 {
2570   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2571   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2572   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2573   int innerloop_iters, i, stmt_cost;
2574
2575   /* Count statements in scalar loop.  Using this as scalar cost for a single
2576      iteration for now.
2577
2578      TODO: Add outer loop support.
2579
2580      TODO: Consider assigning different costs to different scalar
2581      statements.  */
2582
2583   /* FORNOW.  */
2584   innerloop_iters = 1;
2585   if (loop->inner)
2586     innerloop_iters = 50; /* FIXME */
2587
2588   for (i = 0; i < nbbs; i++)
2589     {
2590       gimple_stmt_iterator si;
2591       basic_block bb = bbs[i];
2592
2593       if (bb->loop_father == loop->inner)
2594         factor = innerloop_iters;
2595       else
2596         factor = 1;
2597
2598       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2599         {
2600           gimple stmt = gsi_stmt (si);
2601           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2602
2603           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2604             continue;
2605
2606           /* Skip stmts that are not vectorized inside the loop.  */
2607           if (stmt_info
2608               && !STMT_VINFO_RELEVANT_P (stmt_info)
2609               && (!STMT_VINFO_LIVE_P (stmt_info)
2610                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2611               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2612             continue;
2613
2614           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2615             {
2616               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2617                stmt_cost = vect_get_stmt_cost (scalar_load);
2618              else
2619                stmt_cost = vect_get_stmt_cost (scalar_store);
2620             }
2621           else
2622             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2623
2624           scalar_single_iter_cost += stmt_cost * factor;
2625         }
2626     }
2627   return scalar_single_iter_cost;
2628 }
2629
2630 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2631 int
2632 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2633                              int *peel_iters_epilogue,
2634                              int scalar_single_iter_cost,
2635                              stmt_vector_for_cost *prologue_cost_vec,
2636                              stmt_vector_for_cost *epilogue_cost_vec)
2637 {
2638   int retval = 0;
2639   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2640
2641   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2642     {
2643       *peel_iters_epilogue = vf/2;
2644       if (dump_enabled_p ())
2645         dump_printf_loc (MSG_NOTE, vect_location,
2646                          "cost model: epilogue peel iters set to vf/2 "
2647                          "because loop iterations are unknown .\n");
2648
2649       /* If peeled iterations are known but number of scalar loop
2650          iterations are unknown, count a taken branch per peeled loop.  */
2651       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2652                                  NULL, 0, vect_prologue);
2653     }
2654   else
2655     {
2656       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2657       peel_iters_prologue = niters < peel_iters_prologue ?
2658                             niters : peel_iters_prologue;
2659       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2660       /* If we need to peel for gaps, but no peeling is required, we have to
2661          peel VF iterations.  */
2662       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2663         *peel_iters_epilogue = vf;
2664     }
2665
2666   if (peel_iters_prologue)
2667     retval += record_stmt_cost (prologue_cost_vec,
2668                                 peel_iters_prologue * scalar_single_iter_cost,
2669                                 scalar_stmt, NULL, 0, vect_prologue);
2670   if (*peel_iters_epilogue)
2671     retval += record_stmt_cost (epilogue_cost_vec,
2672                                 *peel_iters_epilogue * scalar_single_iter_cost,
2673                                 scalar_stmt, NULL, 0, vect_epilogue);
2674   return retval;
2675 }
2676
2677 /* Function vect_estimate_min_profitable_iters
2678
2679    Return the number of iterations required for the vector version of the
2680    loop to be profitable relative to the cost of the scalar version of the
2681    loop.  */
2682
2683 static void
2684 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2685                                     int *ret_min_profitable_niters,
2686                                     int *ret_min_profitable_estimate)
2687 {
2688   int min_profitable_iters;
2689   int min_profitable_estimate;
2690   int peel_iters_prologue;
2691   int peel_iters_epilogue;
2692   unsigned vec_inside_cost = 0;
2693   int vec_outside_cost = 0;
2694   unsigned vec_prologue_cost = 0;
2695   unsigned vec_epilogue_cost = 0;
2696   int scalar_single_iter_cost = 0;
2697   int scalar_outside_cost = 0;
2698   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2699   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2700   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2701
2702   /* Cost model disabled.  */
2703   if (unlimited_cost_model ())
2704     {
2705       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2706       *ret_min_profitable_niters = 0;
2707       *ret_min_profitable_estimate = 0;
2708       return;
2709     }
2710
2711   /* Requires loop versioning tests to handle misalignment.  */
2712   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2713     {
2714       /*  FIXME: Make cost depend on complexity of individual check.  */
2715       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2716       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2717                             vect_prologue);
2718       dump_printf (MSG_NOTE,
2719                    "cost model: Adding cost of checks for loop "
2720                    "versioning to treat misalignment.\n");
2721     }
2722
2723   /* Requires loop versioning with alias checks.  */
2724   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2725     {
2726       /*  FIXME: Make cost depend on complexity of individual check.  */
2727       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2728       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2729                             vect_prologue);
2730       dump_printf (MSG_NOTE,
2731                    "cost model: Adding cost of checks for loop "
2732                    "versioning aliasing.\n");
2733     }
2734
2735   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2736       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2737     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2738                           vect_prologue);
2739
2740   /* Count statements in scalar loop.  Using this as scalar cost for a single
2741      iteration for now.
2742
2743      TODO: Add outer loop support.
2744
2745      TODO: Consider assigning different costs to different scalar
2746      statements.  */
2747
2748   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2749
2750   /* Add additional cost for the peeled instructions in prologue and epilogue
2751      loop.
2752
2753      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2754      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2755
2756      TODO: Build an expression that represents peel_iters for prologue and
2757      epilogue to be used in a run-time test.  */
2758
2759   if (npeel  < 0)
2760     {
2761       peel_iters_prologue = vf/2;
2762       dump_printf (MSG_NOTE, "cost model: "
2763                    "prologue peel iters set to vf/2.\n");
2764
2765       /* If peeling for alignment is unknown, loop bound of main loop becomes
2766          unknown.  */
2767       peel_iters_epilogue = vf/2;
2768       dump_printf (MSG_NOTE, "cost model: "
2769                    "epilogue peel iters set to vf/2 because "
2770                    "peeling for alignment is unknown.\n");
2771
2772       /* If peeled iterations are unknown, count a taken branch and a not taken
2773          branch per peeled loop. Even if scalar loop iterations are known,
2774          vector iterations are not known since peeled prologue iterations are
2775          not known. Hence guards remain the same.  */
2776       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2777                             NULL, 0, vect_prologue);
2778       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2779                             NULL, 0, vect_prologue);
2780       /* FORNOW: Don't attempt to pass individual scalar instructions to
2781          the model; just assume linear cost for scalar iterations.  */
2782       (void) add_stmt_cost (target_cost_data,
2783                             peel_iters_prologue * scalar_single_iter_cost,
2784                             scalar_stmt, NULL, 0, vect_prologue);
2785       (void) add_stmt_cost (target_cost_data,
2786                             peel_iters_epilogue * scalar_single_iter_cost,
2787                             scalar_stmt, NULL, 0, vect_epilogue);
2788     }
2789   else
2790     {
2791       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2792       stmt_info_for_cost *si;
2793       int j;
2794       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2795
2796       prologue_cost_vec.create (2);
2797       epilogue_cost_vec.create (2);
2798       peel_iters_prologue = npeel;
2799
2800       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2801                                           &peel_iters_epilogue,
2802                                           scalar_single_iter_cost,
2803                                           &prologue_cost_vec,
2804                                           &epilogue_cost_vec);
2805
2806       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2807         {
2808           struct _stmt_vec_info *stmt_info
2809             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2810           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2811                                 si->misalign, vect_prologue);
2812         }
2813
2814       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2815         {
2816           struct _stmt_vec_info *stmt_info
2817             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2818           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2819                                 si->misalign, vect_epilogue);
2820         }
2821
2822       prologue_cost_vec.release ();
2823       epilogue_cost_vec.release ();
2824     }
2825
2826   /* FORNOW: The scalar outside cost is incremented in one of the
2827      following ways:
2828
2829      1. The vectorizer checks for alignment and aliasing and generates
2830      a condition that allows dynamic vectorization.  A cost model
2831      check is ANDED with the versioning condition.  Hence scalar code
2832      path now has the added cost of the versioning check.
2833
2834        if (cost > th & versioning_check)
2835          jmp to vector code
2836
2837      Hence run-time scalar is incremented by not-taken branch cost.
2838
2839      2. The vectorizer then checks if a prologue is required.  If the
2840      cost model check was not done before during versioning, it has to
2841      be done before the prologue check.
2842
2843        if (cost <= th)
2844          prologue = scalar_iters
2845        if (prologue == 0)
2846          jmp to vector code
2847        else
2848          execute prologue
2849        if (prologue == num_iters)
2850          go to exit
2851
2852      Hence the run-time scalar cost is incremented by a taken branch,
2853      plus a not-taken branch, plus a taken branch cost.
2854
2855      3. The vectorizer then checks if an epilogue is required.  If the
2856      cost model check was not done before during prologue check, it
2857      has to be done with the epilogue check.
2858
2859        if (prologue == 0)
2860          jmp to vector code
2861        else
2862          execute prologue
2863        if (prologue == num_iters)
2864          go to exit
2865        vector code:
2866          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2867            jmp to epilogue
2868
2869      Hence the run-time scalar cost should be incremented by 2 taken
2870      branches.
2871
2872      TODO: The back end may reorder the BBS's differently and reverse
2873      conditions/branch directions.  Change the estimates below to
2874      something more reasonable.  */
2875
2876   /* If the number of iterations is known and we do not do versioning, we can
2877      decide whether to vectorize at compile time.  Hence the scalar version
2878      do not carry cost model guard costs.  */
2879   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2880       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2881       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2882     {
2883       /* Cost model check occurs at versioning.  */
2884       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2885           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2886         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2887       else
2888         {
2889           /* Cost model check occurs at prologue generation.  */
2890           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2891             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2892               + vect_get_stmt_cost (cond_branch_not_taken);
2893           /* Cost model check occurs at epilogue generation.  */
2894           else
2895             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2896         }
2897     }
2898
2899   /* Complete the target-specific cost calculations.  */
2900   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2901                &vec_inside_cost, &vec_epilogue_cost);
2902
2903   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2904
2905   /* Calculate number of iterations required to make the vector version
2906      profitable, relative to the loop bodies only.  The following condition
2907      must hold true:
2908      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2909      where
2910      SIC = scalar iteration cost, VIC = vector iteration cost,
2911      VOC = vector outside cost, VF = vectorization factor,
2912      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2913      SOC = scalar outside cost for run time cost model check.  */
2914
2915   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2916     {
2917       if (vec_outside_cost <= 0)
2918         min_profitable_iters = 1;
2919       else
2920         {
2921           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2922                                   - vec_inside_cost * peel_iters_prologue
2923                                   - vec_inside_cost * peel_iters_epilogue)
2924                                  / ((scalar_single_iter_cost * vf)
2925                                     - vec_inside_cost);
2926
2927           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2928               <= (((int) vec_inside_cost * min_profitable_iters)
2929                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2930             min_profitable_iters++;
2931         }
2932     }
2933   /* vector version will never be profitable.  */
2934   else
2935     {
2936       if (dump_enabled_p ())
2937         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2938                          "cost model: the vector iteration cost = %d "
2939                          "divided by the scalar iteration cost = %d "
2940                          "is greater or equal to the vectorization factor = %d"
2941                          ".\n",
2942                          vec_inside_cost, scalar_single_iter_cost, vf);
2943       *ret_min_profitable_niters = -1;
2944       *ret_min_profitable_estimate = -1;
2945       return;
2946     }
2947
2948   if (dump_enabled_p ())
2949     {
2950       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2951       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2952                    vec_inside_cost);
2953       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2954                    vec_prologue_cost);
2955       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2956                    vec_epilogue_cost);
2957       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2958                    scalar_single_iter_cost);
2959       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2960                    scalar_outside_cost);
2961       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2962                    vec_outside_cost);
2963       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2964                    peel_iters_prologue);
2965       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2966                    peel_iters_epilogue);
2967       dump_printf (MSG_NOTE,
2968                    "  Calculated minimum iters for profitability: %d\n",
2969                    min_profitable_iters);
2970       dump_printf (MSG_NOTE, "\n");
2971     }
2972
2973   min_profitable_iters =
2974         min_profitable_iters < vf ? vf : min_profitable_iters;
2975
2976   /* Because the condition we create is:
2977      if (niters <= min_profitable_iters)
2978        then skip the vectorized loop.  */
2979   min_profitable_iters--;
2980
2981   if (dump_enabled_p ())
2982     dump_printf_loc (MSG_NOTE, vect_location,
2983                      "  Runtime profitability threshold = %d\n",
2984                      min_profitable_iters);
2985
2986   *ret_min_profitable_niters = min_profitable_iters;
2987
2988   /* Calculate number of iterations required to make the vector version
2989      profitable, relative to the loop bodies only.
2990
2991      Non-vectorized variant is SIC * niters and it must win over vector
2992      variant on the expected loop trip count.  The following condition must hold true:
2993      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2994
2995   if (vec_outside_cost <= 0)
2996     min_profitable_estimate = 1;
2997   else
2998     {
2999       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3000                                  - vec_inside_cost * peel_iters_prologue
3001                                  - vec_inside_cost * peel_iters_epilogue)
3002                                  / ((scalar_single_iter_cost * vf)
3003                                    - vec_inside_cost);
3004     }
3005   min_profitable_estimate --;
3006   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3007   if (dump_enabled_p ())
3008     dump_printf_loc (MSG_NOTE, vect_location,
3009                      "  Static estimate profitability threshold = %d\n",
3010                       min_profitable_iters);
3011
3012   *ret_min_profitable_estimate = min_profitable_estimate;
3013 }
3014
3015
3016 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3017    functions. Design better to avoid maintenance issues.  */
3018
3019 /* Function vect_model_reduction_cost.
3020
3021    Models cost for a reduction operation, including the vector ops
3022    generated within the strip-mine loop, the initial definition before
3023    the loop, and the epilogue code that must be generated.  */
3024
3025 static bool
3026 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3027                            int ncopies)
3028 {
3029   int prologue_cost = 0, epilogue_cost = 0;
3030   enum tree_code code;
3031   optab optab;
3032   tree vectype;
3033   gimple stmt, orig_stmt;
3034   tree reduction_op;
3035   enum machine_mode mode;
3036   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3037   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3038   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3039
3040   /* Cost of reduction op inside loop.  */
3041   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3042                                         stmt_info, 0, vect_body);
3043   stmt = STMT_VINFO_STMT (stmt_info);
3044
3045   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3046     {
3047     case GIMPLE_SINGLE_RHS:
3048       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3049       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3050       break;
3051     case GIMPLE_UNARY_RHS:
3052       reduction_op = gimple_assign_rhs1 (stmt);
3053       break;
3054     case GIMPLE_BINARY_RHS:
3055       reduction_op = gimple_assign_rhs2 (stmt);
3056       break;
3057     case GIMPLE_TERNARY_RHS:
3058       reduction_op = gimple_assign_rhs3 (stmt);
3059       break;
3060     default:
3061       gcc_unreachable ();
3062     }
3063
3064   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3065   if (!vectype)
3066     {
3067       if (dump_enabled_p ())
3068         {
3069           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3070                            "unsupported data-type ");
3071           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3072                              TREE_TYPE (reduction_op));
3073           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3074         }
3075       return false;
3076    }
3077
3078   mode = TYPE_MODE (vectype);
3079   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3080
3081   if (!orig_stmt)
3082     orig_stmt = STMT_VINFO_STMT (stmt_info);
3083
3084   code = gimple_assign_rhs_code (orig_stmt);
3085
3086   /* Add in cost for initial definition.  */
3087   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3088                                   stmt_info, 0, vect_prologue);
3089
3090   /* Determine cost of epilogue code.
3091
3092      We have a reduction operator that will reduce the vector in one statement.
3093      Also requires scalar extract.  */
3094
3095   if (!nested_in_vect_loop_p (loop, orig_stmt))
3096     {
3097       if (reduc_code != ERROR_MARK)
3098         {
3099           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3100                                           stmt_info, 0, vect_epilogue);
3101           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3102                                           stmt_info, 0, vect_epilogue);
3103         }
3104       else
3105         {
3106           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3107           tree bitsize =
3108             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3109           int element_bitsize = tree_to_uhwi (bitsize);
3110           int nelements = vec_size_in_bits / element_bitsize;
3111
3112           optab = optab_for_tree_code (code, vectype, optab_default);
3113
3114           /* We have a whole vector shift available.  */
3115           if (VECTOR_MODE_P (mode)
3116               && optab_handler (optab, mode) != CODE_FOR_nothing
3117               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3118             {
3119               /* Final reduction via vector shifts and the reduction operator.
3120                  Also requires scalar extract.  */
3121               epilogue_cost += add_stmt_cost (target_cost_data,
3122                                               exact_log2 (nelements) * 2,
3123                                               vector_stmt, stmt_info, 0,
3124                                               vect_epilogue);
3125               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3126                                               vec_to_scalar, stmt_info, 0,
3127                                               vect_epilogue);
3128             }
3129           else
3130             /* Use extracts and reduction op for final reduction.  For N
3131                elements, we have N extracts and N-1 reduction ops.  */
3132             epilogue_cost += add_stmt_cost (target_cost_data,
3133                                             nelements + nelements - 1,
3134                                             vector_stmt, stmt_info, 0,
3135                                             vect_epilogue);
3136         }
3137     }
3138
3139   if (dump_enabled_p ())
3140     dump_printf (MSG_NOTE,
3141                  "vect_model_reduction_cost: inside_cost = %d, "
3142                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3143                  prologue_cost, epilogue_cost);
3144
3145   return true;
3146 }
3147
3148
3149 /* Function vect_model_induction_cost.
3150
3151    Models cost for induction operations.

        STMT_INFO is the stmt_vec_info of the induction statement; the cost
        data used is the one attached to its loop_vec_info.  NCOPIES is the
        number of vector statements charged to the loop body.

        Both costs are submitted through add_stmt_cost.  Nothing is returned:
        the values add_stmt_cost hands back are only used for the dump
        message below.  */
3152
3153 static void
3154 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3155 {
3156   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3157   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3158   unsigned inside_cost, prologue_cost;
3159
3160   /* Loop-body cost: NCOPIES vector_stmt entries charged to vect_body.  */
3161   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3162                                stmt_info, 0, vect_body);
3163
3164   /* Prologue cost: two scalar_to_vec entries, for vec_init and vec_step.  */
3165   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3166                                  stmt_info, 0, vect_prologue);
3167
       /* NOTE(review): INSIDE_COST and PROLOGUE_COST are unsigned but are
          printed with %d below.  */
3168   if (dump_enabled_p ())
3169     dump_printf_loc (MSG_NOTE, vect_location,
3170                      "vect_model_induction_cost: inside_cost = %d, "
3171                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3172 }
3173
3174
3175 /* Function get_initial_def_for_induction
3176
3177    Input:
3178    IV_PHI - the phi node of the scalar induction variable, either in the
3179    loop being vectorized or in its inner loop (outer-loop vectorization).
3180
3181    Output:
3182    Return a vector variable holding the vectorized induction variable.
3183    When IV_PHI belongs to the loop being vectorized, its initial value
3184    for an iv with initial value X and step S, and a vector of 4 units, is:
3185    [X, X + S, X + 2*S, X + 3*S].

        When IV_PHI belongs to the inner loop, the initial vector is instead
        the vector def of the phi's preheader argument and the step vector
        is [S, S, S, S].

        Besides computing the return value, the function:
        - creates a new vector phi in the header of the iv's loop and the
          in-loop update 'vec_loop = vec_iv + vec_step', closing the cycle;
        - when VF is larger than the number of vector elements, creates
          NCOPIES - 1 further vector stmts and chains them through
          STMT_VINFO_RELATED_STMT, starting at the new phi;
        - for an inner-loop induction, records the in-loop update as the
          vectorized stmt of the loop-closed exit phi, if there is one;
        - records the new phi in STMT_VINFO_VEC_STMT of IV_PHI.

        The returned tree is the result of the new phi, or a
        VIEW_CONVERT_EXPR of it to the vector type of IV_PHI's result when
        that type differs from the vector type of the initial value.  */
3186
3187 static tree
3188 get_initial_def_for_induction (gimple iv_phi)
3189 {
3190   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3191   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3192   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3193   tree vectype;
3194   int nunits;
3195   edge pe = loop_preheader_edge (loop);
3196   struct loop *iv_loop;
3197   basic_block new_bb;
3198   tree new_vec, vec_init, vec_step, t;
3199   tree access_fn;
3200   tree new_var;
3201   tree new_name;
3202   gimple init_stmt, induction_phi, new_stmt;
3203   tree induc_def, vec_def, vec_dest;
3204   tree init_expr, step_expr;
3205   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3206   int i;
3207   bool ok;
3208   int ncopies;
3209   tree expr;
3210   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3211   bool nested_in_vect_loop = false;
3212   gimple_seq stmts = NULL;
3213   imm_use_iterator imm_iter;
3214   use_operand_p use_p;
3215   gimple exit_phi;
3216   edge latch_e;
3217   tree loop_arg;
3218   gimple_stmt_iterator si;
3219   basic_block bb = gimple_bb (iv_phi);
3220   tree stepvectype;
3221   tree resvectype;
3222
3223   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3224   if (nested_in_vect_loop_p (loop, iv_phi))
3225     {
3226       nested_in_vect_loop = true;
3227       iv_loop = loop->inner;
3228     }
3229   else
3230     iv_loop = loop;
3231   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3232
       /* LOOP_ARG is the value IV_PHI receives over the latch edge; it is
          used further down to look for the loop-closed exit phi.  */
3233   latch_e = loop_latch_edge (iv_loop);
3234   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3235
       /* Obtain the initial value (INIT_EXPR) and step (STEP_EXPR) from the
          scalar evolution of the phi result.  */
3236   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3237   gcc_assert (access_fn);
3238   STRIP_NOPS (access_fn);
3239   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3240                                     &init_expr, &step_expr);
3241   gcc_assert (ok);
       /* From here on PE is the preheader edge of IV_LOOP, which is the
          inner loop's preheader for a nested induction.  */
3242   pe = loop_preheader_edge (iv_loop);
3243
       /* VECTYPE is derived from the type of the initial value, RESVECTYPE
          from the type of the phi result.  NCOPIES is VF divided by the
          number of elements of VECTYPE.  */
3244   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3245   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3246   gcc_assert (vectype);
3247   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3248   ncopies = vf / nunits;
3249
       /* PHI_INFO is the same stmt_vec_info as STMT_VINFO: both are
          vinfo_for_stmt (iv_phi).  */
3250   gcc_assert (phi_info);
3251   gcc_assert (ncopies >= 1);
3252
3253   /* Find the first insertion point in the BB.  */
3254   si = gsi_after_labels (bb);
3255
3256   /* Create the vector that holds the initial_value of the induction.  */
3257   if (nested_in_vect_loop)
3258     {
3259       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3260          been created during vectorization of previous stmts.  We obtain it
3261          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3262       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3263                                            loop_preheader_edge (iv_loop));
3264       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3265       /* If the initial value is not of proper type, convert it.  */
3266       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3267         {
3268           new_stmt = gimple_build_assign_with_ops
3269               (VIEW_CONVERT_EXPR,
3270                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3271                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3272           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3273           gimple_assign_set_lhs (new_stmt, vec_init);
3274           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3275                                                  new_stmt);
3276           gcc_assert (!new_bb);
3277           set_vinfo_for_stmt (new_stmt,
3278                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3279         }
3280     }
3281   else
3282     {
3283       vec<constructor_elt, va_gc> *v;
3284
3285       /* iv_loop is the loop to be vectorized. Create:
3286          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3287       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3288                                        vect_scalar_var, "var_");
3289       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3290                                                      init_expr),
3291                                        &stmts, false, new_var);
3292       if (stmts)
3293         {
3294           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3295           gcc_assert (!new_bb);
3296         }
3297
           /* CONSTANT_P stays true only while every element is a gimple
              min-invariant; it selects build_vector_from_ctor over
              build_constructor below.  */
3298       vec_alloc (v, nunits);
3299       bool constant_p = is_gimple_min_invariant (new_name);
3300       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3301       for (i = 1; i < nunits; i++)
3302         {
3303           /* Create: new_name_i = new_name + step_expr  */
3304           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3305                                   new_name, step_expr);
               /* A non-invariant element needs its own stmt on the preheader
                  edge; an invariant one is used directly.  */
3306           if (!is_gimple_min_invariant (new_name))
3307             {
3308               init_stmt = gimple_build_assign (new_var, new_name);
3309               new_name = make_ssa_name (new_var, init_stmt);
3310               gimple_assign_set_lhs (init_stmt, new_name);
3311               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3312               gcc_assert (!new_bb);
3313               if (dump_enabled_p ())
3314                 {
3315                   dump_printf_loc (MSG_NOTE, vect_location,
3316                                    "created new init_stmt: ");
3317                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3318                   dump_printf (MSG_NOTE, "\n");
3319                 }
3320               constant_p = false;
3321             }
3322           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3323         }
3324       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3325       if (constant_p)
3326         new_vec = build_vector_from_ctor (vectype, v);
3327       else
3328         new_vec = build_constructor (vectype, v);
3329       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3330     }
3331
3332
3333   /* Create the vector that holds the step of the induction.  */
3334   if (nested_in_vect_loop)
3335     /* iv_loop is nested in the loop to be vectorized. Generate:
3336        vec_step = [S, S, S, S]  */
3337     new_name = step_expr;
3338   else
3339     {
3340       /* iv_loop is the loop to be vectorized. Generate:
3341           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
           /* For a floating-point step, VF is built as an integer constant
              and then converted to the step type.  */
3342       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3343         {
3344           expr = build_int_cst (integer_type_node, vf);
3345           expr = fold_convert (TREE_TYPE (step_expr), expr);
3346         }
3347       else
3348         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3349       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3350                               expr, step_expr);
           /* With an SSA_NAME step the product is not a constant; it is
              passed through vect_init_vector with the scalar step type,
              presumably to obtain an SSA name for it (the assert below
              requires a constant or an SSA_NAME).  */
3351       if (TREE_CODE (step_expr) == SSA_NAME)
3352         new_name = vect_init_vector (iv_phi, new_name,
3353                                      TREE_TYPE (step_expr), NULL);
3354     }
3355
3356   t = unshare_expr (new_name);
3357   gcc_assert (CONSTANT_CLASS_P (new_name)
3358               || TREE_CODE (new_name) == SSA_NAME);
3359   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3360   gcc_assert (stepvectype);
3361   new_vec = build_vector_from_val (stepvectype, t);
3362   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3363
3364
3365   /* Create the following def-use cycle:
3366      loop prolog:
3367          vec_init = ...
3368          vec_step = ...
3369      loop:
3370          vec_iv = PHI <vec_init, vec_loop>
3371          ...
3372          STMT
3373          ...
3374          vec_loop = vec_iv + vec_step;  */
3375
3376   /* Create the induction-phi that defines the induction-operand.  */
3377   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3378   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3379   set_vinfo_for_stmt (induction_phi,
3380                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3381   induc_def = PHI_RESULT (induction_phi);
3382
3383   /* Create the iv update inside the loop  */
3384   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3385                                            induc_def, vec_step);
3386   vec_def = make_ssa_name (vec_dest, new_stmt);
3387   gimple_assign_set_lhs (new_stmt, vec_def);
       /* SI was positioned after the labels of IV_PHI's block, so the update
          is inserted before the first non-label stmt of that block.  */
3388   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3389   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3390                                                    NULL));
3391
3392   /* Set the arguments of the phi node:  */
3393   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3394   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3395                UNKNOWN_LOCATION);
3396
3397
3398   /* In case that vectorization factor (VF) is bigger than the number
3399      of elements that we can fit in a vectype (nunits), we have to generate
3400      more than one vector stmt - i.e - we need to "unroll" the
3401      vector stmt by a factor VF/nunits.  For more details see documentation
3402      in vectorizable_operation.  */
3403
3404   if (ncopies > 1)
3405     {
3406       stmt_vec_info prev_stmt_vinfo;
3407       /* FORNOW. This restriction should be relaxed.  */
3408       gcc_assert (!nested_in_vect_loop);
3409
3410       /* Create the per-copy step vector: [nunits*S, ..., nunits*S].  */
3411       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3412         {
3413           expr = build_int_cst (integer_type_node, nunits);
3414           expr = fold_convert (TREE_TYPE (step_expr), expr);
3415         }
3416       else
3417         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3418       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3419                               expr, step_expr);
3420       if (TREE_CODE (step_expr) == SSA_NAME)
3421         new_name = vect_init_vector (iv_phi, new_name,
3422                                      TREE_TYPE (step_expr), NULL);
3423       t = unshare_expr (new_name);
3424       gcc_assert (CONSTANT_CLASS_P (new_name)
3425                   || TREE_CODE (new_name) == SSA_NAME);
3426       new_vec = build_vector_from_val (stepvectype, t);
3427       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3428
           /* Start the chain at the phi result; every further copy adds the
              per-copy step to the previous copy.  */
3429       vec_def = induc_def;
3430       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3431       for (i = 1; i < ncopies; i++)
3432         {
3433           /* vec_i = vec_prev + vec_step  */
3434           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3435                                                    vec_def, vec_step);
3436           vec_def = make_ssa_name (vec_dest, new_stmt);
3437           gimple_assign_set_lhs (new_stmt, vec_def);
3438
3439           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
               /* If the result vector type differs, NEW_STMT becomes a
                  VIEW_CONVERT_EXPR of this copy.  VEC_DEF keeps the
                  unconverted value, which feeds the next copy.  */
3440           if (!useless_type_conversion_p (resvectype, vectype))
3441             {
3442               new_stmt = gimple_build_assign_with_ops
3443                   (VIEW_CONVERT_EXPR,
3444                    vect_get_new_vect_var (resvectype, vect_simple_var,
3445                                           "vec_iv_"),
3446                    build1 (VIEW_CONVERT_EXPR, resvectype,
3447                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3448               gimple_assign_set_lhs (new_stmt,
3449                                      make_ssa_name
3450                                        (gimple_assign_lhs (new_stmt), new_stmt));
3451               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3452             }
               /* Link this copy after the previous one in the related-stmt
                  chain.  */
3453           set_vinfo_for_stmt (new_stmt,
3454                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3455           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3456           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3457         }
3458     }
3459
3460   if (nested_in_vect_loop)
3461     {
3462       /* Find the loop-closed exit-phi of the induction, and record
3463          the final vector of induction results:  */
3464       exit_phi = NULL;
3465       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3466         {
3467           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3468             {
3469               exit_phi = USE_STMT (use_p);
3470               break;
3471             }
3472         }
3473       if (exit_phi)
3474         {
               /* This STMT_VINFO shadows the function-level variable and
                  refers to the exit phi.  */
3475           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3476           /* FORNOW. Currently not supporting the case that an inner-loop induction
3477              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3478           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3479                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3480
               /* NEW_STMT is the in-loop iv update here: NCOPIES > 1 is
                  asserted above not to occur for a nested induction.  */
3481           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3482           if (dump_enabled_p ())
3483             {
3484               dump_printf_loc (MSG_NOTE, vect_location,
3485                                "vector of inductions after inner-loop:");
3486               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3487               dump_printf (MSG_NOTE, "\n");
3488             }
3489         }
3490     }
3491
3492
3493   if (dump_enabled_p ())
3494     {
3495       dump_printf_loc (MSG_NOTE, vect_location,
3496                        "transform induction: created def-use cycle: ");
3497       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3498       dump_printf (MSG_NOTE, "\n");
3499       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3500                         SSA_NAME_DEF_STMT (vec_def), 0);
3501       dump_printf (MSG_NOTE, "\n");
3502     }
3503
       /* Record the vector phi as the vectorized stmt of IV_PHI.  If the
          vector type of the phi result differs, return a VIEW_CONVERT_EXPR
          of the phi result instead, inserted after the labels of IV_PHI's
          block, and let it take over the phi's related-stmt link.  */
3504   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3505   if (!useless_type_conversion_p (resvectype, vectype))
3506     {
3507       new_stmt = gimple_build_assign_with_ops
3508          (VIEW_CONVERT_EXPR,
3509           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3510           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3511       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3512       gimple_assign_set_lhs (new_stmt, induc_def);
3513       si = gsi_after_labels (bb);
3514       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3515       set_vinfo_for_stmt (new_stmt,
3516                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3517       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3518         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3519     }
3520
3521   return induc_def;
3522 }
3523
3524
3525 /* Function get_initial_def_for_reduction
3526
3527    Input:
3528    STMT - a stmt that performs a reduction operation in the loop.
3529    INIT_VAL - the initial value of the reduction variable
3530
3531    Output:
3532    ADJUSTMENT_DEF - if not NULL, set to a tree that holds a value to be
3533         combined with the final result of the reduction (used for adjusting
             the epilog - see below), or to NULL when no adjustment is needed.
3534    Return a vector variable, initialized according to the operation that STMT
3535         performs. This vector will be used as the initial value of the
3536         vector of partial results.
3537
3538    Option1 (adjust in epilog): Initialize the vector as follows:
3539      add/bit or/xor:    [0,0,...,0,0]
3540      mult:              [1,1,...,1,1]
          bit and:           [-1,-1,...,-1,-1]
3541      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3542    and when necessary (e.g. add/mult case) let the caller know
3543    that it needs to adjust the result by init_val.
3544
3545    Option2: Initialize the vector as follows:
3546      add/bit or/xor:    [init_val,0,0,...,0]
3547      mult:              [init_val,1,1,...,1]
          bit and:           [init_val,-1,-1,...,-1]
3548      min/max/cond_expr: [init_val,init_val,...,init_val]
3549    and no adjustments are needed.
3550
        Subtraction, WIDEN_SUM_EXPR and DOT_PROD_EXPR are initialized like
        addition.

3551    For example, for the following code:
3552
3553    s = init_val;
3554    for (i=0;i<n;i++)
3555      s = s + a[i];
3556
3557    STMT is 's = s + a[i]', and the reduction variable is 's'.
3558    For a vector of 4 units, we want to return either [init_val,0,0,0],
3559    or [0,0,0,0] and let the caller know that it needs to adjust
3560    the result at the end by 'init_val'.
3561
3562    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3563    initialization vector is simpler (same element in all entries), if
3564    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3565
3566    A cost model should help decide between these two schemes.

        Special case: for a double reduction (STMT nested in the loop being
        vectorized, INIT_VAL defined by a phi of that loop marked
        vect_double_reduction_def, ADJUSTMENT_DEF not NULL) only a vector
        destination variable is returned and *ADJUSTMENT_DEF is set to
        NULL.  */
3567
3568 tree
3569 get_initial_def_for_reduction (gimple stmt, tree init_val,
3570                                tree *adjustment_def)
3571 {
3572   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3573   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3574   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3575   tree scalar_type = TREE_TYPE (init_val);
3576   tree vectype = get_vectype_for_scalar_type (scalar_type);
3577   int nunits;
3578   enum tree_code code = gimple_assign_rhs_code (stmt);
3579   tree def_for_init;
3580   tree init_def;
3581   tree *elts;
3582   int i;
3583   bool nested_in_vect_loop = false;
3584   tree init_value;
       /* Default neutral element (zero); overridden below for MULT_EXPR and
          BIT_AND_EXPR.  */
3585   REAL_VALUE_TYPE real_init_val = dconst0;
3586   int int_init_val = 0;
3587   gimple def_stmt = NULL;
3588
3589   gcc_assert (vectype);
3590   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3591
3592   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3593               || SCALAR_FLOAT_TYPE_P (scalar_type));
3594
3595   if (nested_in_vect_loop_p (loop, stmt))
3596     nested_in_vect_loop = true;
3597   else
3598     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3599
3600   /* In case of double reduction we only create a vector variable to be put
3601      in the reduction phi node.  The actual statement creation is done in
3602      vect_create_epilog_for_reduction.  */
3603   if (adjustment_def && nested_in_vect_loop
3604       && TREE_CODE (init_val) == SSA_NAME
3605       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3606       && gimple_code (def_stmt) == GIMPLE_PHI
3607       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3608       && vinfo_for_stmt (def_stmt)
3609       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3610           == vect_double_reduction_def)
3611     {
3612       *adjustment_def = NULL;
3613       return vect_create_destination_var (init_val, vectype);
3614     }
3615
       /* INIT_VALUE is only read by the MIN/MAX/COND case below when
          ADJUSTMENT_DEF is NULL.  A constant INIT_VAL is rebuilt as a
          constant of SCALAR_TYPE; otherwise INIT_VAL is used as is.
          NOTE(review): TREE_REAL_CST and TREE_INT_CST_LOW are applied after
          checking only TREE_CONSTANT - confirm that INIT_VAL is always a
          REAL_CST resp. INTEGER_CST on this path.  */
3616   if (TREE_CONSTANT (init_val))
3617     {
3618       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3619         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3620       else
3621         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3622     }
3623   else
3624     init_value = init_val;
3625
3626   switch (code)
3627     {
3628       case WIDEN_SUM_EXPR:
3629       case DOT_PROD_EXPR:
3630       case PLUS_EXPR:
3631       case MINUS_EXPR:
3632       case BIT_IOR_EXPR:
3633       case BIT_XOR_EXPR:
3634       case MULT_EXPR:
3635       case BIT_AND_EXPR:
3636         /* ADJUSTMENT_DEF is NULL when called from
3637            vect_create_epilog_for_reduction to vectorize double reduction.
                Otherwise the adjustment is the vector def of INIT_VAL for a
                reduction nested in the vectorized loop, and the scalar
                INIT_VAL itself for a reduction of the vectorized loop.  */
3638         if (adjustment_def)
3639           {
3640             if (nested_in_vect_loop)
3641               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3642                                                               NULL);
3643             else
3644               *adjustment_def = init_val;
3645           }
3646
             /* Pick the neutral element: 1 for multiplication, -1 for
                bitwise and, and the default 0 for everything else.  */
3647         if (code == MULT_EXPR)
3648           {
3649             real_init_val = dconst1;
3650             int_init_val = 1;
3651           }
3652
3653         if (code == BIT_AND_EXPR)
3654           int_init_val = -1;
3655
3656         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3657           def_for_init = build_real (scalar_type, real_init_val);
3658         else
3659           def_for_init = build_int_cst (scalar_type, int_init_val);
3660
3661         /* Fill elements 1 .. nunits-1 with the neutral value; element 0 is
                set below, depending on the option used.  */
3662         elts = XALLOCAVEC (tree, nunits);
3663         for (i = nunits - 2; i >= 0; --i)
3664           elts[i + 1] = def_for_init;
3665
3666         /* Option1: the first element is the neutral value as well.  */
3667         if (adjustment_def)
3668           {
3669             elts[0] = def_for_init;
3670             init_def = build_vector (vectype, elts);
3671             break;
3672           }
3673
3674         /* Option2: the first element is INIT_VAL.  A constant INIT_VAL goes
                through build_vector, anything else through a CONSTRUCTOR.  */
3675         elts[0] = init_val;
3676         if (TREE_CONSTANT (init_val))
3677           init_def = build_vector (vectype, elts);
3678         else
3679           {
3680             vec<constructor_elt, va_gc> *v;
3681             vec_alloc (v, nunits);
3682             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3683             for (i = 1; i < nunits; ++i)
3684               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3685             init_def = build_constructor (vectype, v);
3686           }
3687
3688         break;
3689
3690       case MIN_EXPR:
3691       case MAX_EXPR:
3692       case COND_EXPR:
             /* No epilog adjustment is requested for these codes.  With
                ADJUSTMENT_DEF the initial vector is the vector def obtained
                for INIT_VAL.  */
3693         if (adjustment_def)
3694           {
3695             *adjustment_def = NULL_TREE;
3696             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3697             break;
3698           }
3699
             /* Without ADJUSTMENT_DEF, replicate INIT_VALUE in every
                element.  */
3700         init_def = build_vector_from_val (vectype, init_value);
3701         break;
3702
3703       default:
3704         gcc_unreachable ();
3705     }
3706
3707   return init_def;
3708 }
3709
3710
3711 /* Function vect_create_epilog_for_reduction
3712
3713    Create code at the loop-epilog to finalize the result of a reduction
3714    computation.
3715
3716    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of the
3717      vector reduction statements.
3718    STMT is the scalar reduction stmt that is being vectorized.
3719    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3720      number of elements that we can fit in a vectype (nunits).  In this case
3721      we have to generate more than one vector stmt - i.e - we need to "unroll"
3722      the vector stmt by a factor VF/nunits.  For more details see documentation
3723      in vectorizable_operation.
3724    REDUC_CODE is the tree-code for the epilog reduction.
3725    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3726      computation.
3727    REDUC_INDEX is the index of the operand in the right hand side of the
3728      statement that is defined by REDUCTION_PHI.
3729    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3730    SLP_NODE is an SLP node containing a group of reduction statements. The
3731      first one in this group is STMT.
3732
3733    This function:
3734    1. Creates the reduction def-use cycles: sets the arguments for
3735       REDUCTION_PHIS:
3736       The loop-entry argument is the vectorized initial-value of the reduction.
3737       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3738       sums.
3739    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3740       by applying the operation specified by REDUC_CODE if available, or by
3741       other means (whole-vector shifts or a scalar loop).
3742       The function also creates a new phi node at the loop exit to preserve
3743       loop-closed form, as illustrated below.
3744
3745      The flow at the entry to this function:
3746
3747         loop:
3748           vec_def = phi <null, null>            # REDUCTION_PHI
3749           VECT_DEF = vector_stmt                # vectorized form of STMT
3750           s_loop = scalar_stmt                  # (scalar) STMT
3751         loop_exit:
3752           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3753           use <s_out0>
3754           use <s_out0>
3755
3756      The above is transformed by this function into:
3757
3758         loop:
3759           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3760           VECT_DEF = vector_stmt                # vectorized form of STMT
3761           s_loop = scalar_stmt                  # (scalar) STMT
3762         loop_exit:
3763           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3764           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3765           v_out2 = reduce <v_out1>
3766           s_out3 = extract_field <v_out2, 0>
3767           s_out4 = adjust_result <s_out3>
3768           use <s_out4>
3769           use <s_out4>
3770 */
3771
3772 static void
3773 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3774                                   int ncopies, enum tree_code reduc_code,
3775                                   vec<gimple> reduction_phis,
3776                                   int reduc_index, bool double_reduc,
3777                                   slp_tree slp_node)
3778 {
3779   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3780   stmt_vec_info prev_phi_info;
3781   tree vectype;
3782   enum machine_mode mode;
3783   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3784   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3785   basic_block exit_bb;
3786   tree scalar_dest;
3787   tree scalar_type;
3788   gimple new_phi = NULL, phi;
3789   gimple_stmt_iterator exit_gsi;
3790   tree vec_dest;
3791   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3792   gimple epilog_stmt = NULL;
3793   enum tree_code code = gimple_assign_rhs_code (stmt);
3794   gimple exit_phi;
3795   tree bitsize, bitpos;
3796   tree adjustment_def = NULL;
3797   tree vec_initial_def = NULL;
3798   tree reduction_op, expr, def;
3799   tree orig_name, scalar_result;
3800   imm_use_iterator imm_iter, phi_imm_iter;
3801   use_operand_p use_p, phi_use_p;
3802   bool extract_scalar_result = false;
3803   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3804   bool nested_in_vect_loop = false;
3805   vec<gimple> new_phis = vNULL;
3806   vec<gimple> inner_phis = vNULL;
3807   enum vect_def_type dt = vect_unknown_def_type;
3808   int j, i;
3809   vec<tree> scalar_results = vNULL;
3810   unsigned int group_size = 1, k, ratio;
3811   vec<tree> vec_initial_defs = vNULL;
3812   vec<gimple> phis;
3813   bool slp_reduc = false;
3814   tree new_phi_result;
3815   gimple inner_phi = NULL;
3816
3817   if (slp_node)
3818     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3819
3820   if (nested_in_vect_loop_p (loop, stmt))
3821     {
3822       outer_loop = loop;
3823       loop = loop->inner;
3824       nested_in_vect_loop = true;
3825       gcc_assert (!slp_node);
3826     }
3827
3828   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3829     {
3830     case GIMPLE_SINGLE_RHS:
3831       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3832                   == ternary_op);
3833       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3834       break;
3835     case GIMPLE_UNARY_RHS:
3836       reduction_op = gimple_assign_rhs1 (stmt);
3837       break;
3838     case GIMPLE_BINARY_RHS:
3839       reduction_op = reduc_index ?
3840                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3841       break;
3842     case GIMPLE_TERNARY_RHS:
3843       reduction_op = gimple_op (stmt, reduc_index + 1);
3844       break;
3845     default:
3846       gcc_unreachable ();
3847     }
3848
3849   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3850   gcc_assert (vectype);
3851   mode = TYPE_MODE (vectype);
3852
3853   /* 1. Create the reduction def-use cycle:
3854      Set the arguments of REDUCTION_PHIS, i.e., transform
3855
3856         loop:
3857           vec_def = phi <null, null>            # REDUCTION_PHI
3858           VECT_DEF = vector_stmt                # vectorized form of STMT
3859           ...
3860
3861      into:
3862
3863         loop:
3864           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3865           VECT_DEF = vector_stmt                # vectorized form of STMT
3866           ...
3867
3868      (in case of SLP, do it for all the phis). */
3869
3870   /* Get the loop-entry arguments.  */
3871   if (slp_node)
3872     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3873                        NULL, slp_node, reduc_index);
3874   else
3875     {
3876       vec_initial_defs.create (1);
3877      /* For the case of reduction, vect_get_vec_def_for_operand returns
3878         the scalar def before the loop, that defines the initial value
3879         of the reduction variable.  */
3880       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3881                                                       &adjustment_def);
3882       vec_initial_defs.quick_push (vec_initial_def);
3883     }
3884
3885   /* Set phi nodes arguments.  */
3886   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3887     {
3888       tree vec_init_def = vec_initial_defs[i];
3889       tree def = vect_defs[i];
3890       for (j = 0; j < ncopies; j++)
3891         {
3892           /* Set the loop-entry arg of the reduction-phi.  */
3893           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3894                        UNKNOWN_LOCATION);
3895
3896           /* Set the loop-latch arg for the reduction-phi.  */
3897           if (j > 0)
3898             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3899
3900           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3901
3902           if (dump_enabled_p ())
3903             {
3904               dump_printf_loc (MSG_NOTE, vect_location,
3905                                "transform reduction: created def-use cycle: ");
3906               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3907               dump_printf (MSG_NOTE, "\n");
3908               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3909               dump_printf (MSG_NOTE, "\n");
3910             }
3911
3912           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3913         }
3914     }
3915
3916   vec_initial_defs.release ();
3917
3918   /* 2. Create epilog code.
3919         The reduction epilog code operates across the elements of the vector
3920         of partial results computed by the vectorized loop.
3921         The reduction epilog code consists of:
3922
3923         step 1: compute the scalar result in a vector (v_out2)
3924         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3925         step 3: adjust the scalar result (s_out3) if needed.
3926
3927         Step 1 can be accomplished using one of the following three schemes:
3928           (scheme 1) using reduc_code, if available.
3929           (scheme 2) using whole-vector shifts, if available.
3930           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3931                      combined.
3932
3933           The overall epilog code looks like this:
3934
3935           s_out0 = phi <s_loop>         # original EXIT_PHI
3936           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3937           v_out2 = reduce <v_out1>              # step 1
3938           s_out3 = extract_field <v_out2, 0>    # step 2
3939           s_out4 = adjust_result <s_out3>       # step 3
3940
3941           (step 3 is optional, and steps 1 and 2 may be combined).
3942           Lastly, the uses of s_out0 are replaced by s_out4.  */
3943
3944
3945   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3946          v_out1 = phi <VECT_DEF>
3947          Store them in NEW_PHIS.  */
3948
3949   exit_bb = single_exit (loop)->dest;
3950   prev_phi_info = NULL;
3951   new_phis.create (vect_defs.length ());
3952   FOR_EACH_VEC_ELT (vect_defs, i, def)
3953     {
3954       for (j = 0; j < ncopies; j++)
3955         {
3956           tree new_def = copy_ssa_name (def, NULL);
3957           phi = create_phi_node (new_def, exit_bb);
3958           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3959           if (j == 0)
3960             new_phis.quick_push (phi);
3961           else
3962             {
3963               def = vect_get_vec_def_for_stmt_copy (dt, def);
3964               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3965             }
3966
3967           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3968           prev_phi_info = vinfo_for_stmt (phi);
3969         }
3970     }
3971
3972   /* The epilogue is created for the outer-loop, i.e., for the loop being
3973      vectorized.  Create exit phis for the outer loop.  */
3974   if (double_reduc)
3975     {
3976       loop = outer_loop;
3977       exit_bb = single_exit (loop)->dest;
3978       inner_phis.create (vect_defs.length ());
3979       FOR_EACH_VEC_ELT (new_phis, i, phi)
3980         {
3981           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3982           gimple outer_phi = create_phi_node (new_result, exit_bb);
3983           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3984                            PHI_RESULT (phi));
3985           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3986                                                             loop_vinfo, NULL));
3987           inner_phis.quick_push (phi);
3988           new_phis[i] = outer_phi;
3989           prev_phi_info = vinfo_for_stmt (outer_phi);
3990           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3991             {
3992               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3993               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3994               outer_phi = create_phi_node (new_result, exit_bb);
3995               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3996                                PHI_RESULT (phi));
3997               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3998                                                         loop_vinfo, NULL));
3999               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4000               prev_phi_info = vinfo_for_stmt (outer_phi);
4001             }
4002         }
4003     }
4004
4005   exit_gsi = gsi_after_labels (exit_bb);
4006
4007   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4008          (i.e. when reduc_code is not available) and in the final adjustment
4009          code (if needed).  Also get the original scalar reduction variable as
4010          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4011          represents a reduction pattern), the tree-code and scalar-def are
4012          taken from the original stmt that the pattern-stmt (STMT) replaces.
4013          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4014          are taken from STMT.  */
4015
4016   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4017   if (!orig_stmt)
4018     {
4019       /* Regular reduction  */
4020       orig_stmt = stmt;
4021     }
4022   else
4023     {
4024       /* Reduction pattern  */
4025       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4026       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4027       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4028     }
4029
4030   code = gimple_assign_rhs_code (orig_stmt);
4031   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4032      partial results are added and not subtracted.  */
4033   if (code == MINUS_EXPR)
4034     code = PLUS_EXPR;
4035
4036   scalar_dest = gimple_assign_lhs (orig_stmt);
4037   scalar_type = TREE_TYPE (scalar_dest);
4038   scalar_results.create (group_size);
4039   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4040   bitsize = TYPE_SIZE (scalar_type);
4041
4042   /* In case this is a reduction in an inner-loop while vectorizing an outer
4043      loop - we don't need to extract a single scalar result at the end of the
4044      inner-loop (unless it is double reduction, i.e., the use of reduction is
4045      outside the outer-loop).  The final vector of partial results will be used
4046      in the vectorized outer-loop, or reduced to a scalar result at the end of
4047      the outer-loop.  */
4048   if (nested_in_vect_loop && !double_reduc)
4049     goto vect_finalize_reduction;
4050
4051   /* SLP reduction without reduction chain, e.g.,
4052      # a1 = phi <a2, a0>
4053      # b1 = phi <b2, b0>
4054      a2 = operation (a1)
4055      b2 = operation (b1)  */
4056   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4057
4058   /* In case of reduction chain, e.g.,
4059      # a1 = phi <a3, a0>
4060      a2 = operation (a1)
4061      a3 = operation (a2),
4062
4063      we may end up with more than one vector result.  Here we reduce them to
4064      one vector.  */
4065   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4066     {
4067       tree first_vect = PHI_RESULT (new_phis[0]);
4068       tree tmp;
4069       gimple new_vec_stmt = NULL;
4070
4071       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4072       for (k = 1; k < new_phis.length (); k++)
4073         {
4074           gimple next_phi = new_phis[k];
4075           tree second_vect = PHI_RESULT (next_phi);
4076
4077           tmp = build2 (code, vectype,  first_vect, second_vect);
4078           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4079           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4080           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4081           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4082         }
4083
4084       new_phi_result = first_vect;
4085       if (new_vec_stmt)
4086         {
4087           new_phis.truncate (0);
4088           new_phis.safe_push (new_vec_stmt);
4089         }
4090     }
4091   else
4092     new_phi_result = PHI_RESULT (new_phis[0]);
4093
4094   /* 2.3 Create the reduction code, using one of the three schemes described
4095          above. In SLP we simply need to extract all the elements from the
4096          vector (without reducing them), so we use scalar shifts.  */
4097   if (reduc_code != ERROR_MARK && !slp_reduc)
4098     {
4099       tree tmp;
4100
4101       /*** Case 1:  Create:
4102            v_out2 = reduc_expr <v_out1>  */
4103
4104       if (dump_enabled_p ())
4105         dump_printf_loc (MSG_NOTE, vect_location,
4106                          "Reduce using direct vector reduction.\n");
4107
4108       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4109       tmp = build1 (reduc_code, vectype, new_phi_result);
4110       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4111       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4112       gimple_assign_set_lhs (epilog_stmt, new_temp);
4113       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4114
4115       extract_scalar_result = true;
4116     }
4117   else
4118     {
4119       enum tree_code shift_code = ERROR_MARK;
4120       bool have_whole_vector_shift = true;
4121       int bit_offset;
4122       int element_bitsize = tree_to_uhwi (bitsize);
4123       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4124       tree vec_temp;
4125
4126       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4127         shift_code = VEC_RSHIFT_EXPR;
4128       else
4129         have_whole_vector_shift = false;
4130
4131       /* Regardless of whether we have a whole vector shift, if we're
4132          emulating the operation via tree-vect-generic, we don't want
4133          to use it.  Only the first round of the reduction is likely
4134          to still be profitable via emulation.  */
4135       /* ??? It might be better to emit a reduction tree code here, so that
4136          tree-vect-generic can expand the first round via bit tricks.  */
4137       if (!VECTOR_MODE_P (mode))
4138         have_whole_vector_shift = false;
4139       else
4140         {
4141           optab optab = optab_for_tree_code (code, vectype, optab_default);
4142           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4143             have_whole_vector_shift = false;
4144         }
4145
4146       if (have_whole_vector_shift && !slp_reduc)
4147         {
4148           /*** Case 2: Create:
4149              for (offset = VS/2; offset >= element_size; offset/=2)
4150                 {
4151                   Create:  va' = vec_shift <va, offset>
4152                   Create:  va = vop <va, va'>
4153                 }  */
4154
4155           if (dump_enabled_p ())
4156             dump_printf_loc (MSG_NOTE, vect_location,
4157                              "Reduce using vector shifts\n");
4158
4159           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4160           new_temp = new_phi_result;
4161           for (bit_offset = vec_size_in_bits/2;
4162                bit_offset >= element_bitsize;
4163                bit_offset /= 2)
4164             {
4165               tree bitpos = size_int (bit_offset);
4166
4167               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4168                                                vec_dest, new_temp, bitpos);
4169               new_name = make_ssa_name (vec_dest, epilog_stmt);
4170               gimple_assign_set_lhs (epilog_stmt, new_name);
4171               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4172
4173               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4174                                                           new_name, new_temp);
4175               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4176               gimple_assign_set_lhs (epilog_stmt, new_temp);
4177               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4178             }
4179
4180           extract_scalar_result = true;
4181         }
4182       else
4183         {
4184           tree rhs;
4185
4186           /*** Case 3: Create:
4187              s = extract_field <v_out2, 0>
4188              for (offset = element_size;
4189                   offset < vector_size;
4190                   offset += element_size;)
4191                {
4192                  Create:  s' = extract_field <v_out2, offset>
4193                  Create:  s = op <s, s'>  // For non SLP cases
4194                }  */
4195
4196           if (dump_enabled_p ())
4197             dump_printf_loc (MSG_NOTE, vect_location,
4198                              "Reduce using scalar code.\n");
4199
4200           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4201           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4202             {
4203               if (gimple_code (new_phi) == GIMPLE_PHI)
4204                 vec_temp = PHI_RESULT (new_phi);
4205               else
4206                 vec_temp = gimple_assign_lhs (new_phi);
4207               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4208                             bitsize_zero_node);
4209               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4210               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4211               gimple_assign_set_lhs (epilog_stmt, new_temp);
4212               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4213
4214               /* In SLP we don't need to apply reduction operation, so we just
4215                  collect s' values in SCALAR_RESULTS.  */
4216               if (slp_reduc)
4217                 scalar_results.safe_push (new_temp);
4218
4219               for (bit_offset = element_bitsize;
4220                    bit_offset < vec_size_in_bits;
4221                    bit_offset += element_bitsize)
4222                 {
4223                   tree bitpos = bitsize_int (bit_offset);
4224                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4225                                      bitsize, bitpos);
4226
4227                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4228                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4229                   gimple_assign_set_lhs (epilog_stmt, new_name);
4230                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4231
4232                   if (slp_reduc)
4233                     {
4234                       /* In SLP we don't need to apply reduction operation, so
4235                          we just collect s' values in SCALAR_RESULTS.  */
4236                       new_temp = new_name;
4237                       scalar_results.safe_push (new_name);
4238                     }
4239                   else
4240                     {
4241                       epilog_stmt = gimple_build_assign_with_ops (code,
4242                                           new_scalar_dest, new_name, new_temp);
4243                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4244                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4245                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4246                     }
4247                 }
4248             }
4249
4250           /* The only case where we need to reduce scalar results in SLP, is
4251              unrolling.  If the size of SCALAR_RESULTS is greater than
4252              GROUP_SIZE, we reduce them combining elements modulo
4253              GROUP_SIZE.  */
4254           if (slp_reduc)
4255             {
4256               tree res, first_res, new_res;
4257               gimple new_stmt;
4258
4259               /* Reduce multiple scalar results in case of SLP unrolling.  */
4260               for (j = group_size; scalar_results.iterate (j, &res);
4261                    j++)
4262                 {
4263                   first_res = scalar_results[j % group_size];
4264                   new_stmt = gimple_build_assign_with_ops (code,
4265                                               new_scalar_dest, first_res, res);
4266                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4267                   gimple_assign_set_lhs (new_stmt, new_res);
4268                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4269                   scalar_results[j % group_size] = new_res;
4270                 }
4271             }
4272           else
4273             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4274             scalar_results.safe_push (new_temp);
4275
4276           extract_scalar_result = false;
4277         }
4278     }
4279
4280   /* 2.4  Extract the final scalar result.  Create:
4281           s_out3 = extract_field <v_out2, bitpos>  */
4282
4283   if (extract_scalar_result)
4284     {
4285       tree rhs;
4286
4287       if (dump_enabled_p ())
4288         dump_printf_loc (MSG_NOTE, vect_location,
4289                          "extract scalar result\n");
4290
4291       if (BYTES_BIG_ENDIAN)
4292         bitpos = size_binop (MULT_EXPR,
4293                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4294                              TYPE_SIZE (scalar_type));
4295       else
4296         bitpos = bitsize_zero_node;
4297
4298       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4299       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4300       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4301       gimple_assign_set_lhs (epilog_stmt, new_temp);
4302       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4303       scalar_results.safe_push (new_temp);
4304     }
4305
4306 vect_finalize_reduction:
4307
4308   if (double_reduc)
4309     loop = loop->inner;
4310
4311   /* 2.5 Adjust the final result by the initial value of the reduction
4312          variable. (When such adjustment is not needed, then
4313          'adjustment_def' is zero).  For example, if code is PLUS we create:
4314          new_temp = loop_exit_def + adjustment_def  */
4315
4316   if (adjustment_def)
4317     {
4318       gcc_assert (!slp_reduc);
4319       if (nested_in_vect_loop)
4320         {
4321           new_phi = new_phis[0];
4322           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4323           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4324           new_dest = vect_create_destination_var (scalar_dest, vectype);
4325         }
4326       else
4327         {
4328           new_temp = scalar_results[0];
4329           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4330           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4331           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4332         }
4333
4334       epilog_stmt = gimple_build_assign (new_dest, expr);
4335       new_temp = make_ssa_name (new_dest, epilog_stmt);
4336       gimple_assign_set_lhs (epilog_stmt, new_temp);
4337       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4338       if (nested_in_vect_loop)
4339         {
4340           set_vinfo_for_stmt (epilog_stmt,
4341                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4342                                                  NULL));
4343           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4344                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4345
4346           if (!double_reduc)
4347             scalar_results.quick_push (new_temp);
4348           else
4349             scalar_results[0] = new_temp;
4350         }
4351       else
4352         scalar_results[0] = new_temp;
4353
4354       new_phis[0] = epilog_stmt;
4355     }
4356
4357   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4358           phis with new adjusted scalar results, i.e., replace use <s_out0>
4359           with use <s_out4>.
4360
4361      Transform:
4362         loop_exit:
4363           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4364           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4365           v_out2 = reduce <v_out1>
4366           s_out3 = extract_field <v_out2, 0>
4367           s_out4 = adjust_result <s_out3>
4368           use <s_out0>
4369           use <s_out0>
4370
4371      into:
4372
4373         loop_exit:
4374           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4375           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4376           v_out2 = reduce <v_out1>
4377           s_out3 = extract_field <v_out2, 0>
4378           s_out4 = adjust_result <s_out3>
4379           use <s_out4>
4380           use <s_out4> */
4381
4382
4383   /* In SLP reduction chain we reduce vector results into one vector if
4384      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4385      the last stmt in the reduction chain, since we are looking for the loop
4386      exit phi node.  */
4387   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4388     {
4389       scalar_dest = gimple_assign_lhs (
4390                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4391       group_size = 1;
4392     }
4393
4394   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4395      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4396      need to match SCALAR_RESULTS with corresponding statements.  The first
4397      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4398      the first vector stmt, etc.
4399      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4400   if (group_size > new_phis.length ())
4401     {
4402       ratio = group_size / new_phis.length ();
4403       gcc_assert (!(group_size % new_phis.length ()));
4404     }
4405   else
4406     ratio = 1;
4407
4408   for (k = 0; k < group_size; k++)
4409     {
4410       if (k % ratio == 0)
4411         {
4412           epilog_stmt = new_phis[k / ratio];
4413           reduction_phi = reduction_phis[k / ratio];
4414           if (double_reduc)
4415             inner_phi = inner_phis[k / ratio];
4416         }
4417
4418       if (slp_reduc)
4419         {
4420           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4421
4422           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4423           /* SLP statements can't participate in patterns.  */
4424           gcc_assert (!orig_stmt);
4425           scalar_dest = gimple_assign_lhs (current_stmt);
4426         }
4427
4428       phis.create (3);
4429       /* Find the loop-closed-use at the loop exit of the original scalar
4430          result.  (The reduction result is expected to have two immediate uses -
4431          one at the latch block, and one at the loop exit).  */
4432       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4433         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4434             && !is_gimple_debug (USE_STMT (use_p)))
4435           phis.safe_push (USE_STMT (use_p));
4436
4437       /* While we expect to have found an exit_phi because of loop-closed-ssa
4438          form we can end up without one if the scalar cycle is dead.  */
4439
4440       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4441         {
4442           if (outer_loop)
4443             {
4444               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4445               gimple vect_phi;
4446
4447               /* FORNOW. Currently not supporting the case that an inner-loop
4448                  reduction is not used in the outer-loop (but only outside the
4449                  outer-loop), unless it is double reduction.  */
4450               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4451                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4452                           || double_reduc);
4453
4454               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4455               if (!double_reduc
4456                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4457                       != vect_double_reduction_def)
4458                 continue;
4459
4460               /* Handle double reduction:
4461
4462                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4463                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4464                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4465                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4466
4467                  At that point the regular reduction (stmt2 and stmt3) is
4468                  already vectorized, as well as the exit phi node, stmt4.
4469                  Here we vectorize the phi node of double reduction, stmt1, and
4470                  update all relevant statements.  */
4471
4472               /* Go through all the uses of s2 to find double reduction phi
4473                  node, i.e., stmt1 above.  */
4474               orig_name = PHI_RESULT (exit_phi);
4475               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4476                 {
4477                   stmt_vec_info use_stmt_vinfo;
4478                   stmt_vec_info new_phi_vinfo;
4479                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4480                   basic_block bb = gimple_bb (use_stmt);
4481                   gimple use;
4482
4483                   /* Check that USE_STMT is really double reduction phi
4484                      node.  */
4485                   if (gimple_code (use_stmt) != GIMPLE_PHI
4486                       || gimple_phi_num_args (use_stmt) != 2
4487                       || bb->loop_father != outer_loop)
4488                     continue;
4489                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4490                   if (!use_stmt_vinfo
4491                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4492                           != vect_double_reduction_def)
4493                     continue;
4494
4495                   /* Create vector phi node for double reduction:
4496                      vs1 = phi <vs0, vs2>
4497                      vs1 was created previously in this function by a call to
4498                        vect_get_vec_def_for_operand and is stored in
4499                        vec_initial_def;
4500                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4501                      vs0 is created here.  */
4502
4503                   /* Create vector phi node.  */
4504                   vect_phi = create_phi_node (vec_initial_def, bb);
4505                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4506                                     loop_vec_info_for_loop (outer_loop), NULL);
4507                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4508
4509                   /* Create vs0 - initial def of the double reduction phi.  */
4510                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4511                                              loop_preheader_edge (outer_loop));
4512                   init_def = get_initial_def_for_reduction (stmt,
4513                                                           preheader_arg, NULL);
4514                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4515                                                     vectype, NULL);
4516
4517                   /* Update phi node arguments with vs0 and vs2.  */
4518                   add_phi_arg (vect_phi, vect_phi_init,
4519                                loop_preheader_edge (outer_loop),
4520                                UNKNOWN_LOCATION);
4521                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4522                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4523                   if (dump_enabled_p ())
4524                     {
4525                       dump_printf_loc (MSG_NOTE, vect_location,
4526                                        "created double reduction phi node: ");
4527                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4528                       dump_printf (MSG_NOTE, "\n");
4529                     }
4530
4531                   vect_phi_res = PHI_RESULT (vect_phi);
4532
4533                   /* Replace the use, i.e., set the correct vs1 in the regular
4534                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4535                      loop is redundant.  */
4536                   use = reduction_phi;
4537                   for (j = 0; j < ncopies; j++)
4538                     {
4539                       edge pr_edge = loop_preheader_edge (loop);
4540                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4541                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4542                     }
4543                 }
4544             }
4545         }
4546
4547       phis.release ();
4548       if (nested_in_vect_loop)
4549         {
4550           if (double_reduc)
4551             loop = outer_loop;
4552           else
4553             continue;
4554         }
4555
4556       phis.create (3);
4557       /* Find the loop-closed-use at the loop exit of the original scalar
4558          result.  (The reduction result is expected to have two immediate uses,
4559          one at the latch block, and one at the loop exit).  For double
4560          reductions we are looking for exit phis of the outer loop.  */
4561       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4562         {
4563           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4564             {
4565               if (!is_gimple_debug (USE_STMT (use_p)))
4566                 phis.safe_push (USE_STMT (use_p));
4567             }
4568           else
4569             {
4570               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4571                 {
4572                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4573
4574                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4575                     {
4576                       if (!flow_bb_inside_loop_p (loop,
4577                                              gimple_bb (USE_STMT (phi_use_p)))
4578                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4579                         phis.safe_push (USE_STMT (phi_use_p));
4580                     }
4581                 }
4582             }
4583         }
4584
4585       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4586         {
4587           /* Replace the uses:  */
4588           orig_name = PHI_RESULT (exit_phi);
4589           scalar_result = scalar_results[k];
4590           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4591             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4592               SET_USE (use_p, scalar_result);
4593         }
4594
4595       phis.release ();
4596     }
4597
4598   scalar_results.release ();
4599   inner_phis.release ();
4600   new_phis.release ();
4601 }
4602
4603
4604 /* Function vectorizable_reduction.
4605
4606    Check if STMT performs a reduction operation that can be vectorized.
4607    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4608    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4609    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4610
4611    This function also handles reduction idioms (patterns) that have been
4612    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4613    of this form:
4614      X = pattern_expr (arg0, arg1, ..., X)
4615    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4616    sequence that had been detected and replaced by the pattern-stmt (STMT).
4617
4618    In some cases of reduction patterns, the type of the reduction variable X is
4619    different than the type of the other arguments of STMT.
4620    In such cases, the vectype that is used when transforming STMT into a vector
4621    stmt is different than the vectype that is used to determine the
4622    vectorization factor, because it consists of a different number of elements
4623    than the actual number of elements that are being operated upon in parallel.
4624
4625    For example, consider an accumulation of shorts into an int accumulator.
4626    On some targets it's possible to vectorize this pattern operating on 8
4627    shorts at a time (hence, the vectype for purposes of determining the
4628    vectorization factor should be V8HI); on the other hand, the vectype that
4629    is used to create the vector form is actually V4SI (the type of the result).
4630
4631    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4632    indicates what is the actual level of parallelism (V8HI in the example), so
4633    that the right vectorization factor would be derived.  This vectype
4634    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4635    be used to create the vectorized stmt.  The right vectype for the vectorized
4636    stmt is obtained from the type of the result X:
4637         get_vectype_for_scalar_type (TREE_TYPE (X))
4638
4639    This means that, contrary to "regular" reductions (or "regular" stmts in
4640    general), the following equation:
4641       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4642    does *NOT* necessarily hold for reduction patterns.  */
4643
4644 bool
4645 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4646                         gimple *vec_stmt, slp_tree slp_node)
4647 {
4648   tree vec_dest;
4649   tree scalar_dest;
4650   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4651   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4652   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4653   tree vectype_in = NULL_TREE;
4654   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4655   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4656   enum tree_code code, orig_code, epilog_reduc_code;
4657   enum machine_mode vec_mode;
4658   int op_type;
4659   optab optab, reduc_optab;
4660   tree new_temp = NULL_TREE;
4661   tree def;
4662   gimple def_stmt;
4663   enum vect_def_type dt;
4664   gimple new_phi = NULL;
4665   tree scalar_type;
4666   bool is_simple_use;
4667   gimple orig_stmt;
4668   stmt_vec_info orig_stmt_info;
4669   tree expr = NULL_TREE;
4670   int i;
4671   int ncopies;
4672   int epilog_copies;
4673   stmt_vec_info prev_stmt_info, prev_phi_info;
4674   bool single_defuse_cycle = false;
4675   tree reduc_def = NULL_TREE;
4676   gimple new_stmt = NULL;
4677   int j;
4678   tree ops[3];
4679   bool nested_cycle = false, found_nested_cycle_def = false;
4680   gimple reduc_def_stmt = NULL;
4681   /* The default is that the reduction variable is the last in statement.  */
4682   int reduc_index = 2;
4683   bool double_reduc = false, dummy;
4684   basic_block def_bb;
4685   struct loop * def_stmt_loop, *outer_loop = NULL;
4686   tree def_arg;
4687   gimple def_arg_stmt;
4688   vec<tree> vec_oprnds0 = vNULL;
4689   vec<tree> vec_oprnds1 = vNULL;
4690   vec<tree> vect_defs = vNULL;
4691   vec<gimple> phis = vNULL;
4692   int vec_num;
4693   tree def0, def1, tem, op0, op1 = NULL_TREE;
4694
4695   /* In case of reduction chain we switch to the first stmt in the chain, but
4696      we don't update STMT_INFO, since only the last stmt is marked as reduction
4697      and has reduction properties.  */
4698   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4699     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4700
4701   if (nested_in_vect_loop_p (loop, stmt))
4702     {
4703       outer_loop = loop;
4704       loop = loop->inner;
4705       nested_cycle = true;
4706     }
4707
4708   /* 1. Is vectorizable reduction?  */
4709   /* Not supportable if the reduction variable is used in the loop, unless
4710      it's a reduction chain.  */
4711   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4712       && !GROUP_FIRST_ELEMENT (stmt_info))
4713     return false;
4714
4715   /* Reductions that are not used even in an enclosing outer-loop,
4716      are expected to be "live" (used out of the loop).  */
4717   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4718       && !STMT_VINFO_LIVE_P (stmt_info))
4719     return false;
4720
4721   /* Make sure it was already recognized as a reduction computation.  */
4722   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4723       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4724     return false;
4725
4726   /* 2. Has this been recognized as a reduction pattern?
4727
4728      Check if STMT represents a pattern that has been recognized
4729      in earlier analysis stages.  For stmts that represent a pattern,
4730      the STMT_VINFO_RELATED_STMT field records the last stmt in
4731      the original sequence that constitutes the pattern.  */
4732
4733   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4734   if (orig_stmt)
4735     {
4736       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4737       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4738       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4739     }
4740
4741   /* 3. Check the operands of the operation.  The first operands are defined
4742         inside the loop body. The last operand is the reduction variable,
4743         which is defined by the loop-header-phi.  */
4744
4745   gcc_assert (is_gimple_assign (stmt));
4746
4747   /* Flatten RHS.  */
4748   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4749     {
4750     case GIMPLE_SINGLE_RHS:
4751       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4752       if (op_type == ternary_op)
4753         {
4754           tree rhs = gimple_assign_rhs1 (stmt);
4755           ops[0] = TREE_OPERAND (rhs, 0);
4756           ops[1] = TREE_OPERAND (rhs, 1);
4757           ops[2] = TREE_OPERAND (rhs, 2);
4758           code = TREE_CODE (rhs);
4759         }
4760       else
4761         return false;
4762       break;
4763
4764     case GIMPLE_BINARY_RHS:
4765       code = gimple_assign_rhs_code (stmt);
4766       op_type = TREE_CODE_LENGTH (code);
4767       gcc_assert (op_type == binary_op);
4768       ops[0] = gimple_assign_rhs1 (stmt);
4769       ops[1] = gimple_assign_rhs2 (stmt);
4770       break;
4771
4772     case GIMPLE_TERNARY_RHS:
4773       code = gimple_assign_rhs_code (stmt);
4774       op_type = TREE_CODE_LENGTH (code);
4775       gcc_assert (op_type == ternary_op);
4776       ops[0] = gimple_assign_rhs1 (stmt);
4777       ops[1] = gimple_assign_rhs2 (stmt);
4778       ops[2] = gimple_assign_rhs3 (stmt);
4779       break;
4780
4781     case GIMPLE_UNARY_RHS:
4782       return false;
4783
4784     default:
4785       gcc_unreachable ();
4786     }
4787
4788   if (code == COND_EXPR && slp_node)
4789     return false;
4790
4791   scalar_dest = gimple_assign_lhs (stmt);
4792   scalar_type = TREE_TYPE (scalar_dest);
4793   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4794       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4795     return false;
4796
4797   /* Do not try to vectorize bit-precision reductions.  */
4798   if ((TYPE_PRECISION (scalar_type)
4799        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4800     return false;
4801
4802   /* All uses but the last are expected to be defined in the loop.
4803      The last use is the reduction variable.  In case of nested cycle this
4804      assumption is not true: we use reduc_index to record the index of the
4805      reduction variable.  */
4806   for (i = 0; i < op_type - 1; i++)
4807     {
4808       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4809       if (i == 0 && code == COND_EXPR)
4810         continue;
4811
4812       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4813                                             &def_stmt, &def, &dt, &tem);
4814       if (!vectype_in)
4815         vectype_in = tem;
4816       gcc_assert (is_simple_use);
4817
4818       if (dt != vect_internal_def
4819           && dt != vect_external_def
4820           && dt != vect_constant_def
4821           && dt != vect_induction_def
4822           && !(dt == vect_nested_cycle && nested_cycle))
4823         return false;
4824
4825       if (dt == vect_nested_cycle)
4826         {
4827           found_nested_cycle_def = true;
4828           reduc_def_stmt = def_stmt;
4829           reduc_index = i;
4830         }
4831     }
4832
4833   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4834                                         &def_stmt, &def, &dt, &tem);
4835   if (!vectype_in)
4836     vectype_in = tem;
4837   gcc_assert (is_simple_use);
4838   if (!(dt == vect_reduction_def
4839         || dt == vect_nested_cycle
4840         || ((dt == vect_internal_def || dt == vect_external_def
4841              || dt == vect_constant_def || dt == vect_induction_def)
4842             && nested_cycle && found_nested_cycle_def)))
4843     {
4844       /* For pattern recognized stmts, orig_stmt might be a reduction,
4845          but some helper statements for the pattern might not, or
4846          might be COND_EXPRs with reduction uses in the condition.  */
4847       gcc_assert (orig_stmt);
4848       return false;
4849     }
4850   if (!found_nested_cycle_def)
4851     reduc_def_stmt = def_stmt;
4852
4853   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4854   if (orig_stmt)
4855     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4856                                                        reduc_def_stmt,
4857                                                        !nested_cycle,
4858                                                        &dummy));
4859   else
4860     {
4861       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4862                                              !nested_cycle, &dummy);
4863       /* We changed STMT to be the first stmt in reduction chain, hence we
4864          check that in this case the first element in the chain is STMT.  */
4865       gcc_assert (stmt == tmp
4866                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4867     }
4868
4869   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4870     return false;
4871
4872   if (slp_node || PURE_SLP_STMT (stmt_info))
4873     ncopies = 1;
4874   else
4875     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4876                / TYPE_VECTOR_SUBPARTS (vectype_in));
4877
4878   gcc_assert (ncopies >= 1);
4879
4880   vec_mode = TYPE_MODE (vectype_in);
4881
4882   if (code == COND_EXPR)
4883     {
4884       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4885         {
4886           if (dump_enabled_p ())
4887             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4888                              "unsupported condition in reduction\n");
4889
4890             return false;
4891         }
4892     }
4893   else
4894     {
4895       /* 4. Supportable by target?  */
4896
4897       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4898           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4899         {
4900           /* Shifts and rotates are only supported by vectorizable_shifts,
4901              not vectorizable_reduction.  */
4902           if (dump_enabled_p ())
4903             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4904                              "unsupported shift or rotation.\n");
4905           return false;
4906         }
4907
4908       /* 4.1. check support for the operation in the loop  */
4909       optab = optab_for_tree_code (code, vectype_in, optab_default);
4910       if (!optab)
4911         {
4912           if (dump_enabled_p ())
4913             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4914                              "no optab.\n");
4915
4916           return false;
4917         }
4918
4919       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4920         {
4921           if (dump_enabled_p ())
4922             dump_printf (MSG_NOTE, "op not supported by target.\n");
4923
4924           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4925               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4926                   < vect_min_worthwhile_factor (code))
4927             return false;
4928
4929           if (dump_enabled_p ())
4930             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4931         }
4932
4933       /* Worthwhile without SIMD support?  */
4934       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4935           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4936              < vect_min_worthwhile_factor (code))
4937         {
4938           if (dump_enabled_p ())
4939             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4940                              "not worthwhile without SIMD support.\n");
4941
4942           return false;
4943         }
4944     }
4945
4946   /* 4.2. Check support for the epilog operation.
4947
4948           If STMT represents a reduction pattern, then the type of the
4949           reduction variable may be different than the type of the rest
4950           of the arguments.  For example, consider the case of accumulation
4951           of shorts into an int accumulator; The original code:
4952                         S1: int_a = (int) short_a;
4953           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4954
4955           was replaced with:
4956                         STMT: int_acc = widen_sum <short_a, int_acc>
4957
4958           This means that:
4959           1. The tree-code that is used to create the vector operation in the
4960              epilog code (that reduces the partial results) is not the
4961              tree-code of STMT, but is rather the tree-code of the original
4962              stmt from the pattern that STMT is replacing.  I.e, in the example
4963              above we want to use 'widen_sum' in the loop, but 'plus' in the
4964              epilog.
4965           2. The type (mode) we use to check available target support
4966              for the vector operation to be created in the *epilog*, is
4967              determined by the type of the reduction variable (in the example
4968              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4969              However the type (mode) we use to check available target support
4970              for the vector operation to be created *inside the loop*, is
4971              determined by the type of the other arguments to STMT (in the
4972              example we'd check this: optab_handler (widen_sum_optab,
4973              vect_short_mode)).
4974
4975           This is contrary to "regular" reductions, in which the types of all
4976           the arguments are the same as the type of the reduction variable.
4977           For "regular" reductions we can therefore use the same vector type
4978           (and also the same tree-code) when generating the epilog code and
4979           when generating the code inside the loop.  */
4980
4981   if (orig_stmt)
4982     {
4983       /* This is a reduction pattern: get the vectype from the type of the
4984          reduction variable, and get the tree-code from orig_stmt.  */
4985       orig_code = gimple_assign_rhs_code (orig_stmt);
4986       gcc_assert (vectype_out);
4987       vec_mode = TYPE_MODE (vectype_out);
4988     }
4989   else
4990     {
4991       /* Regular reduction: use the same vectype and tree-code as used for
4992          the vector code inside the loop can be used for the epilog code. */
4993       orig_code = code;
4994     }
4995
4996   if (nested_cycle)
4997     {
4998       def_bb = gimple_bb (reduc_def_stmt);
4999       def_stmt_loop = def_bb->loop_father;
5000       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5001                                        loop_preheader_edge (def_stmt_loop));
5002       if (TREE_CODE (def_arg) == SSA_NAME
5003           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5004           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5005           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5006           && vinfo_for_stmt (def_arg_stmt)
5007           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5008               == vect_double_reduction_def)
5009         double_reduc = true;
5010     }
5011
5012   epilog_reduc_code = ERROR_MARK;
5013   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5014     {
5015       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5016                                          optab_default);
5017       if (!reduc_optab)
5018         {
5019           if (dump_enabled_p ())
5020             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5021                              "no optab for reduction.\n");
5022
5023           epilog_reduc_code = ERROR_MARK;
5024         }
5025
5026       if (reduc_optab
5027           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5028         {
5029           if (dump_enabled_p ())
5030             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5031                              "reduc op not supported by target.\n");
5032
5033           epilog_reduc_code = ERROR_MARK;
5034         }
5035     }
5036   else
5037     {
5038       if (!nested_cycle || double_reduc)
5039         {
5040           if (dump_enabled_p ())
5041             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5042                              "no reduc code for scalar code.\n");
5043
5044           return false;
5045         }
5046     }
5047
5048   if (double_reduc && ncopies > 1)
5049     {
5050       if (dump_enabled_p ())
5051         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5052                          "multiple types in double reduction\n");
5053
5054       return false;
5055     }
5056
5057   /* In case of widenning multiplication by a constant, we update the type
5058      of the constant to be the type of the other operand.  We check that the
5059      constant fits the type in the pattern recognition pass.  */
5060   if (code == DOT_PROD_EXPR
5061       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5062     {
5063       if (TREE_CODE (ops[0]) == INTEGER_CST)
5064         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5065       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5066         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5067       else
5068         {
5069           if (dump_enabled_p ())
5070             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5071                              "invalid types in dot-prod\n");
5072
5073           return false;
5074         }
5075     }
5076
5077   if (!vec_stmt) /* transformation not required.  */
5078     {
5079       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5080         return false;
5081       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5082       return true;
5083     }
5084
5085   /** Transform.  **/
5086
5087   if (dump_enabled_p ())
5088     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5089
5090   /* FORNOW: Multiple types are not supported for condition.  */
5091   if (code == COND_EXPR)
5092     gcc_assert (ncopies == 1);
5093
5094   /* Create the destination vector  */
5095   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5096
5097   /* In case the vectorization factor (VF) is bigger than the number
5098      of elements that we can fit in a vectype (nunits), we have to generate
5099      more than one vector stmt - i.e - we need to "unroll" the
5100      vector stmt by a factor VF/nunits.  For more details see documentation
5101      in vectorizable_operation.  */
5102
5103   /* If the reduction is used in an outer loop we need to generate
5104      VF intermediate results, like so (e.g. for ncopies=2):
5105         r0 = phi (init, r0)
5106         r1 = phi (init, r1)
5107         r0 = x0 + r0;
5108         r1 = x1 + r1;
5109     (i.e. we generate VF results in 2 registers).
5110     In this case we have a separate def-use cycle for each copy, and therefore
5111     for each copy we get the vector def for the reduction variable from the
5112     respective phi node created for this copy.
5113
5114     Otherwise (the reduction is unused in the loop nest), we can combine
5115     together intermediate results, like so (e.g. for ncopies=2):
5116         r = phi (init, r)
5117         r = x0 + r;
5118         r = x1 + r;
5119    (i.e. we generate VF/2 results in a single register).
5120    In this case for each copy we get the vector def for the reduction variable
5121    from the vectorized reduction operation generated in the previous iteration.
5122   */
5123
5124   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5125     {
5126       single_defuse_cycle = true;
5127       epilog_copies = 1;
5128     }
5129   else
5130     epilog_copies = ncopies;
5131
5132   prev_stmt_info = NULL;
5133   prev_phi_info = NULL;
5134   if (slp_node)
5135     {
5136       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5137       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5138                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5139     }
5140   else
5141     {
5142       vec_num = 1;
5143       vec_oprnds0.create (1);
5144       if (op_type == ternary_op)
5145         vec_oprnds1.create (1);
5146     }
5147
5148   phis.create (vec_num);
5149   vect_defs.create (vec_num);
5150   if (!slp_node)
5151     vect_defs.quick_push (NULL_TREE);
5152
5153   for (j = 0; j < ncopies; j++)
5154     {
5155       if (j == 0 || !single_defuse_cycle)
5156         {
5157           for (i = 0; i < vec_num; i++)
5158             {
5159               /* Create the reduction-phi that defines the reduction
5160                  operand.  */
5161               new_phi = create_phi_node (vec_dest, loop->header);
5162               set_vinfo_for_stmt (new_phi,
5163                                   new_stmt_vec_info (new_phi, loop_vinfo,
5164                                                      NULL));
5165                if (j == 0 || slp_node)
5166                  phis.quick_push (new_phi);
5167             }
5168         }
5169
5170       if (code == COND_EXPR)
5171         {
5172           gcc_assert (!slp_node);
5173           vectorizable_condition (stmt, gsi, vec_stmt,
5174                                   PHI_RESULT (phis[0]),
5175                                   reduc_index, NULL);
5176           /* Multiple types are not supported for condition.  */
5177           break;
5178         }
5179
5180       /* Handle uses.  */
5181       if (j == 0)
5182         {
5183           op0 = ops[!reduc_index];
5184           if (op_type == ternary_op)
5185             {
5186               if (reduc_index == 0)
5187                 op1 = ops[2];
5188               else
5189                 op1 = ops[1];
5190             }
5191
5192           if (slp_node)
5193             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5194                                slp_node, -1);
5195           else
5196             {
5197               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5198                                                             stmt, NULL);
5199               vec_oprnds0.quick_push (loop_vec_def0);
5200               if (op_type == ternary_op)
5201                {
5202                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5203                                                                NULL);
5204                  vec_oprnds1.quick_push (loop_vec_def1);
5205                }
5206             }
5207         }
5208       else
5209         {
5210           if (!slp_node)
5211             {
5212               enum vect_def_type dt;
5213               gimple dummy_stmt;
5214               tree dummy;
5215
5216               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5217                                   &dummy_stmt, &dummy, &dt);
5218               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5219                                                               loop_vec_def0);
5220               vec_oprnds0[0] = loop_vec_def0;
5221               if (op_type == ternary_op)
5222                 {
5223                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5224                                       &dummy, &dt);
5225                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5226                                                                 loop_vec_def1);
5227                   vec_oprnds1[0] = loop_vec_def1;
5228                 }
5229             }
5230
5231           if (single_defuse_cycle)
5232             reduc_def = gimple_assign_lhs (new_stmt);
5233
5234           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5235         }
5236
5237       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5238         {
5239           if (slp_node)
5240             reduc_def = PHI_RESULT (phis[i]);
5241           else
5242             {
5243               if (!single_defuse_cycle || j == 0)
5244                 reduc_def = PHI_RESULT (new_phi);
5245             }
5246
5247           def1 = ((op_type == ternary_op)
5248                   ? vec_oprnds1[i] : NULL);
5249           if (op_type == binary_op)
5250             {
5251               if (reduc_index == 0)
5252                 expr = build2 (code, vectype_out, reduc_def, def0);
5253               else
5254                 expr = build2 (code, vectype_out, def0, reduc_def);
5255             }
5256           else
5257             {
5258               if (reduc_index == 0)
5259                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5260               else
5261                 {
5262                   if (reduc_index == 1)
5263                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5264                   else
5265                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5266                 }
5267             }
5268
5269           new_stmt = gimple_build_assign (vec_dest, expr);
5270           new_temp = make_ssa_name (vec_dest, new_stmt);
5271           gimple_assign_set_lhs (new_stmt, new_temp);
5272           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5273
5274           if (slp_node)
5275             {
5276               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5277               vect_defs.quick_push (new_temp);
5278             }
5279           else
5280             vect_defs[0] = new_temp;
5281         }
5282
5283       if (slp_node)
5284         continue;
5285
5286       if (j == 0)
5287         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5288       else
5289         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5290
5291       prev_stmt_info = vinfo_for_stmt (new_stmt);
5292       prev_phi_info = vinfo_for_stmt (new_phi);
5293     }
5294
5295   /* Finalize the reduction-phi (set its arguments) and create the
5296      epilog reduction code.  */
5297   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5298     {
5299       new_temp = gimple_assign_lhs (*vec_stmt);
5300       vect_defs[0] = new_temp;
5301     }
5302
5303   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5304                                     epilog_reduc_code, phis, reduc_index,
5305                                     double_reduc, slp_node);
5306
5307   phis.release ();
5308   vect_defs.release ();
5309   vec_oprnds0.release ();
5310   vec_oprnds1.release ();
5311
5312   return true;
5313 }
5314
5315 /* Function vect_min_worthwhile_factor.
5316
5317    For a loop where we could vectorize the operation indicated by CODE,
5318    return the minimum vectorization factor that makes it worthwhile
5319    to use generic vectors.  */
5320 int
5321 vect_min_worthwhile_factor (enum tree_code code)
5322 {
5323   switch (code)
5324     {
5325     case PLUS_EXPR:
5326     case MINUS_EXPR:
5327     case NEGATE_EXPR:
5328       return 4;
5329
5330     case BIT_AND_EXPR:
5331     case BIT_IOR_EXPR:
5332     case BIT_XOR_EXPR:
5333     case BIT_NOT_EXPR:
5334       return 2;
5335
5336     default:
5337       return INT_MAX;
5338     }
5339 }
5340
5341
5342 /* Function vectorizable_induction
5343
5344    Check if PHI performs an induction computation that can be vectorized.
5345    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5346    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5347    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5348
5349 bool
5350 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5351                         gimple *vec_stmt)
5352 {
5353   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5354   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5355   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5356   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5357   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5358   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5359   tree vec_def;
5360
5361   gcc_assert (ncopies >= 1);
5362   /* FORNOW. These restrictions should be relaxed.  */
5363   if (nested_in_vect_loop_p (loop, phi))
5364     {
5365       imm_use_iterator imm_iter;
5366       use_operand_p use_p;
5367       gimple exit_phi;
5368       edge latch_e;
5369       tree loop_arg;
5370
5371       if (ncopies > 1)
5372         {
5373           if (dump_enabled_p ())
5374             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5375                              "multiple types in nested loop.\n");
5376           return false;
5377         }
5378
5379       exit_phi = NULL;
5380       latch_e = loop_latch_edge (loop->inner);
5381       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5382       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5383         {
5384           if (!flow_bb_inside_loop_p (loop->inner,
5385                                       gimple_bb (USE_STMT (use_p))))
5386             {
5387               exit_phi = USE_STMT (use_p);
5388               break;
5389             }
5390         }
5391       if (exit_phi)
5392         {
5393           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5394           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5395                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5396             {
5397               if (dump_enabled_p ())
5398                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399                                  "inner-loop induction only used outside "
5400                                  "of the outer vectorized loop.\n");
5401               return false;
5402             }
5403         }
5404     }
5405
5406   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5407     return false;
5408
5409   /* FORNOW: SLP not supported.  */
5410   if (STMT_SLP_TYPE (stmt_info))
5411     return false;
5412
5413   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5414
5415   if (gimple_code (phi) != GIMPLE_PHI)
5416     return false;
5417
5418   if (!vec_stmt) /* transformation not required.  */
5419     {
5420       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5421       if (dump_enabled_p ())
5422         dump_printf_loc (MSG_NOTE, vect_location,
5423                          "=== vectorizable_induction ===\n");
5424       vect_model_induction_cost (stmt_info, ncopies);
5425       return true;
5426     }
5427
5428   /** Transform.  **/
5429
5430   if (dump_enabled_p ())
5431     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5432
5433   vec_def = get_initial_def_for_induction (phi);
5434   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5435   return true;
5436 }
5437
5438 /* Function vectorizable_live_operation.
5439
5440    STMT computes a value that is used outside the loop.  Check if
5441    it can be supported.  */
5442
5443 bool
5444 vectorizable_live_operation (gimple stmt,
5445                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5446                              gimple *vec_stmt)
5447 {
5448   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5449   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5450   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5451   int i;
5452   int op_type;
5453   tree op;
5454   tree def;
5455   gimple def_stmt;
5456   enum vect_def_type dt;
5457   enum tree_code code;
5458   enum gimple_rhs_class rhs_class;
5459
5460   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5461
5462   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5463     return false;
5464
5465   if (!is_gimple_assign (stmt))
5466     {
5467       if (gimple_call_internal_p (stmt)
5468           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5469           && gimple_call_lhs (stmt)
5470           && loop->simduid
5471           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5472           && loop->simduid
5473              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5474         {
5475           edge e = single_exit (loop);
5476           basic_block merge_bb = e->dest;
5477           imm_use_iterator imm_iter;
5478           use_operand_p use_p;
5479           tree lhs = gimple_call_lhs (stmt);
5480
5481           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5482             {
5483               gimple use_stmt = USE_STMT (use_p);
5484               if (gimple_code (use_stmt) == GIMPLE_PHI
5485                   || gimple_bb (use_stmt) == merge_bb)
5486                 {
5487                   if (vec_stmt)
5488                     {
5489                       tree vfm1
5490                         = build_int_cst (unsigned_type_node,
5491                                          loop_vinfo->vectorization_factor - 1);
5492                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5493                     }
5494                   return true;
5495                 }
5496             }
5497         }
5498
5499       return false;
5500     }
5501
5502   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5503     return false;
5504
5505   /* FORNOW. CHECKME. */
5506   if (nested_in_vect_loop_p (loop, stmt))
5507     return false;
5508
5509   code = gimple_assign_rhs_code (stmt);
5510   op_type = TREE_CODE_LENGTH (code);
5511   rhs_class = get_gimple_rhs_class (code);
5512   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5513   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5514
5515   /* FORNOW: support only if all uses are invariant.  This means
5516      that the scalar operations can remain in place, unvectorized.
5517      The original last scalar value that they compute will be used.  */
5518
5519   for (i = 0; i < op_type; i++)
5520     {
5521       if (rhs_class == GIMPLE_SINGLE_RHS)
5522         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5523       else
5524         op = gimple_op (stmt, i + 1);
5525       if (op
5526           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5527                                   &dt))
5528         {
5529           if (dump_enabled_p ())
5530             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5531                              "use not simple.\n");
5532           return false;
5533         }
5534
5535       if (dt != vect_external_def && dt != vect_constant_def)
5536         return false;
5537     }
5538
5539   /* No transformation is required for the cases we currently support.  */
5540   return true;
5541 }
5542
5543 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5544
5545 static void
5546 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5547 {
5548   ssa_op_iter op_iter;
5549   imm_use_iterator imm_iter;
5550   def_operand_p def_p;
5551   gimple ustmt;
5552
5553   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5554     {
5555       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5556         {
5557           basic_block bb;
5558
5559           if (!is_gimple_debug (ustmt))
5560             continue;
5561
5562           bb = gimple_bb (ustmt);
5563
5564           if (!flow_bb_inside_loop_p (loop, bb))
5565             {
5566               if (gimple_debug_bind_p (ustmt))
5567                 {
5568                   if (dump_enabled_p ())
5569                     dump_printf_loc (MSG_NOTE, vect_location,
5570                                      "killing debug use\n");
5571
5572                   gimple_debug_bind_reset_value (ustmt);
5573                   update_stmt (ustmt);
5574                 }
5575               else
5576                 gcc_unreachable ();
5577             }
5578         }
5579     }
5580 }
5581
5582
5583 /* This function builds ni_name = number of iterations.  Statements
5584    are emitted on the loop preheader edge.  */
5585
5586 static tree
5587 vect_build_loop_niters (loop_vec_info loop_vinfo)
5588 {
5589   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5590   if (TREE_CODE (ni) == INTEGER_CST)
5591     return ni;
5592   else
5593     {
5594       tree ni_name, var;
5595       gimple_seq stmts = NULL;
5596       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5597
5598       var = create_tmp_var (TREE_TYPE (ni), "niters");
5599       ni_name = force_gimple_operand (ni, &stmts, false, var);
5600       if (stmts)
5601         gsi_insert_seq_on_edge_immediate (pe, stmts);
5602
5603       return ni_name;
5604     }
5605 }
5606
5607
5608 /* This function generates the following statements:
5609
5610    ni_name = number of iterations loop executes
5611    ratio = ni_name / vf
5612    ratio_mult_vf_name = ratio * vf
5613
5614    and places them on the loop preheader edge.  */
5615
5616 static void
5617 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5618                                  tree ni_name,
5619                                  tree *ratio_mult_vf_name_ptr,
5620                                  tree *ratio_name_ptr)
5621 {
5622   tree ni_minus_gap_name;
5623   tree var;
5624   tree ratio_name;
5625   tree ratio_mult_vf_name;
5626   tree ni = LOOP_VINFO_NITERS (loop_vinfo);
5627   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5628   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5629   tree log_vf;
5630
5631   log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
5632
5633   /* If epilogue loop is required because of data accesses with gaps, we
5634      subtract one iteration from the total number of iterations here for
5635      correct calculation of RATIO.  */
5636   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5637     {
5638       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5639                                        ni_name,
5640                                        build_one_cst (TREE_TYPE (ni_name)));
5641       if (!is_gimple_val (ni_minus_gap_name))
5642         {
5643           var = create_tmp_var (TREE_TYPE (ni), "ni_gap");
5644           gimple stmts = NULL;
5645           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5646                                                     true, var);
5647           gsi_insert_seq_on_edge_immediate (pe, stmts);
5648         }
5649     }
5650   else
5651     ni_minus_gap_name = ni_name;
5652
5653   /* Create: ratio = ni >> log2(vf) */
5654
5655   ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_minus_gap_name),
5656                             ni_minus_gap_name, log_vf);
5657   if (!is_gimple_val (ratio_name))
5658     {
5659       var = create_tmp_var (TREE_TYPE (ni), "bnd");
5660       gimple stmts = NULL;
5661       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5662       gsi_insert_seq_on_edge_immediate (pe, stmts);
5663     }
5664   *ratio_name_ptr = ratio_name;
5665
5666   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5667
5668   if (ratio_mult_vf_name_ptr)
5669     {
5670       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5671                                         ratio_name, log_vf);
5672       if (!is_gimple_val (ratio_mult_vf_name))
5673         {
5674           var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
5675           gimple stmts = NULL;
5676           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5677                                                      true, var);
5678           gsi_insert_seq_on_edge_immediate (pe, stmts);
5679         }
5680       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5681     }
5682
5683   return;
5684 }
5685
5686
5687 /* Function vect_transform_loop.
5688
5689    The analysis phase has determined that the loop is vectorizable.
5690    Vectorize the loop - created vectorized stmts to replace the scalar
5691    stmts in the loop, and update the loop exit condition.  */
5692
5693 void
5694 vect_transform_loop (loop_vec_info loop_vinfo)
5695 {
5696   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5697   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5698   int nbbs = loop->num_nodes;
5699   gimple_stmt_iterator si;
5700   int i;
5701   tree ratio = NULL;
5702   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5703   bool grouped_store;
5704   bool slp_scheduled = false;
5705   unsigned int nunits;
5706   gimple stmt, pattern_stmt;
5707   gimple_seq pattern_def_seq = NULL;
5708   gimple_stmt_iterator pattern_def_si = gsi_none ();
5709   bool transform_pattern_stmt = false;
5710   bool check_profitability = false;
5711   int th;
5712   /* Record number of iterations before we started tampering with the profile. */
5713   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5714
5715   if (dump_enabled_p ())
5716     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5717
5718   /* If profile is inprecise, we have chance to fix it up.  */
5719   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5720     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5721
5722   /* Use the more conservative vectorization threshold.  If the number
5723      of iterations is constant assume the cost check has been performed
5724      by our caller.  If the threshold makes all loops profitable that
5725      run at least the vectorization factor number of times checking
5726      is pointless, too.  */
5727   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5728          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5729   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5730   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5731       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5732     {
5733       if (dump_enabled_p ())
5734         dump_printf_loc (MSG_NOTE, vect_location,
5735                          "Profitability threshold is %d loop iterations.\n",
5736                          th);
5737       check_profitability = true;
5738     }
5739
5740   /* Version the loop first, if required, so the profitability check
5741      comes first.  */
5742
5743   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5744       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5745     {
5746       vect_loop_versioning (loop_vinfo, th, check_profitability);
5747       check_profitability = false;
5748     }
5749
5750   tree ni_name = vect_build_loop_niters (loop_vinfo);
5751   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5752
5753   /* Peel the loop if there are data refs with unknown alignment.
5754      Only one data ref with unknown store is allowed.  */
5755
5756   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5757     {
5758       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5759                                      th, check_profitability);
5760       check_profitability = false;
5761       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5762          be re-computed.  */
5763       ni_name = NULL_TREE;
5764     }
5765
5766   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5767      compile time constant), or it is a constant that doesn't divide by the
5768      vectorization factor, then an epilog loop needs to be created.
5769      We therefore duplicate the loop: the original loop will be vectorized,
5770      and will compute the first (n/VF) iterations.  The second copy of the loop
5771      will remain scalar and will compute the remaining (n%VF) iterations.
5772      (VF is the vectorization factor).  */
5773
5774   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5775       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5776     {
5777       tree ratio_mult_vf;
5778       if (!ni_name)
5779         ni_name = vect_build_loop_niters (loop_vinfo);
5780       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5781                                        &ratio);
5782       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5783                                       th, check_profitability);
5784     }
5785   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5786     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5787                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5788   else
5789     {
5790       if (!ni_name)
5791         ni_name = vect_build_loop_niters (loop_vinfo);
5792       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5793     }
5794
5795   /* 1) Make sure the loop header has exactly two entries
5796      2) Make sure we have a preheader basic block.  */
5797
5798   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5799
5800   split_edge (loop_preheader_edge (loop));
5801
5802   /* FORNOW: the vectorizer supports only loops which body consist
5803      of one basic block (header + empty latch). When the vectorizer will
5804      support more involved loop forms, the order by which the BBs are
5805      traversed need to be reconsidered.  */
5806
5807   for (i = 0; i < nbbs; i++)
5808     {
5809       basic_block bb = bbs[i];
5810       stmt_vec_info stmt_info;
5811       gimple phi;
5812
5813       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5814         {
5815           phi = gsi_stmt (si);
5816           if (dump_enabled_p ())
5817             {
5818               dump_printf_loc (MSG_NOTE, vect_location,
5819                                "------>vectorizing phi: ");
5820               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5821               dump_printf (MSG_NOTE, "\n");
5822             }
5823           stmt_info = vinfo_for_stmt (phi);
5824           if (!stmt_info)
5825             continue;
5826
5827           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5828             vect_loop_kill_debug_uses (loop, phi);
5829
5830           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5831               && !STMT_VINFO_LIVE_P (stmt_info))
5832             continue;
5833
5834           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5835                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5836               && dump_enabled_p ())
5837             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5838
5839           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5840             {
5841               if (dump_enabled_p ())
5842                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5843               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5844             }
5845         }
5846
5847       pattern_stmt = NULL;
5848       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5849         {
5850           bool is_store;
5851
5852           if (transform_pattern_stmt)
5853             stmt = pattern_stmt;
5854           else
5855             {
5856               stmt = gsi_stmt (si);
5857               /* During vectorization remove existing clobber stmts.  */
5858               if (gimple_clobber_p (stmt))
5859                 {
5860                   unlink_stmt_vdef (stmt);
5861                   gsi_remove (&si, true);
5862                   release_defs (stmt);
5863                   continue;
5864                 }
5865             }
5866
5867           if (dump_enabled_p ())
5868             {
5869               dump_printf_loc (MSG_NOTE, vect_location,
5870                                "------>vectorizing statement: ");
5871               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5872               dump_printf (MSG_NOTE, "\n");
5873             }
5874
5875           stmt_info = vinfo_for_stmt (stmt);
5876
5877           /* vector stmts created in the outer-loop during vectorization of
5878              stmts in an inner-loop may not have a stmt_info, and do not
5879              need to be vectorized.  */
5880           if (!stmt_info)
5881             {
5882               gsi_next (&si);
5883               continue;
5884             }
5885
5886           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5887             vect_loop_kill_debug_uses (loop, stmt);
5888
5889           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5890               && !STMT_VINFO_LIVE_P (stmt_info))
5891             {
5892               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5893                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5894                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5895                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5896                 {
5897                   stmt = pattern_stmt;
5898                   stmt_info = vinfo_for_stmt (stmt);
5899                 }
5900               else
5901                 {
5902                   gsi_next (&si);
5903                   continue;
5904                 }
5905             }
5906           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5907                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5908                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5909                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5910             transform_pattern_stmt = true;
5911
5912           /* If pattern statement has def stmts, vectorize them too.  */
5913           if (is_pattern_stmt_p (stmt_info))
5914             {
5915               if (pattern_def_seq == NULL)
5916                 {
5917                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5918                   pattern_def_si = gsi_start (pattern_def_seq);
5919                 }
5920               else if (!gsi_end_p (pattern_def_si))
5921                 gsi_next (&pattern_def_si);
5922               if (pattern_def_seq != NULL)
5923                 {
5924                   gimple pattern_def_stmt = NULL;
5925                   stmt_vec_info pattern_def_stmt_info = NULL;
5926
5927                   while (!gsi_end_p (pattern_def_si))
5928                     {
5929                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5930                       pattern_def_stmt_info
5931                         = vinfo_for_stmt (pattern_def_stmt);
5932                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5933                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5934                         break;
5935                       gsi_next (&pattern_def_si);
5936                     }
5937
5938                   if (!gsi_end_p (pattern_def_si))
5939                     {
5940                       if (dump_enabled_p ())
5941                         {
5942                           dump_printf_loc (MSG_NOTE, vect_location,
5943                                            "==> vectorizing pattern def "
5944                                            "stmt: ");
5945                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5946                                             pattern_def_stmt, 0);
5947                           dump_printf (MSG_NOTE, "\n");
5948                         }
5949
5950                       stmt = pattern_def_stmt;
5951                       stmt_info = pattern_def_stmt_info;
5952                     }
5953                   else
5954                     {
5955                       pattern_def_si = gsi_none ();
5956                       transform_pattern_stmt = false;
5957                     }
5958                 }
5959               else
5960                 transform_pattern_stmt = false;
5961             }
5962
5963           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5964           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5965                                                STMT_VINFO_VECTYPE (stmt_info));
5966           if (!STMT_SLP_TYPE (stmt_info)
5967               && nunits != (unsigned int) vectorization_factor
5968               && dump_enabled_p ())
5969             /* For SLP VF is set according to unrolling factor, and not to
5970                vector size, hence for SLP this print is not valid.  */
5971             dump_printf_loc (MSG_NOTE, vect_location,
5972                              "multiple-types.\n");
5973
5974           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5975              reached.  */
5976           if (STMT_SLP_TYPE (stmt_info))
5977             {
5978               if (!slp_scheduled)
5979                 {
5980                   slp_scheduled = true;
5981
5982                   if (dump_enabled_p ())
5983                     dump_printf_loc (MSG_NOTE, vect_location,
5984                                      "=== scheduling SLP instances ===\n");
5985
5986                   vect_schedule_slp (loop_vinfo, NULL);
5987                 }
5988
5989               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5990               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5991                 {
5992                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5993                     {
5994                       pattern_def_seq = NULL;
5995                       gsi_next (&si);
5996                     }
5997                   continue;
5998                 }
5999             }
6000
6001           /* -------- vectorize statement ------------ */
6002           if (dump_enabled_p ())
6003             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6004
6005           grouped_store = false;
6006           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6007           if (is_store)
6008             {
6009               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6010                 {
6011                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6012                      interleaving chain was completed - free all the stores in
6013                      the chain.  */
6014                   gsi_next (&si);
6015                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6016                   continue;
6017                 }
6018               else
6019                 {
6020                   /* Free the attached stmt_vec_info and remove the stmt.  */
6021                   gimple store = gsi_stmt (si);
6022                   free_stmt_vec_info (store);
6023                   unlink_stmt_vdef (store);
6024                   gsi_remove (&si, true);
6025                   release_defs (store);
6026                   continue;
6027                 }
6028             }
6029
6030           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6031             {
6032               pattern_def_seq = NULL;
6033               gsi_next (&si);
6034             }
6035         }                       /* stmts in BB */
6036     }                           /* BBs in loop */
6037
6038   slpeel_make_loop_iterate_ntimes (loop, ratio);
6039
6040   /* Reduce loop iterations by the vectorization factor.  */
6041   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6042                       expected_iterations / vectorization_factor);
6043   loop->nb_iterations_upper_bound
6044     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
6045                                             FLOOR_DIV_EXPR);
6046   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6047       && loop->nb_iterations_upper_bound != double_int_zero)
6048     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
6049   if (loop->any_estimate)
6050     {
6051       loop->nb_iterations_estimate
6052         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
6053                                              FLOOR_DIV_EXPR);
6054        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6055            && loop->nb_iterations_estimate != double_int_zero)
6056          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
6057     }
6058
6059   if (dump_enabled_p ())
6060     {
6061       dump_printf_loc (MSG_NOTE, vect_location,
6062                        "LOOP VECTORIZED\n");
6063       if (loop->inner)
6064         dump_printf_loc (MSG_NOTE, vect_location,
6065                          "OUTER LOOP VECTORIZED\n");
6066       dump_printf (MSG_NOTE, "\n");
6067     }
6068 }