[gcc.git] / gcc / tree-vect-loop.c
/* Loop Vectorization
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
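
/* For illustration, a sketch of the optab check described above.  The
   surrounding control flow is hypothetical, but optab_for_tree_code and
   optab_handler are the interfaces the vectorizer actually uses:

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!op
         || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;   (no target support - the stmt can't be vectorized)
*/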

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data
   elements that are operated upon in parallel in a single iteration of the
   vectorized loop.  For example, when vectorizing a loop that operates on
   4-byte elements, on a target with a vector size (VS) of 16 bytes, the VF
   is set to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
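
/* An illustrative case of the computation done by the code below: on a
   target with 16-byte vectors, a statement operating on 2-byte "short"
   elements yields nunits = 8, and one operating on 4-byte "int" elements
   yields nunits = 4; the loop's VF becomes the largest nunits seen,
   here 8.  (Hypothetical numbers, for explanation only.)  */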

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;
  gimple stmt, pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===\n");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
              dump_printf (MSG_NOTE, "\n");
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
                                 nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
        {
          tree vf_vectype;

          if (analyze_pattern_stmt)
            stmt = pattern_stmt;
          else
            stmt = gsi_stmt (si);

          stmt_info = vinfo_for_stmt (stmt);

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
              dump_printf (MSG_NOTE, "\n");
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if ((!STMT_VINFO_RELEVANT_P (stmt_info)
               && !STMT_VINFO_LIVE_P (stmt_info))
              || gimple_clobber_p (stmt))
            {
              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                {
                  stmt = pattern_stmt;
                  stmt_info = vinfo_for_stmt (pattern_stmt);
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "==> examining pattern statement: ");
                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                      dump_printf (MSG_NOTE, "\n");
                    }
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
                  gsi_next (&si);
                  continue;
                }
            }
          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
            analyze_pattern_stmt = true;

          /* If a pattern statement has def stmts, analyze them too.  */
          if (is_pattern_stmt_p (stmt_info))
            {
              if (pattern_def_seq == NULL)
                {
                  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                  pattern_def_si = gsi_start (pattern_def_seq);
                }
              else if (!gsi_end_p (pattern_def_si))
                gsi_next (&pattern_def_si);
              if (pattern_def_seq != NULL)
                {
                  gimple pattern_def_stmt = NULL;
                  stmt_vec_info pattern_def_stmt_info = NULL;

                  while (!gsi_end_p (pattern_def_si))
                    {
                      pattern_def_stmt = gsi_stmt (pattern_def_si);
                      pattern_def_stmt_info
                        = vinfo_for_stmt (pattern_def_stmt);
                      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
                          || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
                        break;
                      gsi_next (&pattern_def_si);
                    }

                  if (!gsi_end_p (pattern_def_si))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> examining pattern def stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
                          dump_printf (MSG_NOTE, "\n");
                        }

                      stmt = pattern_def_stmt;
                      stmt_info = pattern_def_stmt_info;
                    }
                  else
                    {
                      pattern_def_si = gsi_none ();
                      analyze_pattern_stmt = false;
                    }
                }
              else
                analyze_pattern_stmt = false;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: irregular stmt.");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                    0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: vector stmt in loop:");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case in which a vectype has already been set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info)
                          || !gsi_end_p (pattern_def_si));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }
            }

          /* The vectorization factor is set according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "get vectype for scalar type: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
              dump_printf (MSG_NOTE, "\n");
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: unsupported data-type ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     scalar_type);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: different sized vector "
                                   "types in statement, ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vf_vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
              dump_printf (MSG_NOTE, "\n");
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;

          if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
            }
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
                     vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported data-type\n");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
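
/* For example, an access function {0, +, 4}_1 - start 0, constant step 4 in
   loop 1 - is "simple" in this sense, whereas {0, +, {0, +, 1}_1}_1, whose
   step itself evolves in the loop, is not.  (Illustrative chrec values.)  */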

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
      dump_printf (MSG_NOTE, ", init: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
      dump_printf (MSG_NOTE, "\n");
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
          || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
              && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
          || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
              && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
                  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
          || !flag_associative_math))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step unknown.\n");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  stack_vec<gimple, 64> worklist;
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_scalar_cycles ===\n");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified; therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Skip virtual phis.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        {
          STRIP_NOPS (access_fn);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Access function of PHI: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
              dump_printf (MSG_NOTE, "\n");
            }
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
        {
          worklist.safe_push (phi);
          continue;
        }

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      gimple phi = worklist.pop ();
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Detected double reduction.\n");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected vectorizable nested cycle.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_nested_cycle;
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected reduction.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
                }
            }
        }
      else
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown def-use cycle pattern.\n");
    }
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if one exists.
   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
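
/* A cross-iteration cycle that is neither of these - e.g. the second-order
   recurrence (an illustrative example)

              loop3:
              for (i=0; i<N; i++)
                 { t = x;  x = y;  y = t + a[i]; }

   - would be classified as unknown.  */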

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties
     than the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Function vect_get_loop_niters.

   Determine how many iterations the loop executes.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
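
/* E.g. for a loop in the canonical rotated (do-while) form whose body runs
   n times, the expression placed in NUMBER_OF_ITERATIONS is n: the exit
   condition sits in the header and is tested once per body execution.
   (Illustrative; the exact expression comes from the scev machinery via
   number_of_exit_cond_executions.)  */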

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== get_loop_niters ===\n");
  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will already have been processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as a reverse postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res).create (3);
  LOOP_VINFO_DATAREFS (res).create (10);
  LOOP_VINFO_DDRS (res).create (10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
             PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
             PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_GROUPED_STORES (res).create (10);
  LOOP_VINFO_REDUCTIONS (res).create (10);
  LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
  LOOP_VINFO_SLP_INSTANCES (res).create (10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
  LOOP_VINFO_OPERANDS_SWAPPED (res) = false;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  bool swapped;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = clean_stmts ? loop->num_nodes : 0;
  swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);

          /* We may have broken canonical form by moving a constant
             into RHS1 of a commutative op.  Fix such occurrences.  */
          if (swapped && is_gimple_assign (stmt))
            {
              enum tree_code code = gimple_assign_rhs_code (stmt);

              if ((code == PLUS_EXPR
                   || code == POINTER_PLUS_EXPR
                   || code == MULT_EXPR)
                  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
                swap_ssa_operands (stmt,
                                   gimple_assign_rhs1_ptr (stmt),
                                   gimple_assign_rhs2_ptr (stmt));
            }
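          /* E.g. an assignment whose operands were swapped during analysis,
             say "x_1 = 3 + y_2", is restored here to the canonical
             "x_1 = y_2 + 3" (illustrative SSA names).  */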

          /* Free stmt_vec_info.  */
          free_stmt_vec_info (stmt);
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  vect_destroy_datarefs (loop_vinfo, NULL);
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, j, instance)
    vect_free_slp_instance (instance);

  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
  LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();

  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
    LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();

  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}


/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "===== analyze_loop_nest_1 =====\n");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad inner-loop form.\n");
      return NULL;
    }

  return loop_vinfo;
}


/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
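
/* For instance, "for (i = 0; i < n; i++) ..." is countable: the iteration
   count n can be computed before the loop runs.  A pointer-chasing loop
   such as "while (p) p = p->next;" is not, and is rejected below.
   (Illustrative examples.)  */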

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_loop_form ===\n");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: empty loop.\n");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple nested loops.\n");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.\n");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: inner-loop count not"
                             " invariant.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported outerloop form.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.\n");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (dump_enabled_p ())
        {
          if (!single_exit (loop))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple exits.\n");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: too many incoming edges.\n");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: latch block not empty.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb: */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "split exit edge.\n");
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: abnormal loop exit edge.\n");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: complicated exit condition.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations cannot be "
                         "computed.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Infinite number of iterations.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations = 0.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}


/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;
  HOST_WIDE_INT max_niter;
  HOST_WIDE_INT estimated_niter;
  int min_profitable_estimate;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_loop_operations ===\n");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  if (slp)
    {
      /* If all the stmts in the loop can be SLPed, we perform only SLP, and
         the vectorization factor of the loop is the unrolling factor
         required by the SLP instances.  If that unrolling factor is 1, we
         say that we perform pure SLP on the loop - cross-iteration
         parallelism is not exploited.  */
      for (i = 0; i < nbbs; i++)
        {
          basic_block bb = bbs[i];
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              gcc_assert (stmt_info);
              if ((STMT_VINFO_RELEVANT_P (stmt_info)
                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
                  && !PURE_SLP_STMT (stmt_info))
                /* STMT needs both SLP and loop-based vectorization.  */
                only_slp_in_loop = false;
            }
        }

      if (only_slp_in_loop)
        vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
      else
        vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

      LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Updating vectorization factor to %d\n",
                         vectorization_factor);
    }
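  /* For example (illustrative numbers): with a loop vectorization factor of
     4 and an SLP unrolling factor of 3, the least common multiple above
     updates the factor to 12; with an SLP unrolling factor of 2 it
     stays 4.  */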

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
              dump_printf (MSG_NOTE, "\n");
            }

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outer-loop (unless it is a double
                 reduction, i.e., this phi is vect_double_reduction_def),
                 because this case would require us to actually do something
                 here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Unsupported loop-closed phi in "
                                     "outer-loop.\n");
                  return false;
                }

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;
                  gimple op_def_stmt;

                  if (gimple_phi_num_args (phi) != 1)
                    return false;

                  phi_op = PHI_ARG_DEF (phi, 0);
                  if (TREE_CODE (phi_op) != SSA_NAME)
                    return false;

                  op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
                  if (gimple_nop_p (op_def_stmt)
                      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
                      || !vinfo_for_stmt (op_def_stmt))
                    return false;

                  if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                      != vect_used_in_outer
                      && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                         != vect_used_in_outer_by_reduction)
                    return false;
                }

              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: value used after loop.\n");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: scalar dependence cycle.\n");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: relevant phi not "
                                   "supported: ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          if (!gimple_clobber_p (stmt)
              && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or are dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: redundant loop. no profit to "
                         "vectorize.\n");
      return false;
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vectorization_factor = %d, niters = "
                     HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
                     LOOP_VINFO_INT_NITERS (loop_vinfo));

  if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
      || ((max_niter = max_stmt_executions_int (loop)) != -1
          && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count too small.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count smaller than "
                         "vectorization factor.\n");
      return false;
    }

  /* Analyze cost.  Decide if worth while to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
                         "profitable.\n");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;
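  /* For example (illustrative numbers): with --param min-vect-loop-bound=2
     and VF 4, min_scalar_loop_bound is 7; if the cost model computed
     min_profitable_iters = 10, TH becomes 10, the more conservative of
     the two.  */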

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: iteration count smaller than user "
                         "specified loop bound parameter or minimum profitable "
                         "iterations (whichever is more conservative).\n");
      return false;
    }

  if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
          <= MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: estimated iteration count too "
                         "small.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: estimated iteration count smaller "
                         "than specified loop bound parameter or minimum "
                         "profitable iterations (whichever is more "
                         "conservative).\n");
      return false;
    }

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)
      || ((int) tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
          < exact_log2 (vectorization_factor)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.\n");
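      /* E.g. with VF 4 (exact_log2 == 2), an epilogue is needed unless
         NITERS is known to have at least two trailing zero bits, i.e. to
         be a multiple of 4 (an illustrative case of the tree_ctz test
         above).  */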
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't create epilog loop 1.\n");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't create epilog loop 2.\n");
          return false;
        }
    }

  return true;
}


/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo)
{
  bool ok, slp = false;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple array references whose alignment can be
     forced, and aligned pointer references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data references.\n");
      return false;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access patterns.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data access.\n");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo, NULL);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected pattern.\n");
      return false;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
  if (!ok
      || max_vf < min_vf)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.\n");
      return false;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine vectorization factor.\n");
      return false;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.\n");
      return false;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "too long list of versioning for alias "
                         "run-time tests.\n");
      return false;
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
      return false;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      slp = vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }
  else
    return false;

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo, slp);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.\n");
      return false;
    }

  return true;
}

/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  unsigned int vector_sizes;

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "===== analyze_loop_nest =====\n");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "outer-loop already vectorized.\n");
      return NULL;
    }

  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
      loop_vinfo = vect_analyze_loop_form (loop);
      if (!loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "bad loop form.\n");
          return NULL;
        }

      if (vect_analyze_loop_2 (loop_vinfo))
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

          return loop_vinfo;
        }

      destroy_loop_vec_info (loop_vinfo, true);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_NOTE, vect_location,
1826 "***** Re-trying analysis with "
1827 "vector size %d\n", current_vector_size);
1828 }
1829 }
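
/* Illustrative example (the concrete sizes below are an assumption about
   a particular target, not something this function requires): if the
   target's autovectorize_vector_sizes hook returns the bitmask 32 | 16
   (32-byte and 16-byte vectors) and the first, autodetected analysis ran
   with 32-byte vectors and failed, the retry logic above computes

     vector_sizes &= ~32;                          // mask becomes 16
     current_vector_size = 1 << floor_log2 (16);   // == 16

   and re-analyzes the loop with 16-byte vectors before giving up.  */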
1830
1831
1832 /* Function reduction_code_for_scalar_code
1833
1834 Input:
1835    CODE - tree_code of a reduction operation.
1836
1837 Output:
1838 REDUC_CODE - the corresponding tree-code to be used to reduce the
1839 vector of partial results into a single scalar result (which
1840 will also reside in a vector) or ERROR_MARK if the operation is
1841         a supported reduction operation but does not have such a tree-code.
1842
1843    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
1844
1845 static bool
1846 reduction_code_for_scalar_code (enum tree_code code,
1847 enum tree_code *reduc_code)
1848 {
1849 switch (code)
1850 {
1851 case MAX_EXPR:
1852 *reduc_code = REDUC_MAX_EXPR;
1853 return true;
1854
1855 case MIN_EXPR:
1856 *reduc_code = REDUC_MIN_EXPR;
1857 return true;
1858
1859 case PLUS_EXPR:
1860 *reduc_code = REDUC_PLUS_EXPR;
1861 return true;
1862
1863 case MULT_EXPR:
1864 case MINUS_EXPR:
1865 case BIT_IOR_EXPR:
1866 case BIT_XOR_EXPR:
1867 case BIT_AND_EXPR:
1868 *reduc_code = ERROR_MARK;
1869 return true;
1870
1871 default:
1872 return false;
1873 }
1874 }
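
/* For illustration: for a sum reduction "s += a[i]" (PLUS_EXPR) the
   epilogue can collapse the vector of partial sums with a single
   REDUC_PLUS_EXPR.  For "s *= a[i]" (MULT_EXPR) no such REDUC_*_EXPR
   tree code exists, so *REDUC_CODE is set to ERROR_MARK and the epilogue
   is built from whole-vector shifts or element extracts combined with
   the scalar operation instead (see vect_model_reduction_cost for how
   both schemes are costed).  */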
1875
1876
1877 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1878 STMT is printed with a message MSG. */
1879
1880 static void
1881 report_vect_op (int msg_type, gimple stmt, const char *msg)
1882 {
1883 dump_printf_loc (msg_type, vect_location, "%s", msg);
1884 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1885 dump_printf (msg_type, "\n");
1886 }
1887
1888
1889 /* Detect SLP reduction of the form:
1890
1891 #a1 = phi <a5, a0>
1892 a2 = operation (a1)
1893 a3 = operation (a2)
1894 a4 = operation (a3)
1895 a5 = operation (a4)
1896
1897 #a = phi <a5>
1898
1899 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1900 FIRST_STMT is the first reduction stmt in the chain
1901 (a2 = operation (a1)).
1902
1903 Return TRUE if a reduction chain was detected. */
1904
1905 static bool
1906 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1907 {
1908 struct loop *loop = (gimple_bb (phi))->loop_father;
1909 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1910 enum tree_code code;
1911 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1912 stmt_vec_info use_stmt_info, current_stmt_info;
1913 tree lhs;
1914 imm_use_iterator imm_iter;
1915 use_operand_p use_p;
1916 int nloop_uses, size = 0, n_out_of_loop_uses;
1917 bool found = false;
1918
1919 if (loop != vect_loop)
1920 return false;
1921
1922 lhs = PHI_RESULT (phi);
1923 code = gimple_assign_rhs_code (first_stmt);
1924 while (1)
1925 {
1926 nloop_uses = 0;
1927 n_out_of_loop_uses = 0;
1928 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1929 {
1930 gimple use_stmt = USE_STMT (use_p);
1931 if (is_gimple_debug (use_stmt))
1932 continue;
1933
1936 /* Check if we got back to the reduction phi. */
1937 if (use_stmt == phi)
1938 {
1939 loop_use_stmt = use_stmt;
1940 found = true;
1941 break;
1942 }
1943
1944 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1945 {
1946 if (vinfo_for_stmt (use_stmt)
1947 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1948 {
1949 loop_use_stmt = use_stmt;
1950 nloop_uses++;
1951 }
1952 }
1953 else
1954 n_out_of_loop_uses++;
1955
1956           /* There can be either a single use in the loop or two uses in
1957              phi nodes.  */
1958 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1959 return false;
1960 }
1961
1962 if (found)
1963 break;
1964
1965 /* We reached a statement with no loop uses. */
1966 if (nloop_uses == 0)
1967 return false;
1968
1969 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1970 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1971 return false;
1972
1973 if (!is_gimple_assign (loop_use_stmt)
1974 || code != gimple_assign_rhs_code (loop_use_stmt)
1975 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1976 return false;
1977
1978 /* Insert USE_STMT into reduction chain. */
1979 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1980 if (current_stmt)
1981 {
1982 current_stmt_info = vinfo_for_stmt (current_stmt);
1983 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1984 GROUP_FIRST_ELEMENT (use_stmt_info)
1985 = GROUP_FIRST_ELEMENT (current_stmt_info);
1986 }
1987 else
1988 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1989
1990 lhs = gimple_assign_lhs (loop_use_stmt);
1991 current_stmt = loop_use_stmt;
1992 size++;
1993 }
1994
1995 if (!found || loop_use_stmt != phi || size < 2)
1996 return false;
1997
1998   /* Swap the operands, if needed, to make the reduction operand the second
1999      operand.  */
2000 lhs = PHI_RESULT (phi);
2001 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2002 while (next_stmt)
2003 {
2004 if (gimple_assign_rhs2 (next_stmt) == lhs)
2005 {
2006 tree op = gimple_assign_rhs1 (next_stmt);
2007 gimple def_stmt = NULL;
2008
2009 if (TREE_CODE (op) == SSA_NAME)
2010 def_stmt = SSA_NAME_DEF_STMT (op);
2011
2012 /* Check that the other def is either defined in the loop
2013 ("vect_internal_def"), or it's an induction (defined by a
2014 loop-header phi-node). */
2015 if (def_stmt
2016 && gimple_bb (def_stmt)
2017 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2018 && (is_gimple_assign (def_stmt)
2019 || is_gimple_call (def_stmt)
2020 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2021 == vect_induction_def
2022 || (gimple_code (def_stmt) == GIMPLE_PHI
2023 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2024 == vect_internal_def
2025 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2026 {
2027 lhs = gimple_assign_lhs (next_stmt);
2028 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2029 continue;
2030 }
2031
2032 return false;
2033 }
2034 else
2035 {
2036 tree op = gimple_assign_rhs2 (next_stmt);
2037 gimple def_stmt = NULL;
2038
2039 if (TREE_CODE (op) == SSA_NAME)
2040 def_stmt = SSA_NAME_DEF_STMT (op);
2041
2042 /* Check that the other def is either defined in the loop
2043 ("vect_internal_def"), or it's an induction (defined by a
2044 loop-header phi-node). */
2045 if (def_stmt
2046 && gimple_bb (def_stmt)
2047 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2048 && (is_gimple_assign (def_stmt)
2049 || is_gimple_call (def_stmt)
2050 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2051 == vect_induction_def
2052 || (gimple_code (def_stmt) == GIMPLE_PHI
2053 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2054 == vect_internal_def
2055 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2056 {
2057 if (dump_enabled_p ())
2058 {
2059 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2060 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2061 dump_printf (MSG_NOTE, "\n");
2062 }
2063
2064 swap_ssa_operands (next_stmt,
2065 gimple_assign_rhs1_ptr (next_stmt),
2066 gimple_assign_rhs2_ptr (next_stmt));
2067 update_stmt (next_stmt);
2068
2069 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2070 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2071 }
2072 else
2073 return false;
2074 }
2075
2076 lhs = gimple_assign_lhs (next_stmt);
2077 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2078 }
2079
2080 /* Save the chain for further analysis in SLP detection. */
2081 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2082 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2083 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2084
2085 return true;
2086 }
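
/* For illustration: a source loop such as

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   typically gimplifies into a chain of four dependent additions per
   iteration, matching the pattern above with SIZE == 4; the chain is
   recorded in LOOP_VINFO_REDUCTION_CHAINS and its head gets
   GROUP_SIZE == 4 for the later SLP analysis.  */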
2087
2088
2089 /* Function vect_is_simple_reduction_1
2090
2091 (1) Detect a cross-iteration def-use cycle that represents a simple
2092 reduction computation. We look for the following pattern:
2093
2094 loop_header:
2095 a1 = phi < a0, a2 >
2096 a3 = ...
2097 a2 = operation (a3, a1)
2098
2099 or
2100
2101 a3 = ...
2102 loop_header:
2103 a1 = phi < a0, a2 >
2104 a2 = operation (a3, a1)
2105
2106 such that:
2107 1. operation is commutative and associative and it is safe to
2108 change the order of the computation (if CHECK_REDUCTION is true)
2109 2. no uses for a2 in the loop (a2 is used out of the loop)
2110 3. no uses of a1 in the loop besides the reduction operation
2111 4. no uses of a1 outside the loop.
2112
2113 Conditions 1,4 are tested here.
2114 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2115
2116 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2117 nested cycles, if CHECK_REDUCTION is false.
2118
2119 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2120 reductions:
2121
2122 a1 = phi < a0, a2 >
2123 inner loop (def of a3)
2124 a2 = phi < a3 >
2125
2126    If MODIFY is true it also tries to rework the code in-place to enable
2127    detection of more reduction patterns.  For the time being we rewrite
2128    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2129 */
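
/* For example, with MODIFY true the loop statement

     res = res - x[i];

   is rewritten in-place into

     neg = -x[i];
     res = res + neg;

   so that the non-associative MINUS_EXPR becomes detectable as a
   PLUS_EXPR reduction (see the orig_code == MINUS_EXPR handling
   below).  */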
2130
2131 static gimple
2132 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2133 bool check_reduction, bool *double_reduc,
2134 bool modify)
2135 {
2136 struct loop *loop = (gimple_bb (phi))->loop_father;
2137 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2138 edge latch_e = loop_latch_edge (loop);
2139 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2140 gimple def_stmt, def1 = NULL, def2 = NULL;
2141 enum tree_code orig_code, code;
2142 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2143 tree type;
2144 int nloop_uses;
2145 tree name;
2146 imm_use_iterator imm_iter;
2147 use_operand_p use_p;
2148 bool phi_def;
2149
2150 *double_reduc = false;
2151
2152 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2153 otherwise, we assume outer loop vectorization. */
2154 gcc_assert ((check_reduction && loop == vect_loop)
2155 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2156
2157 name = PHI_RESULT (phi);
2158 nloop_uses = 0;
2159 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2160 {
2161 gimple use_stmt = USE_STMT (use_p);
2162 if (is_gimple_debug (use_stmt))
2163 continue;
2164
2165 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2166 {
2167 if (dump_enabled_p ())
2168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2169 "intermediate value used outside loop.\n");
2170
2171 return NULL;
2172 }
2173
2174 if (vinfo_for_stmt (use_stmt)
2175 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2176 nloop_uses++;
2177 if (nloop_uses > 1)
2178 {
2179 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2181 "reduction used in loop.\n");
2182 return NULL;
2183 }
2184 }
2185
2186 if (TREE_CODE (loop_arg) != SSA_NAME)
2187 {
2188 if (dump_enabled_p ())
2189 {
2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191 "reduction: not ssa_name: ");
2192 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2193 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2194 }
2195 return NULL;
2196 }
2197
2198 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2199 if (!def_stmt)
2200 {
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203 "reduction: no def_stmt.\n");
2204 return NULL;
2205 }
2206
2207 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2208 {
2209 if (dump_enabled_p ())
2210 {
2211 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2212 dump_printf (MSG_NOTE, "\n");
2213 }
2214 return NULL;
2215 }
2216
2217 if (is_gimple_assign (def_stmt))
2218 {
2219 name = gimple_assign_lhs (def_stmt);
2220 phi_def = false;
2221 }
2222 else
2223 {
2224 name = PHI_RESULT (def_stmt);
2225 phi_def = true;
2226 }
2227
2228 nloop_uses = 0;
2229 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2230 {
2231 gimple use_stmt = USE_STMT (use_p);
2232 if (is_gimple_debug (use_stmt))
2233 continue;
2234 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2235 && vinfo_for_stmt (use_stmt)
2236 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2237 nloop_uses++;
2238 if (nloop_uses > 1)
2239 {
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "reduction used in loop.\n");
2243 return NULL;
2244 }
2245 }
2246
2247 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2248 defined in the inner loop. */
2249 if (phi_def)
2250 {
2251 op1 = PHI_ARG_DEF (def_stmt, 0);
2252
2253 if (gimple_phi_num_args (def_stmt) != 1
2254 || TREE_CODE (op1) != SSA_NAME)
2255 {
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 "unsupported phi node definition.\n");
2259
2260 return NULL;
2261 }
2262
2263 def1 = SSA_NAME_DEF_STMT (op1);
2264 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2265 && loop->inner
2266 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2267 && is_gimple_assign (def1))
2268 {
2269 if (dump_enabled_p ())
2270 report_vect_op (MSG_NOTE, def_stmt,
2271 "detected double reduction: ");
2272
2273 *double_reduc = true;
2274 return def_stmt;
2275 }
2276
2277 return NULL;
2278 }
2279
2280 code = orig_code = gimple_assign_rhs_code (def_stmt);
2281
2282   /* We can handle "res -= x[i]", which is non-associative, by
2283      simply rewriting it into "res += -x[i]".  Avoid changing the
2284      gimple instruction during the first simple tests and only do this
2285      if we're allowed to change the code at all.  */
2286 if (code == MINUS_EXPR
2287 && modify
2288 && (op1 = gimple_assign_rhs1 (def_stmt))
2289 && TREE_CODE (op1) == SSA_NAME
2290 && SSA_NAME_DEF_STMT (op1) == phi)
2291 code = PLUS_EXPR;
2292
2293 if (check_reduction
2294 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2295 {
2296 if (dump_enabled_p ())
2297 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2298 "reduction: not commutative/associative: ");
2299 return NULL;
2300 }
2301
2302 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2303 {
2304 if (code != COND_EXPR)
2305 {
2306 if (dump_enabled_p ())
2307 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2308 "reduction: not binary operation: ");
2309
2310 return NULL;
2311 }
2312
2313 op3 = gimple_assign_rhs1 (def_stmt);
2314 if (COMPARISON_CLASS_P (op3))
2315 {
2316 op4 = TREE_OPERAND (op3, 1);
2317 op3 = TREE_OPERAND (op3, 0);
2318 }
2319
2320 op1 = gimple_assign_rhs2 (def_stmt);
2321 op2 = gimple_assign_rhs3 (def_stmt);
2322
2323 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2324 {
2325 if (dump_enabled_p ())
2326 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2327 "reduction: uses not ssa_names: ");
2328
2329 return NULL;
2330 }
2331 }
2332 else
2333 {
2334 op1 = gimple_assign_rhs1 (def_stmt);
2335 op2 = gimple_assign_rhs2 (def_stmt);
2336
2337 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2338 {
2339 if (dump_enabled_p ())
2340 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2341 "reduction: uses not ssa_names: ");
2342
2343 return NULL;
2344 }
2345 }
2346
2347 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2348 if ((TREE_CODE (op1) == SSA_NAME
2349        && !types_compatible_p (type, TREE_TYPE (op1)))
2350 || (TREE_CODE (op2) == SSA_NAME
2351 && !types_compatible_p (type, TREE_TYPE (op2)))
2352 || (op3 && TREE_CODE (op3) == SSA_NAME
2353 && !types_compatible_p (type, TREE_TYPE (op3)))
2354 || (op4 && TREE_CODE (op4) == SSA_NAME
2355 && !types_compatible_p (type, TREE_TYPE (op4))))
2356 {
2357 if (dump_enabled_p ())
2358 {
2359 dump_printf_loc (MSG_NOTE, vect_location,
2360 "reduction: multiple types: operation type: ");
2361 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2362 dump_printf (MSG_NOTE, ", operands types: ");
2363 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2364 TREE_TYPE (op1));
2365 dump_printf (MSG_NOTE, ",");
2366 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2367 TREE_TYPE (op2));
2368 if (op3)
2369 {
2370 dump_printf (MSG_NOTE, ",");
2371 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2372 TREE_TYPE (op3));
2373 }
2374
2375 if (op4)
2376 {
2377 dump_printf (MSG_NOTE, ",");
2378 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2379 TREE_TYPE (op4));
2380 }
2381 dump_printf (MSG_NOTE, "\n");
2382 }
2383
2384 return NULL;
2385 }
2386
2387 /* Check that it's ok to change the order of the computation.
2388 Generally, when vectorizing a reduction we change the order of the
2389 computation. This may change the behavior of the program in some
2390 cases, so we need to check that this is ok. One exception is when
2391 vectorizing an outer-loop: the inner-loop is executed sequentially,
2392 and therefore vectorizing reductions in the inner-loop during
2393 outer-loop vectorization is safe. */
2394
2395 /* CHECKME: check for !flag_finite_math_only too? */
2396 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2397 && check_reduction)
2398 {
2399 /* Changing the order of operations changes the semantics. */
2400 if (dump_enabled_p ())
2401 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2402 "reduction: unsafe fp math optimization: ");
2403 return NULL;
2404 }
2405 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2406 && check_reduction)
2407 {
2408 /* Changing the order of operations changes the semantics. */
2409 if (dump_enabled_p ())
2410 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2411 "reduction: unsafe int math optimization: ");
2412 return NULL;
2413 }
2414 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2415 {
2416 /* Changing the order of operations changes the semantics. */
2417 if (dump_enabled_p ())
2418 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2419 "reduction: unsafe fixed-point math optimization: ");
2420 return NULL;
2421 }
2422
2423   /* If we detected "res -= x[i]" earlier, rewrite it into
2424      "res += -x[i]" now.  If this turns out to be useless, reassoc
2425      will clean it up again.  */
2426 if (orig_code == MINUS_EXPR)
2427 {
2428 tree rhs = gimple_assign_rhs2 (def_stmt);
2429 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2430 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2431 rhs, NULL);
2432 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2433 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2434 loop_info, NULL));
2435 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2436 gimple_assign_set_rhs2 (def_stmt, negrhs);
2437 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2438 update_stmt (def_stmt);
2439 }
2440
2441 /* Reduction is safe. We're dealing with one of the following:
2442 1) integer arithmetic and no trapv
2443 2) floating point arithmetic, and special flags permit this optimization
2444 3) nested cycle (i.e., outer loop vectorization). */
2445 if (TREE_CODE (op1) == SSA_NAME)
2446 def1 = SSA_NAME_DEF_STMT (op1);
2447
2448 if (TREE_CODE (op2) == SSA_NAME)
2449 def2 = SSA_NAME_DEF_STMT (op2);
2450
2451 if (code != COND_EXPR
2452 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2453 {
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2456 return NULL;
2457 }
2458
2459 /* Check that one def is the reduction def, defined by PHI,
2460 the other def is either defined in the loop ("vect_internal_def"),
2461 or it's an induction (defined by a loop-header phi-node). */
2462
2463 if (def2 && def2 == phi
2464 && (code == COND_EXPR
2465 || !def1 || gimple_nop_p (def1)
2466 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2467 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2468 && (is_gimple_assign (def1)
2469 || is_gimple_call (def1)
2470 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2471 == vect_induction_def
2472 || (gimple_code (def1) == GIMPLE_PHI
2473 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2474 == vect_internal_def
2475 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2476 {
2477 if (dump_enabled_p ())
2478 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2479 return def_stmt;
2480 }
2481
2482 if (def1 && def1 == phi
2483 && (code == COND_EXPR
2484 || !def2 || gimple_nop_p (def2)
2485 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2486 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2487 && (is_gimple_assign (def2)
2488 || is_gimple_call (def2)
2489 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2490 == vect_induction_def
2491 || (gimple_code (def2) == GIMPLE_PHI
2492 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2493 == vect_internal_def
2494 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2495 {
2496 if (check_reduction)
2497 {
2498 /* Swap operands (just for simplicity - so that the rest of the code
2499 can assume that the reduction variable is always the last (second)
2500 argument). */
2501 if (dump_enabled_p ())
2502 report_vect_op (MSG_NOTE, def_stmt,
2503 "detected reduction: need to swap operands: ");
2504
2505 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2506 gimple_assign_rhs2_ptr (def_stmt));
2507
2508 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2509 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2510 }
2511 else
2512 {
2513 if (dump_enabled_p ())
2514 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2515 }
2516
2517 return def_stmt;
2518 }
2519
2520 /* Try to find SLP reduction chain. */
2521 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2522 {
2523 if (dump_enabled_p ())
2524 report_vect_op (MSG_NOTE, def_stmt,
2525 "reduction: detected reduction chain: ");
2526
2527 return def_stmt;
2528 }
2529
2530 if (dump_enabled_p ())
2531 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2532 "reduction: unknown pattern: ");
2533
2534 return NULL;
2535 }
2536
2537 /* Wrapper around vect_is_simple_reduction_1 that will not modify code
2538    in-place.  Arguments are as for that function.  */
2539
2540 static gimple
2541 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2542 bool check_reduction, bool *double_reduc)
2543 {
2544 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2545 double_reduc, false);
2546 }
2547
2548 /* Wrapper around vect_is_simple_reduction_1 that will modify code
2549    in-place when doing so enables detection of more reductions.
2550    Arguments are as for that function.  */
2551
2552 gimple
2553 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2554 bool check_reduction, bool *double_reduc)
2555 {
2556 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2557 double_reduc, true);
2558 }
2559
2560 /* Calculate the cost of one scalar iteration of the loop. */
2561 int
2562 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2563 {
2564 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2565 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2566 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2567 int innerloop_iters, i, stmt_cost;
2568
2569   /* Count statements in the scalar loop.  Use this count as the scalar
2570      cost of a single iteration for now.
2571
2572 TODO: Add outer loop support.
2573
2574 TODO: Consider assigning different costs to different scalar
2575 statements. */
2576
2577 /* FORNOW. */
2578 innerloop_iters = 1;
2579 if (loop->inner)
2580 innerloop_iters = 50; /* FIXME */
2581
2582 for (i = 0; i < nbbs; i++)
2583 {
2584 gimple_stmt_iterator si;
2585 basic_block bb = bbs[i];
2586
2587 if (bb->loop_father == loop->inner)
2588 factor = innerloop_iters;
2589 else
2590 factor = 1;
2591
2592 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2593 {
2594 gimple stmt = gsi_stmt (si);
2595 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2596
2597 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2598 continue;
2599
2600 /* Skip stmts that are not vectorized inside the loop. */
2601 if (stmt_info
2602 && !STMT_VINFO_RELEVANT_P (stmt_info)
2603 && (!STMT_VINFO_LIVE_P (stmt_info)
2604 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2605 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2606 continue;
2607
2608 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2609 {
2610 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2611 stmt_cost = vect_get_stmt_cost (scalar_load);
2612 else
2613 stmt_cost = vect_get_stmt_cost (scalar_store);
2614 }
2615 else
2616 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2617
2618 scalar_single_iter_cost += stmt_cost * factor;
2619 }
2620 }
2621 return scalar_single_iter_cost;
2622 }
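
/* Worked example (per-kind costs are target-dependent; the numbers below
   merely assume a target where each kind costs 1): for a loop body

     a[i] = b[i] + 1;

   containing one load, one add and one store, the sum is
   vect_get_stmt_cost (scalar_load) + vect_get_stmt_cost (scalar_stmt)
   + vect_get_stmt_cost (scalar_store) == 3 per iteration when there is
   no inner loop (factor == 1).  */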
2623
2624 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2625 int
2626 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2627 int *peel_iters_epilogue,
2628 int scalar_single_iter_cost,
2629 stmt_vector_for_cost *prologue_cost_vec,
2630 stmt_vector_for_cost *epilogue_cost_vec)
2631 {
2632 int retval = 0;
2633 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2634
2635 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2636 {
2637 *peel_iters_epilogue = vf/2;
2638 if (dump_enabled_p ())
2639 dump_printf_loc (MSG_NOTE, vect_location,
2640 "cost model: epilogue peel iters set to vf/2 "
2641                          "because loop iterations are unknown.\n");
2642
2643       /* If peeled iterations are known but the number of scalar loop
2644          iterations is unknown, count a taken branch per peeled loop.  */
2645 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2646 NULL, 0, vect_prologue);
2647 }
2648 else
2649 {
2650 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2651 peel_iters_prologue = niters < peel_iters_prologue ?
2652 niters : peel_iters_prologue;
2653 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2654       /* If we need to peel for gaps but no epilogue peeling would otherwise
2655          be required, we have to peel VF iterations.  */
2656 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2657 *peel_iters_epilogue = vf;
2658 }
2659
2660 if (peel_iters_prologue)
2661 retval += record_stmt_cost (prologue_cost_vec,
2662 peel_iters_prologue * scalar_single_iter_cost,
2663 scalar_stmt, NULL, 0, vect_prologue);
2664 if (*peel_iters_epilogue)
2665 retval += record_stmt_cost (epilogue_cost_vec,
2666 *peel_iters_epilogue * scalar_single_iter_cost,
2667 scalar_stmt, NULL, 0, vect_epilogue);
2668 return retval;
2669 }
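
/* Worked example: with LOOP_VINFO_INT_NITERS == 100, VF == 4 and
   PEEL_ITERS_PROLOGUE == 3, the epilogue gets (100 - 3) % 4 == 1
   iteration; the prologue is then charged 3 * SCALAR_SINGLE_ITER_COST
   and the epilogue 1 * SCALAR_SINGLE_ITER_COST as scalar_stmt entries.
   Had the epilogue count come out as 0 with LOOP_VINFO_PEELING_FOR_GAPS
   set, it would be forced up to VF == 4.  */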
2670
2671 /* Function vect_estimate_min_profitable_iters
2672
2673 Return the number of iterations required for the vector version of the
2674 loop to be profitable relative to the cost of the scalar version of the
2675 loop. */
2676
2677 static void
2678 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2679 int *ret_min_profitable_niters,
2680 int *ret_min_profitable_estimate)
2681 {
2682 int min_profitable_iters;
2683 int min_profitable_estimate;
2684 int peel_iters_prologue;
2685 int peel_iters_epilogue;
2686 unsigned vec_inside_cost = 0;
2687 int vec_outside_cost = 0;
2688 unsigned vec_prologue_cost = 0;
2689 unsigned vec_epilogue_cost = 0;
2690 int scalar_single_iter_cost = 0;
2691 int scalar_outside_cost = 0;
2692 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2693 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2694 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2695
2696 /* Cost model disabled. */
2697 if (unlimited_cost_model ())
2698 {
2699 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2700 *ret_min_profitable_niters = 0;
2701 *ret_min_profitable_estimate = 0;
2702 return;
2703 }
2704
2705 /* Requires loop versioning tests to handle misalignment. */
2706 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2707 {
2708 /* FIXME: Make cost depend on complexity of individual check. */
2709 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2710 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2711 vect_prologue);
2712 dump_printf (MSG_NOTE,
2713 "cost model: Adding cost of checks for loop "
2714 "versioning to treat misalignment.\n");
2715 }
2716
2717 /* Requires loop versioning with alias checks. */
2718 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2719 {
2720 /* FIXME: Make cost depend on complexity of individual check. */
2721 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2722 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2723 vect_prologue);
2724 dump_printf (MSG_NOTE,
2725 "cost model: Adding cost of checks for loop "
2726                    "versioning for aliasing.\n");
2727 }
2728
2729 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2730 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2731 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2732 vect_prologue);
2733
2734   /* Count statements in the scalar loop.  Use this count as the scalar
2735      cost of a single iteration for now.
2736
2737 TODO: Add outer loop support.
2738
2739 TODO: Consider assigning different costs to different scalar
2740 statements. */
2741
2742 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2743
2744 /* Add additional cost for the peeled instructions in prologue and epilogue
2745 loop.
2746
2747      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2748      at compile time, we assume it's vf/2 (the worst would be vf-1).
2749
2750 TODO: Build an expression that represents peel_iters for prologue and
2751 epilogue to be used in a run-time test. */
2752
2753 if (npeel < 0)
2754 {
2755 peel_iters_prologue = vf/2;
2756 dump_printf (MSG_NOTE, "cost model: "
2757 "prologue peel iters set to vf/2.\n");
2758
2759       /* If peeling for alignment is unknown, the loop bound of the main loop
2760          becomes unknown.  */
2761 peel_iters_epilogue = vf/2;
2762 dump_printf (MSG_NOTE, "cost model: "
2763 "epilogue peel iters set to vf/2 because "
2764 "peeling for alignment is unknown.\n");
2765
2766 /* If peeled iterations are unknown, count a taken branch and a not taken
2767 branch per peeled loop. Even if scalar loop iterations are known,
2768 vector iterations are not known since peeled prologue iterations are
2769 not known. Hence guards remain the same. */
2770 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2771 NULL, 0, vect_prologue);
2772 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2773 NULL, 0, vect_prologue);
2774 /* FORNOW: Don't attempt to pass individual scalar instructions to
2775 the model; just assume linear cost for scalar iterations. */
2776 (void) add_stmt_cost (target_cost_data,
2777 peel_iters_prologue * scalar_single_iter_cost,
2778 scalar_stmt, NULL, 0, vect_prologue);
2779 (void) add_stmt_cost (target_cost_data,
2780 peel_iters_epilogue * scalar_single_iter_cost,
2781 scalar_stmt, NULL, 0, vect_epilogue);
2782 }
2783 else
2784 {
2785 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2786 stmt_info_for_cost *si;
2787 int j;
2788 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2789
2790 prologue_cost_vec.create (2);
2791 epilogue_cost_vec.create (2);
2792 peel_iters_prologue = npeel;
2793
2794 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2795 &peel_iters_epilogue,
2796 scalar_single_iter_cost,
2797 &prologue_cost_vec,
2798 &epilogue_cost_vec);
2799
2800 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2801 {
2802 struct _stmt_vec_info *stmt_info
2803 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2804 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2805 si->misalign, vect_prologue);
2806 }
2807
2808 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2809 {
2810 struct _stmt_vec_info *stmt_info
2811 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2812 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2813 si->misalign, vect_epilogue);
2814 }
2815
2816 prologue_cost_vec.release ();
2817 epilogue_cost_vec.release ();
2818 }
2819
2820 /* FORNOW: The scalar outside cost is incremented in one of the
2821 following ways:
2822
2823 1. The vectorizer checks for alignment and aliasing and generates
2824 a condition that allows dynamic vectorization. A cost model
2825      check is ANDed with the versioning condition.  Hence the scalar code
2826      path now has the added cost of the versioning check.
2827
2828 if (cost > th & versioning_check)
2829 jmp to vector code
2830
2831      Hence the run-time scalar cost is incremented by a not-taken branch cost.
2832
2833 2. The vectorizer then checks if a prologue is required. If the
2834 cost model check was not done before during versioning, it has to
2835 be done before the prologue check.
2836
2837 if (cost <= th)
2838 prologue = scalar_iters
2839 if (prologue == 0)
2840 jmp to vector code
2841 else
2842 execute prologue
2843 if (prologue == num_iters)
2844 go to exit
2845
2846 Hence the run-time scalar cost is incremented by a taken branch,
2847 plus a not-taken branch, plus a taken branch cost.
2848
2849 3. The vectorizer then checks if an epilogue is required. If the
2850 cost model check was not done before during prologue check, it
2851 has to be done with the epilogue check.
2852
2853 if (prologue == 0)
2854 jmp to vector code
2855 else
2856 execute prologue
2857 if (prologue == num_iters)
2858 go to exit
2859 vector code:
2860 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2861 jmp to epilogue
2862
2863 Hence the run-time scalar cost should be incremented by 2 taken
2864 branches.
2865
2866      TODO: The back end may reorder the BBs differently and reverse
2867 conditions/branch directions. Change the estimates below to
2868 something more reasonable. */
2869
2870 /* If the number of iterations is known and we do not do versioning, we can
2871      decide whether to vectorize at compile time.  Hence the scalar version
2872      does not carry cost model guard costs.  */
2873 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2874 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2875 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2876 {
2877 /* Cost model check occurs at versioning. */
2878 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2879 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2880 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2881 else
2882 {
2883 /* Cost model check occurs at prologue generation. */
2884 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2885 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2886 + vect_get_stmt_cost (cond_branch_not_taken);
2887 /* Cost model check occurs at epilogue generation. */
2888 else
2889 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2890 }
2891 }
2892
2893 /* Complete the target-specific cost calculations. */
2894 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2895 &vec_inside_cost, &vec_epilogue_cost);
2896
2897 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2898
2899 /* Calculate number of iterations required to make the vector version
2900 profitable, relative to the loop bodies only. The following condition
2901 must hold true:
2902 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2903 where
2904 SIC = scalar iteration cost, VIC = vector iteration cost,
2905 VOC = vector outside cost, VF = vectorization factor,
2906      PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
2907 SOC = scalar outside cost for run time cost model check. */
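
/* Worked example with made-up costs: SIC = 4, VIC = 6, VF = 4, VOC = 14,
   SOC = 0 and no peeling (PL_ITERS = EP_ITERS = 0) gives

     min_profitable_iters = (14 - 0) * 4 / (4 * 4 - 6) = 56 / 10 = 5

   by the integer division below.  The follow-up check then sees
   4 * 4 * 5 == 80 <= 6 * 5 + 14 * 4 == 86 and bumps the result to 6,
   the smallest iteration count at which the vector version wins.  */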
2908
2909 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2910 {
2911 if (vec_outside_cost <= 0)
2912 min_profitable_iters = 1;
2913 else
2914 {
2915 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2916 - vec_inside_cost * peel_iters_prologue
2917 - vec_inside_cost * peel_iters_epilogue)
2918 / ((scalar_single_iter_cost * vf)
2919 - vec_inside_cost);
2920
2921 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2922 <= (((int) vec_inside_cost * min_profitable_iters)
2923 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2924 min_profitable_iters++;
2925 }
2926 }
2927   /* The vector version will never be profitable.  */
2928 else
2929 {
2930 if (dump_enabled_p ())
2931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2932 "cost model: the vector iteration cost = %d "
2933 "divided by the scalar iteration cost = %d "
2934                          "is greater than or equal to the vectorization factor = %d"
2935 ".\n",
2936 vec_inside_cost, scalar_single_iter_cost, vf);
2937 *ret_min_profitable_niters = -1;
2938 *ret_min_profitable_estimate = -1;
2939 return;
2940 }
2941
2942 if (dump_enabled_p ())
2943 {
2944 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2945 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2946 vec_inside_cost);
2947 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2948 vec_prologue_cost);
2949 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2950 vec_epilogue_cost);
2951 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2952 scalar_single_iter_cost);
2953 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2954 scalar_outside_cost);
2955 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2956 vec_outside_cost);
2957 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2958 peel_iters_prologue);
2959 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2960 peel_iters_epilogue);
2961 dump_printf (MSG_NOTE,
2962 " Calculated minimum iters for profitability: %d\n",
2963 min_profitable_iters);
2964 dump_printf (MSG_NOTE, "\n");
2965 }
2966
2967 min_profitable_iters =
2968 min_profitable_iters < vf ? vf : min_profitable_iters;
2969
2970 /* Because the condition we create is:
2971 if (niters <= min_profitable_iters)
2972 then skip the vectorized loop. */
2973 min_profitable_iters--;
2974
2975 if (dump_enabled_p ())
2976 dump_printf_loc (MSG_NOTE, vect_location,
2977 " Runtime profitability threshold = %d\n",
2978 min_profitable_iters);
2979
2980 *ret_min_profitable_niters = min_profitable_iters;
2981
2982 /* Calculate number of iterations required to make the vector version
2983 profitable, relative to the loop bodies only.
2984
2985      The non-vectorized variant costs SIC * niters and it must win over the
2986      vector variant on the expected trip count; i.e., the following must hold:
2987      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2988
2989 if (vec_outside_cost <= 0)
2990 min_profitable_estimate = 1;
2991 else
2992 {
2993 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2994 - vec_inside_cost * peel_iters_prologue
2995 - vec_inside_cost * peel_iters_epilogue)
2996 / ((scalar_single_iter_cost * vf)
2997 - vec_inside_cost);
2998 }
2999   min_profitable_estimate--;
3000 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_NOTE, vect_location,
3003 " Static estimate profitability threshold = %d\n",
3004                      min_profitable_estimate);
3005
3006 *ret_min_profitable_estimate = min_profitable_estimate;
3007 }
3008
3009
3010 /* TODO: There is a close dependency between the vect_model_*_cost and the
3011    vectorizable_* functions.  Design a better scheme to avoid maintenance issues.  */
3012
3013 /* Function vect_model_reduction_cost.
3014
3015 Models cost for a reduction operation, including the vector ops
3016 generated within the strip-mine loop, the initial definition before
3017 the loop, and the epilogue code that must be generated. */
3018
3019 static bool
3020 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3021 int ncopies)
3022 {
3023 int prologue_cost = 0, epilogue_cost = 0;
3024 enum tree_code code;
3025 optab optab;
3026 tree vectype;
3027 gimple stmt, orig_stmt;
3028 tree reduction_op;
3029 enum machine_mode mode;
3030 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3031 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3032 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3033
3034 /* Cost of reduction op inside loop. */
3035 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3036 stmt_info, 0, vect_body);
3037 stmt = STMT_VINFO_STMT (stmt_info);
3038
3039 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3040 {
3041 case GIMPLE_SINGLE_RHS:
3042 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3043 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3044 break;
3045 case GIMPLE_UNARY_RHS:
3046 reduction_op = gimple_assign_rhs1 (stmt);
3047 break;
3048 case GIMPLE_BINARY_RHS:
3049 reduction_op = gimple_assign_rhs2 (stmt);
3050 break;
3051 case GIMPLE_TERNARY_RHS:
3052 reduction_op = gimple_assign_rhs3 (stmt);
3053 break;
3054 default:
3055 gcc_unreachable ();
3056 }
3057
3058 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3059 if (!vectype)
3060 {
3061 if (dump_enabled_p ())
3062 {
3063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3064 "unsupported data-type ");
3065 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3066 TREE_TYPE (reduction_op));
3067 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3068 }
3069 return false;
3070 }
3071
3072 mode = TYPE_MODE (vectype);
3073 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3074
3075 if (!orig_stmt)
3076 orig_stmt = STMT_VINFO_STMT (stmt_info);
3077
3078 code = gimple_assign_rhs_code (orig_stmt);
3079
3080 /* Add in cost for initial definition. */
3081 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3082 stmt_info, 0, vect_prologue);
3083
3084 /* Determine cost of epilogue code.
3085
3086 We have a reduction operator that will reduce the vector in one statement.
3087 Also requires scalar extract. */
3088
3089 if (!nested_in_vect_loop_p (loop, orig_stmt))
3090 {
3091 if (reduc_code != ERROR_MARK)
3092 {
3093 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3094 stmt_info, 0, vect_epilogue);
3095 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3096 stmt_info, 0, vect_epilogue);
3097 }
3098 else
3099 {
3100 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3101 tree bitsize =
3102 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3103 int element_bitsize = tree_low_cst (bitsize, 1);
3104 int nelements = vec_size_in_bits / element_bitsize;
3105
3106 optab = optab_for_tree_code (code, vectype, optab_default);
3107
3108 /* We have a whole vector shift available. */
3109 if (VECTOR_MODE_P (mode)
3110 && optab_handler (optab, mode) != CODE_FOR_nothing
3111 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3112 {
3113 /* Final reduction via vector shifts and the reduction operator.
3114 Also requires scalar extract. */
3115 epilogue_cost += add_stmt_cost (target_cost_data,
3116 exact_log2 (nelements) * 2,
3117 vector_stmt, stmt_info, 0,
3118 vect_epilogue);
3119 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3120 vec_to_scalar, stmt_info, 0,
3121 vect_epilogue);
3122 }
3123 else
3124 /* Use extracts and reduction op for final reduction. For N
3125 elements, we have N extracts and N-1 reduction ops. */
3126 epilogue_cost += add_stmt_cost (target_cost_data,
3127 nelements + nelements - 1,
3128 vector_stmt, stmt_info, 0,
3129 vect_epilogue);
3130 }
3131 }
3132
3133 if (dump_enabled_p ())
3134 dump_printf (MSG_NOTE,
3135 "vect_model_reduction_cost: inside_cost = %d, "
3136                  "prologue_cost = %d, epilogue_cost = %d.\n", inside_cost,
3137 prologue_cost, epilogue_cost);
3138
3139 return true;
3140 }
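
/* For illustration, for a four-element sum reduction (NELEMENTS == 4):
   if the target provides the reduction operation, the epilogue is
   charged one vector_stmt plus one vec_to_scalar.  Without it, but with
   a whole-vector shift available, the charge is exact_log2 (4) * 2 == 4
   vector stmts plus one vec_to_scalar; failing that, the extract-based
   fallback costs 4 + 3 == 7 stmts (N extracts and N-1 reduction ops).  */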
3141
3142
3143 /* Function vect_model_induction_cost.
3144
3145 Models cost for induction operations. */
3146
3147 static void
3148 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3149 {
3150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3151 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3152 unsigned inside_cost, prologue_cost;
3153
3154 /* loop cost for vec_loop. */
3155 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3156 stmt_info, 0, vect_body);
3157
3158 /* prologue cost for vec_init and vec_step. */
3159 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3160 stmt_info, 0, vect_prologue);
3161
3162 if (dump_enabled_p ())
3163 dump_printf_loc (MSG_NOTE, vect_location,
3164 "vect_model_induction_cost: inside_cost = %d, "
3165                      "prologue_cost = %d.\n", inside_cost, prologue_cost);
3166 }
3167
3168
3169 /* Function get_initial_def_for_induction
3170
3171 Input:
3172 STMT - a stmt that performs an induction operation in the loop.
3173 IV_PHI - the initial value of the induction variable
3174
3175 Output:
3176 Return a vector variable, initialized with the first VF values of
3177 the induction variable. E.g., for an iv with IV_PHI='X' and
3178 evolution S, for a vector of 4 units, we want to return:
3179 [X, X + S, X + 2*S, X + 3*S]. */
3180
3181 static tree
3182 get_initial_def_for_induction (gimple iv_phi)
3183 {
3184 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3185 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3187 tree vectype;
3188 int nunits;
3189 edge pe = loop_preheader_edge (loop);
3190 struct loop *iv_loop;
3191 basic_block new_bb;
3192 tree new_vec, vec_init, vec_step, t;
3193 tree access_fn;
3194 tree new_var;
3195 tree new_name;
3196 gimple init_stmt, induction_phi, new_stmt;
3197 tree induc_def, vec_def, vec_dest;
3198 tree init_expr, step_expr;
3199 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3200 int i;
3201 bool ok;
3202 int ncopies;
3203 tree expr;
3204 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3205 bool nested_in_vect_loop = false;
3206 gimple_seq stmts = NULL;
3207 imm_use_iterator imm_iter;
3208 use_operand_p use_p;
3209 gimple exit_phi;
3210 edge latch_e;
3211 tree loop_arg;
3212 gimple_stmt_iterator si;
3213 basic_block bb = gimple_bb (iv_phi);
3214 tree stepvectype;
3215 tree resvectype;
3216
3217 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3218 if (nested_in_vect_loop_p (loop, iv_phi))
3219 {
3220 nested_in_vect_loop = true;
3221 iv_loop = loop->inner;
3222 }
3223 else
3224 iv_loop = loop;
3225 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3226
3227 latch_e = loop_latch_edge (iv_loop);
3228 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3229
3230 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3231 gcc_assert (access_fn);
3232 STRIP_NOPS (access_fn);
3233 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3234 &init_expr, &step_expr);
3235 gcc_assert (ok);
3236 pe = loop_preheader_edge (iv_loop);
3237
3238 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3239 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3240 gcc_assert (vectype);
3241 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3242 ncopies = vf / nunits;
3243
3244 gcc_assert (phi_info);
3245 gcc_assert (ncopies >= 1);
3246
3247 /* Find the first insertion point in the BB. */
3248 si = gsi_after_labels (bb);
3249
3250 /* Create the vector that holds the initial_value of the induction. */
3251 if (nested_in_vect_loop)
3252 {
3253       /* iv_loop is nested in the loop to be vectorized.  init_expr has already
3254          been created during vectorization of previous stmts.  We obtain it
3255 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3256 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3257 loop_preheader_edge (iv_loop));
3258 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3259 /* If the initial value is not of proper type, convert it. */
3260 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3261 {
3262 new_stmt = gimple_build_assign_with_ops
3263 (VIEW_CONVERT_EXPR,
3264 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3265 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3266 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3267 gimple_assign_set_lhs (new_stmt, vec_init);
3268 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3269 new_stmt);
3270 gcc_assert (!new_bb);
3271 set_vinfo_for_stmt (new_stmt,
3272 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3273 }
3274 }
3275 else
3276 {
3277 vec<constructor_elt, va_gc> *v;
3278
3279 /* iv_loop is the loop to be vectorized. Create:
3280 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3281 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3282 vect_scalar_var, "var_");
3283 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3284 init_expr),
3285 &stmts, false, new_var);
3286 if (stmts)
3287 {
3288 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3289 gcc_assert (!new_bb);
3290 }
3291
3292 vec_alloc (v, nunits);
3293 bool constant_p = is_gimple_min_invariant (new_name);
3294 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3295 for (i = 1; i < nunits; i++)
3296 {
3297 /* Create: new_name_i = new_name + step_expr */
3298 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3299 new_name, step_expr);
3300 if (!is_gimple_min_invariant (new_name))
3301 {
3302 init_stmt = gimple_build_assign (new_var, new_name);
3303 new_name = make_ssa_name (new_var, init_stmt);
3304 gimple_assign_set_lhs (init_stmt, new_name);
3305 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3306 gcc_assert (!new_bb);
3307 if (dump_enabled_p ())
3308 {
3309 dump_printf_loc (MSG_NOTE, vect_location,
3310 "created new init_stmt: ");
3311 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3312 dump_printf (MSG_NOTE, "\n");
3313 }
3314 constant_p = false;
3315 }
3316 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3317 }
3318 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3319 if (constant_p)
3320 new_vec = build_vector_from_ctor (vectype, v);
3321 else
3322 new_vec = build_constructor (vectype, v);
3323 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3324 }
3325
3326
3327 /* Create the vector that holds the step of the induction. */
3328 if (nested_in_vect_loop)
3329 /* iv_loop is nested in the loop to be vectorized. Generate:
3330 vec_step = [S, S, S, S] */
3331 new_name = step_expr;
3332 else
3333 {
3334 /* iv_loop is the loop to be vectorized. Generate:
3335 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3336 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3337 {
3338 expr = build_int_cst (integer_type_node, vf);
3339 expr = fold_convert (TREE_TYPE (step_expr), expr);
3340 }
3341 else
3342 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3343 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3344 expr, step_expr);
3345 if (TREE_CODE (step_expr) == SSA_NAME)
3346 new_name = vect_init_vector (iv_phi, new_name,
3347 TREE_TYPE (step_expr), NULL);
3348 }
3349
3350 t = unshare_expr (new_name);
3351 gcc_assert (CONSTANT_CLASS_P (new_name)
3352 || TREE_CODE (new_name) == SSA_NAME);
3353 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3354 gcc_assert (stepvectype);
3355 new_vec = build_vector_from_val (stepvectype, t);
3356 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3357
3358
3359 /* Create the following def-use cycle:
3360 loop prolog:
3361 vec_init = ...
3362 vec_step = ...
3363 loop:
3364 vec_iv = PHI <vec_init, vec_loop>
3365 ...
3366 STMT
3367 ...
3368 vec_loop = vec_iv + vec_step; */
3369
3370 /* Create the induction-phi that defines the induction-operand. */
3371 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3372 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3373 set_vinfo_for_stmt (induction_phi,
3374 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3375 induc_def = PHI_RESULT (induction_phi);
3376
3377 /* Create the iv update inside the loop */
3378 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3379 induc_def, vec_step);
3380 vec_def = make_ssa_name (vec_dest, new_stmt);
3381 gimple_assign_set_lhs (new_stmt, vec_def);
3382 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3383 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3384 NULL));
3385
3386 /* Set the arguments of the phi node: */
3387 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3388 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3389 UNKNOWN_LOCATION);
3390
3391
3392   /* In case the vectorization factor (VF) is bigger than the number
3393      of elements that we can fit in a vectype (nunits), we have to generate
3394      more than one vector stmt, i.e., we need to "unroll" the
3395      vector stmt by a factor of VF/nunits.  For more details see the
3396      documentation in vectorizable_operation.  */
3397
3398 if (ncopies > 1)
3399 {
3400 stmt_vec_info prev_stmt_vinfo;
3401 /* FORNOW. This restriction should be relaxed. */
3402 gcc_assert (!nested_in_vect_loop);
3403
3404 /* Create the vector that holds the step of the induction. */
3405 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3406 {
3407 expr = build_int_cst (integer_type_node, nunits);
3408 expr = fold_convert (TREE_TYPE (step_expr), expr);
3409 }
3410 else
3411 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3412 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3413 expr, step_expr);
3414 if (TREE_CODE (step_expr) == SSA_NAME)
3415 new_name = vect_init_vector (iv_phi, new_name,
3416 TREE_TYPE (step_expr), NULL);
3417 t = unshare_expr (new_name);
3418 gcc_assert (CONSTANT_CLASS_P (new_name)
3419 || TREE_CODE (new_name) == SSA_NAME);
3420 new_vec = build_vector_from_val (stepvectype, t);
3421 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3422
3423 vec_def = induc_def;
3424 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3425 for (i = 1; i < ncopies; i++)
3426 {
3427 /* vec_i = vec_prev + vec_step */
3428 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3429 vec_def, vec_step);
3430 vec_def = make_ssa_name (vec_dest, new_stmt);
3431 gimple_assign_set_lhs (new_stmt, vec_def);
3432
3433 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3434 if (!useless_type_conversion_p (resvectype, vectype))
3435 {
3436 new_stmt = gimple_build_assign_with_ops
3437 (VIEW_CONVERT_EXPR,
3438 vect_get_new_vect_var (resvectype, vect_simple_var,
3439 "vec_iv_"),
3440 build1 (VIEW_CONVERT_EXPR, resvectype,
3441 gimple_assign_lhs (new_stmt)), NULL_TREE);
3442 gimple_assign_set_lhs (new_stmt,
3443 make_ssa_name
3444 (gimple_assign_lhs (new_stmt), new_stmt));
3445 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3446 }
3447 set_vinfo_for_stmt (new_stmt,
3448 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3449 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3450 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3451 }
3452 }
3453
3454 if (nested_in_vect_loop)
3455 {
3456 /* Find the loop-closed exit-phi of the induction, and record
3457 the final vector of induction results: */
3458 exit_phi = NULL;
3459 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3460 {
3461 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3462 {
3463 exit_phi = USE_STMT (use_p);
3464 break;
3465 }
3466 }
3467 if (exit_phi)
3468 {
3469 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3470       /* FORNOW.  We do not yet support an inner-loop induction that is not
3471          used in the outer-loop (i.e. is used only outside the outer-loop).  */
3472 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3473 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3474
3475 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3476 if (dump_enabled_p ())
3477 {
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "vector of inductions after inner-loop:");
3480 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3481 dump_printf (MSG_NOTE, "\n");
3482 }
3483 }
3484 }
3485
3486
3487 if (dump_enabled_p ())
3488 {
3489 dump_printf_loc (MSG_NOTE, vect_location,
3490 "transform induction: created def-use cycle: ");
3491 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3492 dump_printf (MSG_NOTE, "\n");
3493 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3494 SSA_NAME_DEF_STMT (vec_def), 0);
3495 dump_printf (MSG_NOTE, "\n");
3496 }
3497
3498 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3499 if (!useless_type_conversion_p (resvectype, vectype))
3500 {
3501 new_stmt = gimple_build_assign_with_ops
3502 (VIEW_CONVERT_EXPR,
3503 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3504 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3505 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3506 gimple_assign_set_lhs (new_stmt, induc_def);
3507 si = gsi_after_labels (bb);
3508 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3509 set_vinfo_for_stmt (new_stmt,
3510 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3511 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3512 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3513 }
3514
3515 return induc_def;
3516 }
3517
3518
3519 /* Function get_initial_def_for_reduction
3520
3521 Input:
3522 STMT - a stmt that performs a reduction operation in the loop.
3523 INIT_VAL - the initial value of the reduction variable
3524
3525 Output:
3526 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3527 of the reduction (used for adjusting the epilog - see below).
3528 Return a vector variable, initialized according to the operation that STMT
3529 performs. This vector will be used as the initial value of the
3530 vector of partial results.
3531
3532 Option1 (adjust in epilog): Initialize the vector as follows:
3533 add/bit or/xor: [0,0,...,0,0]
3534 mult/bit and: [1,1,...,1,1]
3535 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3536 and when necessary (e.g. add/mult case) let the caller know
3537 that it needs to adjust the result by init_val.
3538
3539 Option2: Initialize the vector as follows:
3540 add/bit or/xor: [init_val,0,0,...,0]
3541 mult/bit and: [init_val,1,1,...,1]
3542 min/max/cond_expr: [init_val,init_val,...,init_val]
3543 and no adjustments are needed.
3544
3545 For example, for the following code:
3546
3547 s = init_val;
3548 for (i=0;i<n;i++)
3549 s = s + a[i];
3550
3551 STMT is 's = s + a[i]', and the reduction variable is 's'.
3552 For a vector of 4 units, we want to return either [0,0,0,init_val],
3553 or [0,0,0,0] and let the caller know that it needs to adjust
3554 the result at the end by 'init_val'.
3555
3556 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3557 is not NULL, because its initialization vector is simpler (the same
3558 element in all entries), and Option2 otherwise.
3559
3560 A cost model should help decide between these two schemes. */
3561
3562 tree
3563 get_initial_def_for_reduction (gimple stmt, tree init_val,
3564 tree *adjustment_def)
3565 {
3566 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3567 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3568 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3569 tree scalar_type = TREE_TYPE (init_val);
3570 tree vectype = get_vectype_for_scalar_type (scalar_type);
3571 int nunits;
3572 enum tree_code code = gimple_assign_rhs_code (stmt);
3573 tree def_for_init;
3574 tree init_def;
3575 tree *elts;
3576 int i;
3577 bool nested_in_vect_loop = false;
3578 tree init_value;
3579 REAL_VALUE_TYPE real_init_val = dconst0;
3580 int int_init_val = 0;
3581 gimple def_stmt = NULL;
3582
3583 gcc_assert (vectype);
3584 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3585
3586 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3587 || SCALAR_FLOAT_TYPE_P (scalar_type));
3588
3589 if (nested_in_vect_loop_p (loop, stmt))
3590 nested_in_vect_loop = true;
3591 else
3592 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3593
3594 /* In case of a double reduction we only create a vector variable to be put
3595 in the reduction phi node. The actual statement creation is done in
3596 vect_create_epilog_for_reduction. */
3597 if (adjustment_def && nested_in_vect_loop
3598 && TREE_CODE (init_val) == SSA_NAME
3599 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3600 && gimple_code (def_stmt) == GIMPLE_PHI
3601 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3602 && vinfo_for_stmt (def_stmt)
3603 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3604 == vect_double_reduction_def)
3605 {
3606 *adjustment_def = NULL;
3607 return vect_create_destination_var (init_val, vectype);
3608 }
3609
3610 if (TREE_CONSTANT (init_val))
3611 {
3612 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3613 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3614 else
3615 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3616 }
3617 else
3618 init_value = init_val;
3619
3620 switch (code)
3621 {
3622 case WIDEN_SUM_EXPR:
3623 case DOT_PROD_EXPR:
3624 case PLUS_EXPR:
3625 case MINUS_EXPR:
3626 case BIT_IOR_EXPR:
3627 case BIT_XOR_EXPR:
3628 case MULT_EXPR:
3629 case BIT_AND_EXPR:
3630 /* ADJUSTMENT_DEF is NULL when called from
3631 vect_create_epilog_for_reduction to vectorize double reduction. */
3632 if (adjustment_def)
3633 {
3634 if (nested_in_vect_loop)
3635 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3636 NULL);
3637 else
3638 *adjustment_def = init_val;
3639 }
3640
3641 if (code == MULT_EXPR)
3642 {
3643 real_init_val = dconst1;
3644 int_init_val = 1;
3645 }
3646
3647 if (code == BIT_AND_EXPR)
3648 int_init_val = -1;
3649
3650 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3651 def_for_init = build_real (scalar_type, real_init_val);
3652 else
3653 def_for_init = build_int_cst (scalar_type, int_init_val);
3654
3655 /* Fill all vector elements but the first with '0' or '1'. */
3656 elts = XALLOCAVEC (tree, nunits);
3657 for (i = nunits - 2; i >= 0; --i)
3658 elts[i + 1] = def_for_init;
3659
3660 /* Option1: the first element is '0' or '1' as well. */
3661 if (adjustment_def)
3662 {
3663 elts[0] = def_for_init;
3664 init_def = build_vector (vectype, elts);
3665 break;
3666 }
3667
3668 /* Option2: the first element is INIT_VAL. */
3669 elts[0] = init_val;
3670 if (TREE_CONSTANT (init_val))
3671 init_def = build_vector (vectype, elts);
3672 else
3673 {
3674 vec<constructor_elt, va_gc> *v;
3675 vec_alloc (v, nunits);
3676 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3677 for (i = 1; i < nunits; ++i)
3678 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3679 init_def = build_constructor (vectype, v);
3680 }
3681
3682 break;
3683
3684 case MIN_EXPR:
3685 case MAX_EXPR:
3686 case COND_EXPR:
3687 if (adjustment_def)
3688 {
3689 *adjustment_def = NULL_TREE;
3690 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3691 break;
3692 }
3693
3694 init_def = build_vector_from_val (vectype, init_value);
3695 break;
3696
3697 default:
3698 gcc_unreachable ();
3699 }
3700
3701 return init_def;
3702 }
3703
3704
3705 /* Function vect_create_epilog_for_reduction
3706
3707 Create code at the loop-epilog to finalize the result of a reduction
3708 computation.
3709
3710 VECT_DEFS is a list of the vectors of partial results, i.e., the lhs's of
3711 the vector reduction statements.
3712 STMT is the scalar reduction stmt that is being vectorized.
3713 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3714 number of elements that we can fit in a vectype (nunits). In this case
3715 we have to generate more than one vector stmt - i.e - we need to "unroll"
3716 the vector stmt by a factor VF/nunits. For more details see documentation
3717 in vectorizable_operation.
3718 REDUC_CODE is the tree-code for the epilog reduction.
3719 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3720 computation.
3721 REDUC_INDEX is the index of the operand in the right hand side of the
3722 statement that is defined by REDUCTION_PHI.
3723 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3724 SLP_NODE is an SLP node containing a group of reduction statements. The
3725 first one in this group is STMT.
3726
3727 This function:
3728 1. Creates the reduction def-use cycles: sets the arguments for
3729 REDUCTION_PHIS:
3730 The loop-entry argument is the vectorized initial-value of the reduction.
3731 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3732 sums.
3733 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3734 by applying the operation specified by REDUC_CODE if available, or by
3735 other means (whole-vector shifts or a scalar loop).
3736 The function also creates a new phi node at the loop exit to preserve
3737 loop-closed form, as illustrated below.
3738
3739 The flow at the entry to this function:
3740
3741 loop:
3742 vec_def = phi <null, null> # REDUCTION_PHI
3743 VECT_DEF = vector_stmt # vectorized form of STMT
3744 s_loop = scalar_stmt # (scalar) STMT
3745 loop_exit:
3746 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3747 use <s_out0>
3748 use <s_out0>
3749
3750 The above is transformed by this function into:
3751
3752 loop:
3753 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3754 VECT_DEF = vector_stmt # vectorized form of STMT
3755 s_loop = scalar_stmt # (scalar) STMT
3756 loop_exit:
3757 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3758 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3759 v_out2 = reduce <v_out1>
3760 s_out3 = extract_field <v_out2, 0>
3761 s_out4 = adjust_result <s_out3>
3762 use <s_out4>
3763 use <s_out4>
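 
   For example, for a sum reduction, 'reduce <v_out1>' adds up the
   elements of v_out1, 'extract_field' reads the resulting scalar sum
   out of v_out2, and 'adjust_result' adds back the initial value of
   the reduction when the 'adjust in epilog' scheme of
   get_initial_def_for_reduction was used.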
3764 */
3765
3766 static void
3767 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3768 int ncopies, enum tree_code reduc_code,
3769 vec<gimple> reduction_phis,
3770 int reduc_index, bool double_reduc,
3771 slp_tree slp_node)
3772 {
3773 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3774 stmt_vec_info prev_phi_info;
3775 tree vectype;
3776 enum machine_mode mode;
3777 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3778 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3779 basic_block exit_bb;
3780 tree scalar_dest;
3781 tree scalar_type;
3782 gimple new_phi = NULL, phi;
3783 gimple_stmt_iterator exit_gsi;
3784 tree vec_dest;
3785 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3786 gimple epilog_stmt = NULL;
3787 enum tree_code code = gimple_assign_rhs_code (stmt);
3788 gimple exit_phi;
3789 tree bitsize, bitpos;
3790 tree adjustment_def = NULL;
3791 tree vec_initial_def = NULL;
3792 tree reduction_op, expr, def;
3793 tree orig_name, scalar_result;
3794 imm_use_iterator imm_iter, phi_imm_iter;
3795 use_operand_p use_p, phi_use_p;
3796 bool extract_scalar_result = false;
3797 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3798 bool nested_in_vect_loop = false;
3799 vec<gimple> new_phis = vNULL;
3800 vec<gimple> inner_phis = vNULL;
3801 enum vect_def_type dt = vect_unknown_def_type;
3802 int j, i;
3803 vec<tree> scalar_results = vNULL;
3804 unsigned int group_size = 1, k, ratio;
3805 vec<tree> vec_initial_defs = vNULL;
3806 vec<gimple> phis;
3807 bool slp_reduc = false;
3808 tree new_phi_result;
3809 gimple inner_phi = NULL;
3810
3811 if (slp_node)
3812 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3813
3814 if (nested_in_vect_loop_p (loop, stmt))
3815 {
3816 outer_loop = loop;
3817 loop = loop->inner;
3818 nested_in_vect_loop = true;
3819 gcc_assert (!slp_node);
3820 }
3821
3822 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3823 {
3824 case GIMPLE_SINGLE_RHS:
3825 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3826 == ternary_op);
3827 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3828 break;
3829 case GIMPLE_UNARY_RHS:
3830 reduction_op = gimple_assign_rhs1 (stmt);
3831 break;
3832 case GIMPLE_BINARY_RHS:
3833 reduction_op = reduc_index ?
3834 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3835 break;
3836 case GIMPLE_TERNARY_RHS:
3837 reduction_op = gimple_op (stmt, reduc_index + 1);
3838 break;
3839 default:
3840 gcc_unreachable ();
3841 }
3842
3843 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3844 gcc_assert (vectype);
3845 mode = TYPE_MODE (vectype);
3846
3847 /* 1. Create the reduction def-use cycle:
3848 Set the arguments of REDUCTION_PHIS, i.e., transform
3849
3850 loop:
3851 vec_def = phi <null, null> # REDUCTION_PHI
3852 VECT_DEF = vector_stmt # vectorized form of STMT
3853 ...
3854
3855 into:
3856
3857 loop:
3858 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3859 VECT_DEF = vector_stmt # vectorized form of STMT
3860 ...
3861
3862 (in case of SLP, do it for all the phis). */
3863
3864 /* Get the loop-entry arguments. */
3865 if (slp_node)
3866 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3867 NULL, slp_node, reduc_index);
3868 else
3869 {
3870 vec_initial_defs.create (1);
3871 /* For the case of reduction, vect_get_vec_def_for_operand returns
3872 the scalar def before the loop, which defines the initial value
3873 of the reduction variable. */
3874 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3875 &adjustment_def);
3876 vec_initial_defs.quick_push (vec_initial_def);
3877 }
3878
3879 /* Set phi nodes arguments. */
3880 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3881 {
3882 tree vec_init_def = vec_initial_defs[i];
3883 tree def = vect_defs[i];
3884 for (j = 0; j < ncopies; j++)
3885 {
3886 /* Set the loop-entry arg of the reduction-phi. */
3887 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3888 UNKNOWN_LOCATION);
3889
3890 /* Set the loop-latch arg for the reduction-phi. */
3891 if (j > 0)
3892 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3893
3894 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3895
3896 if (dump_enabled_p ())
3897 {
3898 dump_printf_loc (MSG_NOTE, vect_location,
3899 "transform reduction: created def-use cycle: ");
3900 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3901 dump_printf (MSG_NOTE, "\n");
3902 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3903 dump_printf (MSG_NOTE, "\n");
3904 }
3905
3906 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3907 }
3908 }
3909
3910 vec_initial_defs.release ();
3911
3912 /* 2. Create epilog code.
3913 The reduction epilog code operates across the elements of the vector
3914 of partial results computed by the vectorized loop.
3915 The reduction epilog code consists of:
3916
3917 step 1: compute the scalar result in a vector (v_out2)
3918 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3919 step 3: adjust the scalar result (s_out3) if needed.
3920
3921 Step 1 can be accomplished using one of the following three schemes:
3922 (scheme 1) using reduc_code, if available.
3923 (scheme 2) using whole-vector shifts, if available.
3924 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3925 combined.
3926
3927 The overall epilog code looks like this:
3928
3929 s_out0 = phi <s_loop> # original EXIT_PHI
3930 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3931 v_out2 = reduce <v_out1> # step 1
3932 s_out3 = extract_field <v_out2, 0> # step 2
3933 s_out4 = adjust_result <s_out3> # step 3
3934
3935 (step 3 is optional, and steps 1 and 2 may be combined).
3936 Lastly, the uses of s_out0 are replaced by s_out4. */
3937
3938
3939 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3940 v_out1 = phi <VECT_DEF>
3941 Store them in NEW_PHIS. */
3942
3943 exit_bb = single_exit (loop)->dest;
3944 prev_phi_info = NULL;
3945 new_phis.create (vect_defs.length ());
3946 FOR_EACH_VEC_ELT (vect_defs, i, def)
3947 {
3948 for (j = 0; j < ncopies; j++)
3949 {
3950 tree new_def = copy_ssa_name (def, NULL);
3951 phi = create_phi_node (new_def, exit_bb);
3952 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3953 if (j == 0)
3954 new_phis.quick_push (phi);
3955 else
3956 {
3957 def = vect_get_vec_def_for_stmt_copy (dt, def);
3958 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3959 }
3960
3961 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3962 prev_phi_info = vinfo_for_stmt (phi);
3963 }
3964 }
3965
3966 /* The epilogue is created for the outer-loop, i.e., for the loop being
3967 vectorized. Create exit phis for the outer loop. */
3968 if (double_reduc)
3969 {
3970 loop = outer_loop;
3971 exit_bb = single_exit (loop)->dest;
3972 inner_phis.create (vect_defs.length ());
3973 FOR_EACH_VEC_ELT (new_phis, i, phi)
3974 {
3975 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3976 gimple outer_phi = create_phi_node (new_result, exit_bb);
3977 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3978 PHI_RESULT (phi));
3979 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3980 loop_vinfo, NULL));
3981 inner_phis.quick_push (phi);
3982 new_phis[i] = outer_phi;
3983 prev_phi_info = vinfo_for_stmt (outer_phi);
3984 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3985 {
3986 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3987 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3988 outer_phi = create_phi_node (new_result, exit_bb);
3989 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3990 PHI_RESULT (phi));
3991 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3992 loop_vinfo, NULL));
3993 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3994 prev_phi_info = vinfo_for_stmt (outer_phi);
3995 }
3996 }
3997 }
3998
3999 exit_gsi = gsi_after_labels (exit_bb);
4000
4001 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4002 (i.e. when reduc_code is not available) and in the final adjustment
4003 code (if needed). Also get the original scalar reduction variable as
4004 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4005 represents a reduction pattern), the tree-code and scalar-def are
4006 taken from the original stmt that the pattern-stmt (STMT) replaces.
4007 Otherwise (it is a regular reduction) the tree-code and scalar-def
4008 are taken from STMT. */
4009
4010 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4011 if (!orig_stmt)
4012 {
4013 /* Regular reduction */
4014 orig_stmt = stmt;
4015 }
4016 else
4017 {
4018 /* Reduction pattern */
4019 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4020 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4021 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4022 }
4023
4024 code = gimple_assign_rhs_code (orig_stmt);
4025 /* For MINUS_EXPR the initial vector is [init_val,0,...,0]; therefore,
4026 partial results are added and not subtracted. */
4027 if (code == MINUS_EXPR)
4028 code = PLUS_EXPR;
4029
4030 scalar_dest = gimple_assign_lhs (orig_stmt);
4031 scalar_type = TREE_TYPE (scalar_dest);
4032 scalar_results.create (group_size);
4033 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4034 bitsize = TYPE_SIZE (scalar_type);
4035
4036 /* In case this is a reduction in an inner-loop while vectorizing an outer
4037 loop, we don't need to extract a single scalar result at the end of the
4038 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4039 outside the outer-loop). The final vector of partial results will be used
4040 in the vectorized outer-loop, or reduced to a scalar result at the end of
4041 the outer-loop. */
4042 if (nested_in_vect_loop && !double_reduc)
4043 goto vect_finalize_reduction;
4044
4045 /* SLP reduction without reduction chain, e.g.,
4046 # a1 = phi <a2, a0>
4047 # b1 = phi <b2, b0>
4048 a2 = operation (a1)
4049 b2 = operation (b1) */
4050 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4051
4052 /* In case of reduction chain, e.g.,
4053 # a1 = phi <a3, a0>
4054 a2 = operation (a1)
4055 a3 = operation (a2),
4056
4057 we may end up with more than one vector result. Here we reduce them to
4058 one vector. */
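/* E.g., if a sum chain produced two partial-result vectors v0 and v1,
   the code below emits v = v0 + v1 and the epilog continues with v.  */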
4059 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4060 {
4061 tree first_vect = PHI_RESULT (new_phis[0]);
4062 tree tmp;
4063 gimple new_vec_stmt = NULL;
4064
4065 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4066 for (k = 1; k < new_phis.length (); k++)
4067 {
4068 gimple next_phi = new_phis[k];
4069 tree second_vect = PHI_RESULT (next_phi);
4070
4071 tmp = build2 (code, vectype, first_vect, second_vect);
4072 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4073 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4074 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4075 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4076 }
4077
4078 new_phi_result = first_vect;
4079 if (new_vec_stmt)
4080 {
4081 new_phis.truncate (0);
4082 new_phis.safe_push (new_vec_stmt);
4083 }
4084 }
4085 else
4086 new_phi_result = PHI_RESULT (new_phis[0]);
4087
4088 /* 2.3 Create the reduction code, using one of the three schemes described
4089 above. In SLP we simply need to extract all the elements from the
4090 vector (without reducing them), so we extract the elements one by one (scheme 3). */
4091 if (reduc_code != ERROR_MARK && !slp_reduc)
4092 {
4093 tree tmp;
4094
4095 /*** Case 1: Create:
4096 v_out2 = reduc_expr <v_out1> */
4097
4098 if (dump_enabled_p ())
4099 dump_printf_loc (MSG_NOTE, vect_location,
4100 "Reduce using direct vector reduction.\n");
4101
4102 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4103 tmp = build1 (reduc_code, vectype, new_phi_result);
4104 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4105 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4106 gimple_assign_set_lhs (epilog_stmt, new_temp);
4107 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4108
4109 extract_scalar_result = true;
4110 }
4111 else
4112 {
4113 enum tree_code shift_code = ERROR_MARK;
4114 bool have_whole_vector_shift = true;
4115 int bit_offset;
4116 int element_bitsize = tree_low_cst (bitsize, 1);
4117 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4118 tree vec_temp;
4119
4120 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4121 shift_code = VEC_RSHIFT_EXPR;
4122 else
4123 have_whole_vector_shift = false;
4124
4125 /* Regardless of whether we have a whole vector shift, if we're
4126 emulating the operation via tree-vect-generic, we don't want
4127 to use it. Only the first round of the reduction is likely
4128 to still be profitable via emulation. */
4129 /* ??? It might be better to emit a reduction tree code here, so that
4130 tree-vect-generic can expand the first round via bit tricks. */
4131 if (!VECTOR_MODE_P (mode))
4132 have_whole_vector_shift = false;
4133 else
4134 {
4135 optab optab = optab_for_tree_code (code, vectype, optab_default);
4136 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4137 have_whole_vector_shift = false;
4138 }
4139
4140 if (have_whole_vector_shift && !slp_reduc)
4141 {
4142 /*** Case 2: Create:
4143 for (offset = VS/2; offset >= element_size; offset/=2)
4144 {
4145 Create: va' = vec_shift <va, offset>
4146 Create: va = vop <va, va'>
4147 } */
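/* For example, for a vector of four ints (vector size 128 bits,
   element size 32) and a sum reduction, this emits:

     va' = vec_shift <va, 64>
     va  = va + va'
     va' = vec_shift <va, 32>
     va  = va + va'

   after which the complete sum is in the element that step 2.4
   below extracts.  */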
4148
4149 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_NOTE, vect_location,
4151 "Reduce using vector shifts\n");
4152
4153 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4154 new_temp = new_phi_result;
4155 for (bit_offset = vec_size_in_bits/2;
4156 bit_offset >= element_bitsize;
4157 bit_offset /= 2)
4158 {
4159 tree bitpos = size_int (bit_offset);
4160
4161 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4162 vec_dest, new_temp, bitpos);
4163 new_name = make_ssa_name (vec_dest, epilog_stmt);
4164 gimple_assign_set_lhs (epilog_stmt, new_name);
4165 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4166
4167 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4168 new_name, new_temp);
4169 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4170 gimple_assign_set_lhs (epilog_stmt, new_temp);
4171 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4172 }
4173
4174 extract_scalar_result = true;
4175 }
4176 else
4177 {
4178 tree rhs;
4179
4180 /*** Case 3: Create:
4181 s = extract_field <v_out2, 0>
4182 for (offset = element_size;
4183 offset < vector_size;
4184 offset += element_size;)
4185 {
4186 Create: s' = extract_field <v_out2, offset>
4187 Create: s = op <s, s'> // For non SLP cases
4188 } */
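/* E.g., for a vector of four 32-bit ints and a sum reduction, this
   emits (in the non-SLP case):

     s  = extract_field <v_out2, 0>
     s' = extract_field <v_out2, 32>;  s = s + s'
     s' = extract_field <v_out2, 64>;  s = s + s'
     s' = extract_field <v_out2, 96>;  s = s + s'  */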
4189
4190 if (dump_enabled_p ())
4191 dump_printf_loc (MSG_NOTE, vect_location,
4192 "Reduce using scalar code.\n");
4193
4194 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4195 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4196 {
4197 if (gimple_code (new_phi) == GIMPLE_PHI)
4198 vec_temp = PHI_RESULT (new_phi);
4199 else
4200 vec_temp = gimple_assign_lhs (new_phi);
4201 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4202 bitsize_zero_node);
4203 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4204 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4205 gimple_assign_set_lhs (epilog_stmt, new_temp);
4206 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4207
4208 /* In SLP we don't need to apply the reduction operation, so we just
4209 collect s' values in SCALAR_RESULTS. */
4210 if (slp_reduc)
4211 scalar_results.safe_push (new_temp);
4212
4213 for (bit_offset = element_bitsize;
4214 bit_offset < vec_size_in_bits;
4215 bit_offset += element_bitsize)
4216 {
4217 tree bitpos = bitsize_int (bit_offset);
4218 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4219 bitsize, bitpos);
4220
4221 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4222 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4223 gimple_assign_set_lhs (epilog_stmt, new_name);
4224 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4225
4226 if (slp_reduc)
4227 {
4228 /* In SLP we don't need to apply the reduction operation, so
4229 we just collect s' values in SCALAR_RESULTS. */
4230 new_temp = new_name;
4231 scalar_results.safe_push (new_name);
4232 }
4233 else
4234 {
4235 epilog_stmt = gimple_build_assign_with_ops (code,
4236 new_scalar_dest, new_name, new_temp);
4237 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4238 gimple_assign_set_lhs (epilog_stmt, new_temp);
4239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4240 }
4241 }
4242 }
4243
4244 /* The only case where we need to reduce scalar results in SLP is
4245 unrolling. If the size of SCALAR_RESULTS is greater than
4246 GROUP_SIZE, we reduce them by combining elements modulo
4247 GROUP_SIZE. */
4248 if (slp_reduc)
4249 {
4250 tree res, first_res, new_res;
4251 gimple new_stmt;
4252
4253 /* Reduce multiple scalar results in case of SLP unrolling. */
4254 for (j = group_size; scalar_results.iterate (j, &res);
4255 j++)
4256 {
4257 first_res = scalar_results[j % group_size];
4258 new_stmt = gimple_build_assign_with_ops (code,
4259 new_scalar_dest, first_res, res);
4260 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4261 gimple_assign_set_lhs (new_stmt, new_res);
4262 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4263 scalar_results[j % group_size] = new_res;
4264 }
4265 }
4266 else
4267 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4268 scalar_results.safe_push (new_temp);
4269
4270 extract_scalar_result = false;
4271 }
4272 }
4273
4274 /* 2.4 Extract the final scalar result. Create:
4275 s_out3 = extract_field <v_out2, bitpos> */
4276
4277 if (extract_scalar_result)
4278 {
4279 tree rhs;
4280
4281 if (dump_enabled_p ())
4282 dump_printf_loc (MSG_NOTE, vect_location,
4283 "extract scalar result\n");
4284
4285 if (BYTES_BIG_ENDIAN)
4286 bitpos = size_binop (MULT_EXPR,
4287 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4288 TYPE_SIZE (scalar_type));
4289 else
4290 bitpos = bitsize_zero_node;
4291
4292 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4293 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4294 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4295 gimple_assign_set_lhs (epilog_stmt, new_temp);
4296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4297 scalar_results.safe_push (new_temp);
4298 }
4299
4300 vect_finalize_reduction:
4301
4302 if (double_reduc)
4303 loop = loop->inner;
4304
4305 /* 2.5 Adjust the final result by the initial value of the reduction
4306 variable. (When no such adjustment is needed,
4307 'adjustment_def' is zero.) For example, if code is PLUS we create:
4308 new_temp = loop_exit_def + adjustment_def */
4309
4310 if (adjustment_def)
4311 {
4312 gcc_assert (!slp_reduc);
4313 if (nested_in_vect_loop)
4314 {
4315 new_phi = new_phis[0];
4316 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4317 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4318 new_dest = vect_create_destination_var (scalar_dest, vectype);
4319 }
4320 else
4321 {
4322 new_temp = scalar_results[0];
4323 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4324 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4325 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4326 }
4327
4328 epilog_stmt = gimple_build_assign (new_dest, expr);
4329 new_temp = make_ssa_name (new_dest, epilog_stmt);
4330 gimple_assign_set_lhs (epilog_stmt, new_temp);
4331 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4332 if (nested_in_vect_loop)
4333 {
4334 set_vinfo_for_stmt (epilog_stmt,
4335 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4336 NULL));
4337 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4338 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4339
4340 if (!double_reduc)
4341 scalar_results.quick_push (new_temp);
4342 else
4343 scalar_results[0] = new_temp;
4344 }
4345 else
4346 scalar_results[0] = new_temp;
4347
4348 new_phis[0] = epilog_stmt;
4349 }
4350
4351 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4352 phis with new adjusted scalar results, i.e., replace use <s_out0>
4353 with use <s_out4>.
4354
4355 Transform:
4356 loop_exit:
4357 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4358 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4359 v_out2 = reduce <v_out1>
4360 s_out3 = extract_field <v_out2, 0>
4361 s_out4 = adjust_result <s_out3>
4362 use <s_out0>
4363 use <s_out0>
4364
4365 into:
4366
4367 loop_exit:
4368 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4369 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4370 v_out2 = reduce <v_out1>
4371 s_out3 = extract_field <v_out2, 0>
4372 s_out4 = adjust_result <s_out3>
4373 use <s_out4>
4374 use <s_out4> */
4375
4376
4377 /* In an SLP reduction chain we reduce the vector results into one vector
4378 if necessary; hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
4379 of the last stmt in the reduction chain, since that is where we look for
4380 the loop-exit phi node. */
4381 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4382 {
4383 scalar_dest = gimple_assign_lhs (
4384 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4385 group_size = 1;
4386 }
4387
4388 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4389 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4390 need to match SCALAR_RESULTS with corresponding statements. The first
4391 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4392 the first vector stmt, etc.
4393 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
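/* For example, with GROUP_SIZE == 8 and two vector stmts, RATIO is 4:
   scalar_results[0..3] correspond to the first vector stmt and
   scalar_results[4..7] to the second.  */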
4394 if (group_size > new_phis.length ())
4395 {
4396 ratio = group_size / new_phis.length ();
4397 gcc_assert (!(group_size % new_phis.length ()));
4398 }
4399 else
4400 ratio = 1;
4401
4402 for (k = 0; k < group_size; k++)
4403 {
4404 if (k % ratio == 0)
4405 {
4406 epilog_stmt = new_phis[k / ratio];
4407 reduction_phi = reduction_phis[k / ratio];
4408 if (double_reduc)
4409 inner_phi = inner_phis[k / ratio];
4410 }
4411
4412 if (slp_reduc)
4413 {
4414 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4415
4416 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4417 /* SLP statements can't participate in patterns. */
4418 gcc_assert (!orig_stmt);
4419 scalar_dest = gimple_assign_lhs (current_stmt);
4420 }
4421
4422 phis.create (3);
4423 /* Find the loop-closed-use at the loop exit of the original scalar
4424 result. (The reduction result is expected to have two immediate uses -
4425 one at the latch block, and one at the loop exit). */
4426 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4427 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4428 && !is_gimple_debug (USE_STMT (use_p)))
4429 phis.safe_push (USE_STMT (use_p));
4430
4431 /* While we expect to have found an exit_phi because of loop-closed-ssa
4432 form, we can end up without one if the scalar cycle is dead. */
4433
4434 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4435 {
4436 if (outer_loop)
4437 {
4438 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4439 gimple vect_phi;
4440
4441 /* FORNOW. Not yet supporting the case in which an inner-loop
4442 reduction is used only outside the outer-loop (i.e. not in the
4443 outer-loop itself), unless it is a double reduction. */
4444 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4445 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4446 || double_reduc);
4447
4448 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4449 if (!double_reduc
4450 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4451 != vect_double_reduction_def)
4452 continue;
4453
4454 /* Handle double reduction:
4455
4456 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4457 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4458 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4459 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4460
4461 At that point the regular reduction (stmt2 and stmt3) is
4462 already vectorized, as well as the exit phi node, stmt4.
4463 Here we vectorize the phi node of double reduction, stmt1, and
4464 update all relevant statements. */
4465
4466 /* Go through all the uses of s2 to find double reduction phi
4467 node, i.e., stmt1 above. */
4468 orig_name = PHI_RESULT (exit_phi);
4469 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4470 {
4471 stmt_vec_info use_stmt_vinfo;
4472 stmt_vec_info new_phi_vinfo;
4473 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4474 basic_block bb = gimple_bb (use_stmt);
4475 gimple use;
4476
4477 /* Check that USE_STMT is really a double reduction phi
4478 node. */
4479 if (gimple_code (use_stmt) != GIMPLE_PHI
4480 || gimple_phi_num_args (use_stmt) != 2
4481 || bb->loop_father != outer_loop)
4482 continue;
4483 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4484 if (!use_stmt_vinfo
4485 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4486 != vect_double_reduction_def)
4487 continue;
4488
4489 /* Create vector phi node for double reduction:
4490 vs1 = phi <vs0, vs2>
4491 vs1 was created previously in this function by a call to
4492 vect_get_vec_def_for_operand and is stored in
4493 vec_initial_def;
4494 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4495 vs0 is created here. */
4496
4497 /* Create vector phi node. */
4498 vect_phi = create_phi_node (vec_initial_def, bb);
4499 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4500 loop_vec_info_for_loop (outer_loop), NULL);
4501 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4502
4503 /* Create vs0 - initial def of the double reduction phi. */
4504 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4505 loop_preheader_edge (outer_loop));
4506 init_def = get_initial_def_for_reduction (stmt,
4507 preheader_arg, NULL);
4508 vect_phi_init = vect_init_vector (use_stmt, init_def,
4509 vectype, NULL);
4510
4511 /* Update phi node arguments with vs0 and vs2. */
4512 add_phi_arg (vect_phi, vect_phi_init,
4513 loop_preheader_edge (outer_loop),
4514 UNKNOWN_LOCATION);
4515 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4516 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4517 if (dump_enabled_p ())
4518 {
4519 dump_printf_loc (MSG_NOTE, vect_location,
4520 "created double reduction phi node: ");
4521 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4522 dump_printf (MSG_NOTE, "\n");
4523 }
4524
4525 vect_phi_res = PHI_RESULT (vect_phi);
4526
4527 /* Replace the use, i.e., set the correct vs1 in the regular
4528 reduction phi node. FORNOW, NCOPIES is always 1, so the
4529 loop is redundant. */
4530 use = reduction_phi;
4531 for (j = 0; j < ncopies; j++)
4532 {
4533 edge pr_edge = loop_preheader_edge (loop);
4534 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4535 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4536 }
4537 }
4538 }
4539 }
4540
4541 phis.release ();
4542 if (nested_in_vect_loop)
4543 {
4544 if (double_reduc)
4545 loop = outer_loop;
4546 else
4547 continue;
4548 }
4549
4550 phis.create (3);
4551 /* Find the loop-closed-use at the loop exit of the original scalar
4552 result. (The reduction result is expected to have two immediate uses,
4553 one at the latch block, and one at the loop exit). For double
4554 reductions we are looking for exit phis of the outer loop. */
4555 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4556 {
4557 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4558 {
4559 if (!is_gimple_debug (USE_STMT (use_p)))
4560 phis.safe_push (USE_STMT (use_p));
4561 }
4562 else
4563 {
4564 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4565 {
4566 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4567
4568 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4569 {
4570 if (!flow_bb_inside_loop_p (loop,
4571 gimple_bb (USE_STMT (phi_use_p)))
4572 && !is_gimple_debug (USE_STMT (phi_use_p)))
4573 phis.safe_push (USE_STMT (phi_use_p));
4574 }
4575 }
4576 }
4577 }
4578
4579 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4580 {
4581 /* Replace the uses: */
4582 orig_name = PHI_RESULT (exit_phi);
4583 scalar_result = scalar_results[k];
4584 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4585 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4586 SET_USE (use_p, scalar_result);
4587 }
4588
4589 phis.release ();
4590 }
4591
4592 scalar_results.release ();
4593 inner_phis.release ();
4594 new_phis.release ();
4595 }
4596
4597
4598 /* Function vectorizable_reduction.
4599
4600 Check if STMT performs a reduction operation that can be vectorized.
4601 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4602 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4603 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4604
4605 This function also handles reduction idioms (patterns) that have been
4606 recognized in advance during vect_pattern_recog. In this case, STMT may be
4607 of this form:
4608 X = pattern_expr (arg0, arg1, ..., X)
4609 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4610 sequence that had been detected and replaced by the pattern-stmt (STMT).
4611
4612 In some cases of reduction patterns, the type of the reduction variable X is
4613 different from the type of the other arguments of STMT.
4614 In such cases, the vectype that is used when transforming STMT into a vector
4615 stmt is different from the vectype that is used to determine the
4616 vectorization factor, because it consists of a different number of elements
4617 than the actual number of elements that are being operated upon in parallel.
4618
4619 For example, consider an accumulation of shorts into an int accumulator.
4620 On some targets it's possible to vectorize this pattern operating on 8
4621 shorts at a time (hence, the vectype for purposes of determining the
4622 vectorization factor should be V8HI); on the other hand, the vectype that
4623 is used to create the vector form is actually V4SI (the type of the result).
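 
   A sketch of a source loop that leads to this situation (the array
   and accumulator names here are illustrative):

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];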
4624
4625 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4626 indicates the actual level of parallelism (V8HI in the example), so
4627 that the right vectorization factor is derived. This vectype
4628 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4629 be used to create the vectorized stmt. The right vectype for the vectorized
4630 stmt is obtained from the type of the result X:
4631 get_vectype_for_scalar_type (TREE_TYPE (X))
4632
4633 This means that, contrary to "regular" reductions (or "regular" stmts in
4634 general), the following equation:
4635 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4636 does *NOT* necessarily hold for reduction patterns. */
4637
4638 bool
4639 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4640 gimple *vec_stmt, slp_tree slp_node)
4641 {
4642 tree vec_dest;
4643 tree scalar_dest;
4644 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4646 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4647 tree vectype_in = NULL_TREE;
4648 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4649 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4650 enum tree_code code, orig_code, epilog_reduc_code;
4651 enum machine_mode vec_mode;
4652 int op_type;
4653 optab optab, reduc_optab;
4654 tree new_temp = NULL_TREE;
4655 tree def;
4656 gimple def_stmt;
4657 enum vect_def_type dt;
4658 gimple new_phi = NULL;
4659 tree scalar_type;
4660 bool is_simple_use;
4661 gimple orig_stmt;
4662 stmt_vec_info orig_stmt_info;
4663 tree expr = NULL_TREE;
4664 int i;
4665 int ncopies;
4666 int epilog_copies;
4667 stmt_vec_info prev_stmt_info, prev_phi_info;
4668 bool single_defuse_cycle = false;
4669 tree reduc_def = NULL_TREE;
4670 gimple new_stmt = NULL;
4671 int j;
4672 tree ops[3];
4673 bool nested_cycle = false, found_nested_cycle_def = false;
4674 gimple reduc_def_stmt = NULL;
4675 /* By default the reduction variable is the last operand of the statement. */
4676 int reduc_index = 2;
4677 bool double_reduc = false, dummy;
4678 basic_block def_bb;
4679 struct loop * def_stmt_loop, *outer_loop = NULL;
4680 tree def_arg;
4681 gimple def_arg_stmt;
4682 vec<tree> vec_oprnds0 = vNULL;
4683 vec<tree> vec_oprnds1 = vNULL;
4684 vec<tree> vect_defs = vNULL;
4685 vec<gimple> phis = vNULL;
4686 int vec_num;
4687 tree def0, def1, tem, op0, op1 = NULL_TREE;
4688
4689 /* In case of a reduction chain we switch to the first stmt in the chain, but
4690 we don't update STMT_INFO, since only the last stmt is marked as a reduction
4691 and has reduction properties. */
4692 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4693 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4694
4695 if (nested_in_vect_loop_p (loop, stmt))
4696 {
4697 outer_loop = loop;
4698 loop = loop->inner;
4699 nested_cycle = true;
4700 }
4701
4702 /* 1. Is vectorizable reduction? */
4703 /* Not supportable if the reduction variable is used in the loop, unless
4704 it's a reduction chain. */
4705 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4706 && !GROUP_FIRST_ELEMENT (stmt_info))
4707 return false;
4708
4709 /* Reductions that are not used even in an enclosing outer-loop
4710 are expected to be "live" (used out of the loop). */
4711 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4712 && !STMT_VINFO_LIVE_P (stmt_info))
4713 return false;
4714
4715 /* Make sure it was already recognized as a reduction computation. */
4716 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4717 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4718 return false;
4719
4720 /* 2. Has this been recognized as a reduction pattern?
4721
4722 Check if STMT represents a pattern that has been recognized
4723 in earlier analysis stages. For stmts that represent a pattern,
4724 the STMT_VINFO_RELATED_STMT field records the last stmt in
4725 the original sequence that constitutes the pattern. */
4726
4727 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4728 if (orig_stmt)
4729 {
4730 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4731 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4732 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4733 }
4734
4735 /* 3. Check the operands of the operation. The first operands are defined
4736 inside the loop body. The last operand is the reduction variable,
4737 which is defined by the loop-header-phi. */
4738
4739 gcc_assert (is_gimple_assign (stmt));
4740
4741 /* Flatten RHS. */
4742 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4743 {
4744 case GIMPLE_SINGLE_RHS:
4745 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4746 if (op_type == ternary_op)
4747 {
4748 tree rhs = gimple_assign_rhs1 (stmt);
4749 ops[0] = TREE_OPERAND (rhs, 0);
4750 ops[1] = TREE_OPERAND (rhs, 1);
4751 ops[2] = TREE_OPERAND (rhs, 2);
4752 code = TREE_CODE (rhs);
4753 }
4754 else
4755 return false;
4756 break;
4757
4758 case GIMPLE_BINARY_RHS:
4759 code = gimple_assign_rhs_code (stmt);
4760 op_type = TREE_CODE_LENGTH (code);
4761 gcc_assert (op_type == binary_op);
4762 ops[0] = gimple_assign_rhs1 (stmt);
4763 ops[1] = gimple_assign_rhs2 (stmt);
4764 break;
4765
4766 case GIMPLE_TERNARY_RHS:
4767 code = gimple_assign_rhs_code (stmt);
4768 op_type = TREE_CODE_LENGTH (code);
4769 gcc_assert (op_type == ternary_op);
4770 ops[0] = gimple_assign_rhs1 (stmt);
4771 ops[1] = gimple_assign_rhs2 (stmt);
4772 ops[2] = gimple_assign_rhs3 (stmt);
4773 break;
4774
4775 case GIMPLE_UNARY_RHS:
4776 return false;
4777
4778 default:
4779 gcc_unreachable ();
4780 }
4781
4782 if (code == COND_EXPR && slp_node)
4783 return false;
4784
4785 scalar_dest = gimple_assign_lhs (stmt);
4786 scalar_type = TREE_TYPE (scalar_dest);
4787 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4788 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4789 return false;
4790
4791 /* Do not try to vectorize bit-precision reductions. */
4792 if ((TYPE_PRECISION (scalar_type)
4793 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4794 return false;
4795
4796 /* All uses but the last are expected to be defined in the loop.
4797 The last use is the reduction variable. In case of a nested cycle this
4798 assumption is not true: we use reduc_index to record the index of the
4799 reduction variable. */
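/* E.g., for s = DOT_PROD_EXPR <a, b, s> the reduction variable s is
   the last operand, so the default reduc_index of 2 applies.  */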
4800 for (i = 0; i < op_type - 1; i++)
4801 {
4802 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4803 if (i == 0 && code == COND_EXPR)
4804 continue;
4805
4806 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4807 &def_stmt, &def, &dt, &tem);
4808 if (!vectype_in)
4809 vectype_in = tem;
4810 gcc_assert (is_simple_use);
4811
4812 if (dt != vect_internal_def
4813 && dt != vect_external_def
4814 && dt != vect_constant_def
4815 && dt != vect_induction_def
4816 && !(dt == vect_nested_cycle && nested_cycle))
4817 return false;
4818
4819 if (dt == vect_nested_cycle)
4820 {
4821 found_nested_cycle_def = true;
4822 reduc_def_stmt = def_stmt;
4823 reduc_index = i;
4824 }
4825 }
4826
4827 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4828 &def_stmt, &def, &dt, &tem);
4829 if (!vectype_in)
4830 vectype_in = tem;
4831 gcc_assert (is_simple_use);
4832 if (!(dt == vect_reduction_def
4833 || dt == vect_nested_cycle
4834 || ((dt == vect_internal_def || dt == vect_external_def
4835 || dt == vect_constant_def || dt == vect_induction_def)
4836 && nested_cycle && found_nested_cycle_def)))
4837 {
4838 /* For pattern recognized stmts, orig_stmt might be a reduction,
4839 but some helper statements for the pattern might not, or
4840 might be COND_EXPRs with reduction uses in the condition. */
4841 gcc_assert (orig_stmt);
4842 return false;
4843 }
4844 if (!found_nested_cycle_def)
4845 reduc_def_stmt = def_stmt;
4846
4847 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4848 if (orig_stmt)
4849 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4850 reduc_def_stmt,
4851 !nested_cycle,
4852 &dummy));
4853 else
4854 {
4855 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4856 !nested_cycle, &dummy);
4857 /* We changed STMT to be the first stmt in the reduction chain, hence we
4858 check that in this case the first element in the chain is STMT. */
4859 gcc_assert (stmt == tmp
4860 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4861 }
4862
4863 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4864 return false;
4865
4866 if (slp_node || PURE_SLP_STMT (stmt_info))
4867 ncopies = 1;
4868 else
4869 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4870 / TYPE_VECTOR_SUBPARTS (vectype_in));
4871
4872 gcc_assert (ncopies >= 1);
4873
4874 vec_mode = TYPE_MODE (vectype_in);
4875
4876 if (code == COND_EXPR)
4877 {
4878 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4879 {
4880 if (dump_enabled_p ())
4881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4882 "unsupported condition in reduction\n");
4883
4884 return false;
4885 }
4886 }
4887 else
4888 {
4889 /* 4. Supportable by target? */
4890
4891 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4892 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4893 {
4894 /* Shifts and rotates are only supported by vectorizable_shift,
4895 not vectorizable_reduction. */
4896 if (dump_enabled_p ())
4897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4898 "unsupported shift or rotation.\n");
4899 return false;
4900 }
4901
4902 /* 4.1. check support for the operation in the loop */
4903 optab = optab_for_tree_code (code, vectype_in, optab_default);
4904 if (!optab)
4905 {
4906 if (dump_enabled_p ())
4907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4908 "no optab.\n");
4909
4910 return false;
4911 }
4912
4913 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4914 {
4915 if (dump_enabled_p ())
4916 dump_printf (MSG_NOTE, "op not supported by target.\n");
4917
4918 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4919 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4920 < vect_min_worthwhile_factor (code))
4921 return false;
4922
4923 if (dump_enabled_p ())
4924 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4925 }
4926
4927 /* Worthwhile without SIMD support? */
4928 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4929 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4930 < vect_min_worthwhile_factor (code))
4931 {
4932 if (dump_enabled_p ())
4933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4934 "not worthwhile without SIMD support.\n");
4935
4936 return false;
4937 }
4938 }
4939
4940 /* 4.2. Check support for the epilog operation.
4941
4942 If STMT represents a reduction pattern, then the type of the
4943 reduction variable may be different than the type of the rest
4944 of the arguments. For example, consider the case of accumulation
4945 of shorts into an int accumulator. The original code:
4946 S1: int_a = (int) short_a;
4947 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4948
4949 was replaced with:
4950 STMT: int_acc = widen_sum <short_a, int_acc>
4951
4952 This means that:
4953 1. The tree-code that is used to create the vector operation in the
4954 epilog code (that reduces the partial results) is not the
4955 tree-code of STMT, but is rather the tree-code of the original
4956 stmt from the pattern that STMT is replacing. I.e, in the example
4957 above we want to use 'widen_sum' in the loop, but 'plus' in the
4958 epilog.
4959 2. The type (mode) we use to check available target support
4960 for the vector operation to be created in the *epilog*, is
4961 determined by the type of the reduction variable (in the example
4962 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4963 However the type (mode) we use to check available target support
4964 for the vector operation to be created *inside the loop*, is
4965 determined by the type of the other arguments to STMT (in the
4966 example we'd check this: optab_handler (widen_sum_optab,
4967 vect_short_mode)).
4968
4969 This is contrary to "regular" reductions, in which the types of all
4970 the arguments are the same as the type of the reduction variable.
4971 For "regular" reductions we can therefore use the same vector type
4972 (and also the same tree-code) when generating the epilog code and
4973 when generating the code inside the loop. */
4974
4975 if (orig_stmt)
4976 {
4977 /* This is a reduction pattern: get the vectype from the type of the
4978 reduction variable, and get the tree-code from orig_stmt. */
4979 orig_code = gimple_assign_rhs_code (orig_stmt);
4980 gcc_assert (vectype_out);
4981 vec_mode = TYPE_MODE (vectype_out);
4982 }
4983 else
4984 {
4985 /* Regular reduction: the same vectype and tree-code that are used for
4986 the vector code inside the loop can also be used for the epilog code. */
4987 orig_code = code;
4988 }
4989
4990 if (nested_cycle)
4991 {
4992 def_bb = gimple_bb (reduc_def_stmt);
4993 def_stmt_loop = def_bb->loop_father;
4994 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4995 loop_preheader_edge (def_stmt_loop));
4996 if (TREE_CODE (def_arg) == SSA_NAME
4997 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4998 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4999 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5000 && vinfo_for_stmt (def_arg_stmt)
5001 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5002 == vect_double_reduction_def)
5003 double_reduc = true;
5004 }
5005
5006 epilog_reduc_code = ERROR_MARK;
5007 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5008 {
5009 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5010 optab_default);
5011 if (!reduc_optab)
5012 {
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015 "no optab for reduction.\n");
5016
5017 epilog_reduc_code = ERROR_MARK;
5018 }
5019
5020 if (reduc_optab
5021 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5022 {
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5025 "reduc op not supported by target.\n");
5026
5027 epilog_reduc_code = ERROR_MARK;
5028 }
5029 }
5030 else
5031 {
5032 if (!nested_cycle || double_reduc)
5033 {
5034 if (dump_enabled_p ())
5035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5036 "no reduc code for scalar code.\n");
5037
5038 return false;
5039 }
5040 }
5041
5042 if (double_reduc && ncopies > 1)
5043 {
5044 if (dump_enabled_p ())
5045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5046 "multiple types in double reduction\n");
5047
5048 return false;
5049 }
5050
5051 /* In case of widening multiplication by a constant, we update the type
5052 of the constant to be the type of the other operand. We check that the
5053 constant fits the type in the pattern recognition pass. */
5054 if (code == DOT_PROD_EXPR
5055 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5056 {
5057 if (TREE_CODE (ops[0]) == INTEGER_CST)
5058 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5059 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5060 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5061 else
5062 {
5063 if (dump_enabled_p ())
5064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5065 "invalid types in dot-prod\n");
5066
5067 return false;
5068 }
5069 }
5070
5071 if (!vec_stmt) /* transformation not required. */
5072 {
5073 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5074 return false;
5075 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5076 return true;
5077 }
5078
5079 /** Transform. **/
5080
5081 if (dump_enabled_p ())
5082 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5083
5084 /* FORNOW: Multiple types are not supported for condition. */
5085 if (code == COND_EXPR)
5086 gcc_assert (ncopies == 1);
5087
5088 /* Create the destination vector */
5089 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5090
5091 /* In case the vectorization factor (VF) is bigger than the number
5092 of elements that we can fit in a vectype (nunits), we have to generate
5093 more than one vector stmt - i.e - we need to "unroll" the
5094 vector stmt by a factor VF/nunits. For more details see documentation
5095 in vectorizable_operation. */
5096
5097 /* If the reduction is used in an outer loop we need to generate
5098 VF intermediate results, like so (e.g. for ncopies=2):
5099 r0 = phi (init, r0)
5100 r1 = phi (init, r1)
5101 r0 = x0 + r0;
5102 r1 = x1 + r1;
5103 (i.e. we generate VF results in 2 registers).
5104 In this case we have a separate def-use cycle for each copy, and therefore
5105 for each copy we get the vector def for the reduction variable from the
5106 respective phi node created for this copy.
5107
5108 Otherwise (the reduction is unused in the loop nest), we can combine
5109 together intermediate results, like so (e.g. for ncopies=2):
5110 r = phi (init, r)
5111 r = x0 + r;
5112 r = x1 + r;
5113 (i.e. we generate VF/2 results in a single register).
5114 In this case for each copy we get the vector def for the reduction variable
5115 from the vectorized reduction operation generated in the previous iteration.
5116 */
5117
5118 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5119 {
5120 single_defuse_cycle = true;
5121 epilog_copies = 1;
5122 }
5123 else
5124 epilog_copies = ncopies;
5125
5126 prev_stmt_info = NULL;
5127 prev_phi_info = NULL;
5128 if (slp_node)
5129 {
5130 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5131 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5132 == TYPE_VECTOR_SUBPARTS (vectype_in));
5133 }
5134 else
5135 {
5136 vec_num = 1;
5137 vec_oprnds0.create (1);
5138 if (op_type == ternary_op)
5139 vec_oprnds1.create (1);
5140 }
5141
5142 phis.create (vec_num);
5143 vect_defs.create (vec_num);
5144 if (!slp_node)
5145 vect_defs.quick_push (NULL_TREE);
5146
5147 for (j = 0; j < ncopies; j++)
5148 {
5149 if (j == 0 || !single_defuse_cycle)
5150 {
5151 for (i = 0; i < vec_num; i++)
5152 {
5153 /* Create the reduction-phi that defines the reduction
5154 operand. */
5155 new_phi = create_phi_node (vec_dest, loop->header);
5156 set_vinfo_for_stmt (new_phi,
5157 new_stmt_vec_info (new_phi, loop_vinfo,
5158 NULL));
5159 if (j == 0 || slp_node)
5160 phis.quick_push (new_phi);
5161 }
5162 }
5163
5164 if (code == COND_EXPR)
5165 {
5166 gcc_assert (!slp_node);
5167 vectorizable_condition (stmt, gsi, vec_stmt,
5168 PHI_RESULT (phis[0]),
5169 reduc_index, NULL);
5170 /* Multiple types are not supported for condition. */
5171 break;
5172 }
5173
5174 /* Handle uses. */
5175 if (j == 0)
5176 {
5177 op0 = ops[!reduc_index];
5178 if (op_type == ternary_op)
5179 {
5180 if (reduc_index == 0)
5181 op1 = ops[2];
5182 else
5183 op1 = ops[1];
5184 }
5185
5186 if (slp_node)
5187 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5188 slp_node, -1);
5189 else
5190 {
5191 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5192 stmt, NULL);
5193 vec_oprnds0.quick_push (loop_vec_def0);
5194 if (op_type == ternary_op)
5195 {
5196 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5197 NULL);
5198 vec_oprnds1.quick_push (loop_vec_def1);
5199 }
5200 }
5201 }
5202 else
5203 {
5204 if (!slp_node)
5205 {
5206 enum vect_def_type dt;
5207 gimple dummy_stmt;
5208 tree dummy;
5209
5210 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5211 &dummy_stmt, &dummy, &dt);
5212 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5213 loop_vec_def0);
5214 vec_oprnds0[0] = loop_vec_def0;
5215 if (op_type == ternary_op)
5216 {
5217 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5218 &dummy, &dt);
5219 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5220 loop_vec_def1);
5221 vec_oprnds1[0] = loop_vec_def1;
5222 }
5223 }
5224
5225 if (single_defuse_cycle)
5226 reduc_def = gimple_assign_lhs (new_stmt);
5227
5228 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5229 }
5230
5231 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5232 {
5233 if (slp_node)
5234 reduc_def = PHI_RESULT (phis[i]);
5235 else
5236 {
5237 if (!single_defuse_cycle || j == 0)
5238 reduc_def = PHI_RESULT (new_phi);
5239 }
5240
5241 def1 = ((op_type == ternary_op)
5242 ? vec_oprnds1[i] : NULL);
5243 if (op_type == binary_op)
5244 {
5245 if (reduc_index == 0)
5246 expr = build2 (code, vectype_out, reduc_def, def0);
5247 else
5248 expr = build2 (code, vectype_out, def0, reduc_def);
5249 }
5250 else
5251 {
5252 if (reduc_index == 0)
5253 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5254 else
5255 {
5256 if (reduc_index == 1)
5257 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5258 else
5259 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5260 }
5261 }
5262
5263 new_stmt = gimple_build_assign (vec_dest, expr);
5264 new_temp = make_ssa_name (vec_dest, new_stmt);
5265 gimple_assign_set_lhs (new_stmt, new_temp);
5266 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5267
5268 if (slp_node)
5269 {
5270 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5271 vect_defs.quick_push (new_temp);
5272 }
5273 else
5274 vect_defs[0] = new_temp;
5275 }
5276
5277 if (slp_node)
5278 continue;
5279
5280 if (j == 0)
5281 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5282 else
5283 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5284
5285 prev_stmt_info = vinfo_for_stmt (new_stmt);
5286 prev_phi_info = vinfo_for_stmt (new_phi);
5287 }
5288
5289 /* Finalize the reduction-phi (set its arguments) and create the
5290 epilog reduction code. */
5291 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5292 {
5293 new_temp = gimple_assign_lhs (*vec_stmt);
5294 vect_defs[0] = new_temp;
5295 }
5296
5297 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5298 epilog_reduc_code, phis, reduc_index,
5299 double_reduc, slp_node);
5300
5301 phis.release ();
5302 vect_defs.release ();
5303 vec_oprnds0.release ();
5304 vec_oprnds1.release ();
5305
5306 return true;
5307 }
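
/* As an end-to-end illustration (a sketch, assuming a 4-lane vector
   type), vectorizable_reduction transforms

     for (i = 0; i < n; i++)
       sum += a[i];

   into a loop that keeps four partial sums in one vector register,

     vsum = { 0, 0, 0, 0 };
     for (i = 0; i < n; i += 4)
       vsum += va[i / 4];

   and vect_create_epilog_for_reduction then emits code after the loop
   that reduces the four lanes of vsum into the final scalar sum.  */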
5308
5309 /* Function vect_min_worthwhile_factor.
5310
5311 For a loop where we could vectorize the operation indicated by CODE,
5312 return the minimum vectorization factor that makes it worthwhile
5313 to use generic vectors. */
5314 int
5315 vect_min_worthwhile_factor (enum tree_code code)
5316 {
5317 switch (code)
5318 {
5319 case PLUS_EXPR:
5320 case MINUS_EXPR:
5321 case NEGATE_EXPR:
5322 return 4;
5323
5324 case BIT_AND_EXPR:
5325 case BIT_IOR_EXPR:
5326 case BIT_XOR_EXPR:
5327 case BIT_NOT_EXPR:
5328 return 2;
5329
5330 default:
5331 return INT_MAX;
5332 }
5333 }
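
/* For example, with generic (emulated) vectors a 2-lane addition is
   unlikely to beat two scalar additions once packing and unpacking
   costs are paid, so a caller is expected to guard on this threshold
   along the lines of (a sketch; VF is the loop's vectorization
   factor):

     if (vf < vect_min_worthwhile_factor (code))
       return false;

   which is why PLUS_EXPR above requires at least 4 lanes.  */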
5334
5335
5336 /* Function vectorizable_induction
5337
5338 Check if PHI performs an induction computation that can be vectorized.
5339 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5340 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5341 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5342
5343 bool
5344 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5345 gimple *vec_stmt)
5346 {
5347 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5348 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5349 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5350 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5351 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5352 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5353 tree vec_def;
5354
5355 gcc_assert (ncopies >= 1);
5356 /* FORNOW. These restrictions should be relaxed. */
5357 if (nested_in_vect_loop_p (loop, phi))
5358 {
5359 imm_use_iterator imm_iter;
5360 use_operand_p use_p;
5361 gimple exit_phi;
5362 edge latch_e;
5363 tree loop_arg;
5364
5365 if (ncopies > 1)
5366 {
5367 if (dump_enabled_p ())
5368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5369 "multiple types in nested loop.\n");
5370 return false;
5371 }
5372
5373 exit_phi = NULL;
5374 latch_e = loop_latch_edge (loop->inner);
5375 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5376 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5377 {
5378 if (!flow_bb_inside_loop_p (loop->inner,
5379 gimple_bb (USE_STMT (use_p))))
5380 {
5381 exit_phi = USE_STMT (use_p);
5382 break;
5383 }
5384 }
5385 if (exit_phi)
5386 {
5387 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5388 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5389 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5390 {
5391 if (dump_enabled_p ())
5392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5393 "inner-loop induction only used outside "
5394 "of the outer vectorized loop.\n");
5395 return false;
5396 }
5397 }
5398 }
5399
5400 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5401 return false;
5402
5403 /* FORNOW: SLP not supported. */
5404 if (STMT_SLP_TYPE (stmt_info))
5405 return false;
5406
5407 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5408
5409 if (gimple_code (phi) != GIMPLE_PHI)
5410 return false;
5411
5412 if (!vec_stmt) /* transformation not required. */
5413 {
5414 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5415 if (dump_enabled_p ())
5416 dump_printf_loc (MSG_NOTE, vect_location,
5417 "=== vectorizable_induction ===\n");
5418 vect_model_induction_cost (stmt_info, ncopies);
5419 return true;
5420 }
5421
5422 /** Transform. **/
5423
5424 if (dump_enabled_p ())
5425 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5426
5427 vec_def = get_initial_def_for_induction (phi);
5428 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5429 return true;
5430 }
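
/* As an illustration (a sketch, assuming a 4-lane vector type), for
   the induction

     for (i = 0; i < n; i++)
       a[i] = i;

   get_initial_def_for_induction builds a vector induction of the form

     vec_iv = { 0, 1, 2, 3 };
     loop:
       ... use of vec_iv ...
       vec_iv += { 4, 4, 4, 4 };

   and the phi defining vec_iv becomes the vectorized stmt.  */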
5431
5432 /* Function vectorizable_live_operation.
5433
5434 STMT computes a value that is used outside the loop. Check if
5435 it can be supported. */
5436
5437 bool
5438 vectorizable_live_operation (gimple stmt,
5439 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5440 gimple *vec_stmt)
5441 {
5442 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5443 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5444 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5445 int i;
5446 int op_type;
5447 tree op;
5448 tree def;
5449 gimple def_stmt;
5450 enum vect_def_type dt;
5451 enum tree_code code;
5452 enum gimple_rhs_class rhs_class;
5453
5454 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5455
5456 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5457 return false;
5458
5459 if (!is_gimple_assign (stmt))
5460 {
5461 if (gimple_call_internal_p (stmt)
5462 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5463 && gimple_call_lhs (stmt)
5464 && loop->simduid
5465 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5466 && loop->simduid
5467 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5468 {
5469 edge e = single_exit (loop);
5470 basic_block merge_bb = e->dest;
5471 imm_use_iterator imm_iter;
5472 use_operand_p use_p;
5473 tree lhs = gimple_call_lhs (stmt);
5474
5475 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5476 {
5477 gimple use_stmt = USE_STMT (use_p);
5478 if (gimple_code (use_stmt) == GIMPLE_PHI
5479 || gimple_bb (use_stmt) == merge_bb)
5480 {
5481 if (vec_stmt)
5482 {
5483 tree vfm1
5484 = build_int_cst (unsigned_type_node,
5485 loop_vinfo->vectorization_factor - 1);
5486 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5487 }
5488 return true;
5489 }
5490 }
5491 }
5492
5493 return false;
5494 }
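  /* The case above handles a GOMP_SIMD_LANE lane number that survives
     the loop: after vectorization the last scalar iteration executes
     in lane VF - 1, so (a sketch, with VF == 4) an out-of-loop phi use
     such as

       # lane_2 = PHI <lane_1 (loop exit edge)>

     is rewritten to

       # lane_2 = PHI <3 (loop exit edge)>  */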
5495
5496 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5497 return false;
5498
5499 /* FORNOW. CHECKME. */
5500 if (nested_in_vect_loop_p (loop, stmt))
5501 return false;
5502
5503 code = gimple_assign_rhs_code (stmt);
5504 op_type = TREE_CODE_LENGTH (code);
5505 rhs_class = get_gimple_rhs_class (code);
5506 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5507 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5508
5509 /* FORNOW: support only if all uses are invariant. This means
5510 that the scalar operations can remain in place, unvectorized.
5511 The original last scalar value that they compute will be used. */
5512
5513 for (i = 0; i < op_type; i++)
5514 {
5515 if (rhs_class == GIMPLE_SINGLE_RHS)
5516 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5517 else
5518 op = gimple_op (stmt, i + 1);
5519 if (op
5520 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5521 &dt))
5522 {
5523 if (dump_enabled_p ())
5524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5525 "use not simple.\n");
5526 return false;
5527 }
5528
5529 if (dt != vect_external_def && dt != vect_constant_def)
5530 return false;
5531 }
5532
5533 /* No transformation is required for the cases we currently support. */
5534 return true;
5535 }
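
/* For example (an illustrative sketch), in

     for (i = 0; i < n; i++)
       last = x + y;          <-- x and y defined before the loop
     ... last is used after the loop ...

   the stmt computing LAST is live but, since both operands are
   loop-invariant, it can be left in place unvectorized and its final
   scalar value used, which is what the checks above require.  */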
5536
5537 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5538
5539 static void
5540 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5541 {
5542 ssa_op_iter op_iter;
5543 imm_use_iterator imm_iter;
5544 def_operand_p def_p;
5545 gimple ustmt;
5546
5547 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5548 {
5549 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5550 {
5551 basic_block bb;
5552
5553 if (!is_gimple_debug (ustmt))
5554 continue;
5555
5556 bb = gimple_bb (ustmt);
5557
5558 if (!flow_bb_inside_loop_p (loop, bb))
5559 {
5560 if (gimple_debug_bind_p (ustmt))
5561 {
5562 if (dump_enabled_p ())
5563 dump_printf_loc (MSG_NOTE, vect_location,
5564 "killing debug use\n");
5565
5566 gimple_debug_bind_reset_value (ustmt);
5567 update_stmt (ustmt);
5568 }
5569 else
5570 gcc_unreachable ();
5571 }
5572 }
5573 }
5574 }
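
/* For instance (a sketch), given

     loop:
       x_1 = a[i_2] + 1;
     after the loop:
       # DEBUG d => x_1

   the scalar def x_1 may not survive vectorization, so the debug bind
   outside the loop is reset rather than left referring to a dead
   value.  */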
5575
5576 /* Function vect_transform_loop.
5577
5578 The analysis phase has determined that the loop is vectorizable.
5579    Vectorize the loop: create vectorized stmts to replace the scalar
5580 stmts in the loop, and update the loop exit condition. */
5581
5582 void
5583 vect_transform_loop (loop_vec_info loop_vinfo)
5584 {
5585 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5586 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5587 int nbbs = loop->num_nodes;
5588 gimple_stmt_iterator si;
5589 int i;
5590 tree ratio = NULL;
5591 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5592 bool grouped_store;
5593 bool slp_scheduled = false;
5594 unsigned int nunits;
5595 gimple stmt, pattern_stmt;
5596 gimple_seq pattern_def_seq = NULL;
5597 gimple_stmt_iterator pattern_def_si = gsi_none ();
5598 bool transform_pattern_stmt = false;
5599 bool check_profitability = false;
5600 int th;
5601 /* Record number of iterations before we started tampering with the profile. */
5602 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5603
5604 if (dump_enabled_p ())
5605     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===\n");
5606
5607   /* If the profile is imprecise, we have a chance to fix it up.  */
5608 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5609 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5610
5611 /* Use the more conservative vectorization threshold. If the number
5612 of iterations is constant assume the cost check has been performed
5613 by our caller. If the threshold makes all loops profitable that
5614      run at least the vectorization factor number of times, checking
5615 is pointless, too. */
5616 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5617 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5618 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5619 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5620 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5621 {
5622 if (dump_enabled_p ())
5623 dump_printf_loc (MSG_NOTE, vect_location,
5624 "Profitability threshold is %d loop iterations.\n",
5625 th);
5626 check_profitability = true;
5627 }
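  /* For example (illustrative numbers): with PARAM_MIN_VECT_LOOP_BOUND
     == 2 and a vectorization factor of 4, th = 2 * 4 - 1 = 7, which is
     >= VF - 1, so for a loop whose iteration count is unknown at
     compile time a runtime check against this threshold is emitted.  */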
5628
5629 /* Version the loop first, if required, so the profitability check
5630 comes first. */
5631
5632 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5633 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5634 {
5635 vect_loop_versioning (loop_vinfo, th, check_profitability);
5636 check_profitability = false;
5637 }
5638
5639 /* Peel the loop if there are data refs with unknown alignment.
5640      Only one store data ref with unknown alignment is allowed.  */
5641
5642 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5643 {
5644 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5645 check_profitability = false;
5646 }
5647
5648 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5649      compile time constant), or it is a constant that is not divisible by the
5650 vectorization factor, then an epilog loop needs to be created.
5651 We therefore duplicate the loop: the original loop will be vectorized,
5652 and will compute the first (n/VF) iterations. The second copy of the loop
5653 will remain scalar and will compute the remaining (n%VF) iterations.
5654 (VF is the vectorization factor). */
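
  /* For example (an illustrative sketch with VF == 4):

       for (i = 0; i < (n / 4) * 4; i += 4)   <-- vectorized loop
         va[i / 4] = vb[i / 4] + vc[i / 4];
       for (; i < n; i++)                     <-- scalar epilog loop
         a[i] = b[i] + c[i];
   */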
5655
5656 if ((int) tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
5657 < exact_log2 (vectorization_factor)
5658 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5659 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5660 th, check_profitability);
5661 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5662 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5663 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5664 else
5665 {
5666 tree ni_name, ratio_mult_vf;
5667 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, &ratio_mult_vf,
5668 &ratio, NULL);
5669 }
5670
5671 /* 1) Make sure the loop header has exactly two entries
5672 2) Make sure we have a preheader basic block. */
5673
5674 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5675
5676 split_edge (loop_preheader_edge (loop));
5677
5678   /* FORNOW: the vectorizer supports only loops whose body consists
5679      of one basic block (header + empty latch).  When the vectorizer
5680      supports more involved loop forms, the order in which the BBs are
5681      traversed will need to be reconsidered.  */
5682
5683 for (i = 0; i < nbbs; i++)
5684 {
5685 basic_block bb = bbs[i];
5686 stmt_vec_info stmt_info;
5687 gimple phi;
5688
5689 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5690 {
5691 phi = gsi_stmt (si);
5692 if (dump_enabled_p ())
5693 {
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 "------>vectorizing phi: ");
5696 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5697 dump_printf (MSG_NOTE, "\n");
5698 }
5699 stmt_info = vinfo_for_stmt (phi);
5700 if (!stmt_info)
5701 continue;
5702
5703 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5704 vect_loop_kill_debug_uses (loop, phi);
5705
5706 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5707 && !STMT_VINFO_LIVE_P (stmt_info))
5708 continue;
5709
5710 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5711 != (unsigned HOST_WIDE_INT) vectorization_factor)
5712 && dump_enabled_p ())
5713 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5714
5715 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5716 {
5717 if (dump_enabled_p ())
5718 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5719 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5720 }
5721 }
5722
5723 pattern_stmt = NULL;
5724 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5725 {
5726 bool is_store;
5727
5728 if (transform_pattern_stmt)
5729 stmt = pattern_stmt;
5730 else
5731 {
5732 stmt = gsi_stmt (si);
5733 /* During vectorization remove existing clobber stmts. */
5734 if (gimple_clobber_p (stmt))
5735 {
5736 unlink_stmt_vdef (stmt);
5737 gsi_remove (&si, true);
5738 release_defs (stmt);
5739 continue;
5740 }
5741 }
5742
5743 if (dump_enabled_p ())
5744 {
5745 dump_printf_loc (MSG_NOTE, vect_location,
5746 "------>vectorizing statement: ");
5747 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5748 dump_printf (MSG_NOTE, "\n");
5749 }
5750
5751 stmt_info = vinfo_for_stmt (stmt);
5752
5753 /* vector stmts created in the outer-loop during vectorization of
5754 stmts in an inner-loop may not have a stmt_info, and do not
5755 need to be vectorized. */
5756 if (!stmt_info)
5757 {
5758 gsi_next (&si);
5759 continue;
5760 }
5761
5762 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5763 vect_loop_kill_debug_uses (loop, stmt);
5764
5765 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5766 && !STMT_VINFO_LIVE_P (stmt_info))
5767 {
5768 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5769 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5770 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5771 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5772 {
5773 stmt = pattern_stmt;
5774 stmt_info = vinfo_for_stmt (stmt);
5775 }
5776 else
5777 {
5778 gsi_next (&si);
5779 continue;
5780 }
5781 }
5782 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5783 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5784 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5785 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5786 transform_pattern_stmt = true;
5787
5788 /* If pattern statement has def stmts, vectorize them too. */
5789 if (is_pattern_stmt_p (stmt_info))
5790 {
5791 if (pattern_def_seq == NULL)
5792 {
5793 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5794 pattern_def_si = gsi_start (pattern_def_seq);
5795 }
5796 else if (!gsi_end_p (pattern_def_si))
5797 gsi_next (&pattern_def_si);
5798 if (pattern_def_seq != NULL)
5799 {
5800 gimple pattern_def_stmt = NULL;
5801 stmt_vec_info pattern_def_stmt_info = NULL;
5802
5803 while (!gsi_end_p (pattern_def_si))
5804 {
5805 pattern_def_stmt = gsi_stmt (pattern_def_si);
5806 pattern_def_stmt_info
5807 = vinfo_for_stmt (pattern_def_stmt);
5808 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5809 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5810 break;
5811 gsi_next (&pattern_def_si);
5812 }
5813
5814 if (!gsi_end_p (pattern_def_si))
5815 {
5816 if (dump_enabled_p ())
5817 {
5818 dump_printf_loc (MSG_NOTE, vect_location,
5819 "==> vectorizing pattern def "
5820 "stmt: ");
5821 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5822 pattern_def_stmt, 0);
5823 dump_printf (MSG_NOTE, "\n");
5824 }
5825
5826 stmt = pattern_def_stmt;
5827 stmt_info = pattern_def_stmt_info;
5828 }
5829 else
5830 {
5831 pattern_def_si = gsi_none ();
5832 transform_pattern_stmt = false;
5833 }
5834 }
5835 else
5836 transform_pattern_stmt = false;
5837 }
5838
5839 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5840 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5841 STMT_VINFO_VECTYPE (stmt_info));
5842 if (!STMT_SLP_TYPE (stmt_info)
5843 && nunits != (unsigned int) vectorization_factor
5844 && dump_enabled_p ())
5845        /* For SLP, VF is set according to the unrolling factor, not the
5846           vector size; hence this print is not valid for SLP.  */
5847 dump_printf_loc (MSG_NOTE, vect_location,
5848 "multiple-types.\n");
5849
5850 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5851 reached. */
5852 if (STMT_SLP_TYPE (stmt_info))
5853 {
5854 if (!slp_scheduled)
5855 {
5856 slp_scheduled = true;
5857
5858 if (dump_enabled_p ())
5859 dump_printf_loc (MSG_NOTE, vect_location,
5860 "=== scheduling SLP instances ===\n");
5861
5862 vect_schedule_slp (loop_vinfo, NULL);
5863 }
5864
5865 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5866 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5867 {
5868 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5869 {
5870 pattern_def_seq = NULL;
5871 gsi_next (&si);
5872 }
5873 continue;
5874 }
5875 }
5876
5877 /* -------- vectorize statement ------------ */
5878 if (dump_enabled_p ())
5879 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
5880
5881 grouped_store = false;
5882 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5883 if (is_store)
5884 {
5885 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5886 {
5887 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5888                  interleaving chain has been completed; free all the stores in
5889 the chain. */
5890 gsi_next (&si);
5891 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5892 continue;
5893 }
5894 else
5895 {
5896 /* Free the attached stmt_vec_info and remove the stmt. */
5897 gimple store = gsi_stmt (si);
5898 free_stmt_vec_info (store);
5899 unlink_stmt_vdef (store);
5900 gsi_remove (&si, true);
5901 release_defs (store);
5902 continue;
5903 }
5904 }
5905
5906 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5907 {
5908 pattern_def_seq = NULL;
5909 gsi_next (&si);
5910 }
5911 } /* stmts in BB */
5912 } /* BBs in loop */
5913
5914 slpeel_make_loop_iterate_ntimes (loop, ratio);
5915
5916 /* Reduce loop iterations by the vectorization factor. */
5917 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5918 expected_iterations / vectorization_factor);
5919 loop->nb_iterations_upper_bound
5920 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5921 FLOOR_DIV_EXPR);
5922 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5923 && loop->nb_iterations_upper_bound != double_int_zero)
5924 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5925 if (loop->any_estimate)
5926 {
5927 loop->nb_iterations_estimate
5928 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5929 FLOOR_DIV_EXPR);
5930 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5931 && loop->nb_iterations_estimate != double_int_zero)
5932 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5933 }
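  /* For example (illustrative numbers): an upper bound of 1003
     iterations with a vectorization factor of 4 becomes
     1003 / 4 = 250 (FLOOR_DIV_EXPR) vector iterations, and one more is
     subtracted when peeling for gaps, since the last vector iteration
     is then handled by the scalar epilog.  */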
5934
5935 if (dump_enabled_p ())
5936 {
5937 dump_printf_loc (MSG_NOTE, vect_location,
5938 "LOOP VECTORIZED\n");
5939 if (loop->inner)
5940 dump_printf_loc (MSG_NOTE, vect_location,
5941 "OUTER LOOP VECTORIZED\n");
5942 dump_printf (MSG_NOTE, "\n");
5943 }
5944 }