gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "predict.h"
  30 #include "vec.h"
  31 #include "hashtab.h"
  32 #include "hash-set.h"
  33 #include "machmode.h"
  34 #include "hard-reg-set.h"
  35 #include "input.h"
  36 #include "function.h"
  37 #include "dominance.h"
  38 #include "cfg.h"
  39 #include "cfganal.h"
  40 #include "basic-block.h"
  41 #include "gimple-pretty-print.h"
  42 #include "tree-ssa-alias.h"
  43 #include "internal-fn.h"
  44 #include "gimple-expr.h"
  45 #include "is-a.h"
  46 #include "gimple.h"
  47 #include "gimplify.h"
  48 #include "gimple-iterator.h"
  49 #include "gimplify-me.h"
  50 #include "gimple-ssa.h"
  51 #include "tree-phinodes.h"
  52 #include "ssa-iterators.h"
  53 #include "stringpool.h"
  54 #include "tree-ssanames.h"
  55 #include "tree-ssa-loop-ivopts.h"
  56 #include "tree-ssa-loop-manip.h"
  57 #include "tree-ssa-loop-niter.h"
  58 #include "tree-pass.h"
  59 #include "cfgloop.h"
  60 #include "expr.h"
  61 #include "recog.h"
  62 #include "optabs.h"
  63 #include "params.h"
  64 #include "diagnostic-core.h"
  65 #include "tree-chrec.h"
  66 #include "tree-scalar-evolution.h"
  67 #include "tree-vectorizer.h"
  68 #include "target.h"
  69
  70 /* Loop Vectorization Pass.
  71
  72    This pass tries to vectorize loops.
  73
  74    For example, the vectorizer transforms the following simple loop:
  75
  76         short a[N]; short b[N]; short c[N]; int i;
  77
  78         for (i=0; i<N; i++){
  79           a[i] = b[i] + c[i];
  80         }
  81
  82    as if it was manually vectorized by rewriting the source code into:
  83
  84         typedef int __attribute__((mode(V8HI))) v8hi;
  85         short a[N];  short b[N]; short c[N];   int i;
  86         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  87         v8hi va, vb, vc;
  88
  89         for (i=0; i<N/8; i++){
  90           vb = pb[i];
  91           vc = pc[i];
  92           va = vb + vc;
  93           pa[i] = va;
  94         }
  95
  96         The main entry to this pass is vectorize_loops(), in which
  97    the vectorizer applies a set of analyses on a given set of loops,
  98    followed by the actual vectorization transformation for the loops that
  99    had successfully passed the analysis phase.
 100         Throughout this pass we make a distinction between two types of
 101    data: scalars (which are represented by SSA_NAMES), and memory references
 102    ("data-refs").  These two types of data require different handling both
 103    during analysis and transformation. The types of data-refs that the
 104    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
 105    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 106    accesses are required to have a simple (consecutive) access pattern.
 107
 108    Analysis phase:
 109    ===============
 110         The driver for the analysis phase is vect_analyze_loop().
 111    It applies a set of analyses, some of which rely on the scalar evolution
 112    analyzer (scev) developed by Sebastian Pop.
 113
 114         During the analysis phase the vectorizer records some information
 115    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 116    loop, as well as general information about the loop as a whole, which is
 117    recorded in a "loop_vec_info" struct attached to each loop.
 118
 119    Transformation phase:
 120    =====================
 121         The loop transformation phase scans all the stmts in the loop, and
 122    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 123    the loop that needs to be vectorized.  It inserts the vector code sequence
 124    just before the scalar stmt S, and records a pointer to the vector code
 125    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 126    attached to S).  This pointer will be used for the vectorization of following
 127    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 128    otherwise, we rely on dead code elimination for removing it.
 129
 130         For example, say stmt S1 was vectorized into stmt VS1:
 131
 132    VS1: vb = px[i];
 133    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 134    S2:  a = b;
 135
 136    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 137    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 138    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 139    resulting sequence would be:
 140
 141    VS1: vb = px[i];
 142    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 143    VS2: va = vb;
 144    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 145
 146         Operands that are not SSA_NAMEs, are data-refs that appear in
 147    load/store operations (like 'x[i]' in S1), and are handled differently.
 148
 149    Target modeling:
 150    =================
 151         Currently the only target specific information that is used is the
 152    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 153    Targets that can support different sizes of vectors, for now will need
 154    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 155    flexibility will be added in the future.
 156
 157         Since we only vectorize operations whose vector form can be
 158    expressed using existing tree codes, to verify that an operation is
 159    supported, the vectorizer checks the relevant optab at the relevant
 160    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 161    the value found is CODE_FOR_nothing, then there's no target support, and
 162    we can't vectorize the stmt.
 163
 164    For additional information on this project see:
 165    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 166 */
 167
 168 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 169
 170 /* Function vect_determine_vectorization_factor
 171
 172    Determine the vectorization factor (VF).  VF is the number of data elements
 173    that are operated upon in parallel in a single iteration of the vectorized
 174    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 175    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 176    elements can fit in a single vector register.
 177
 178    VF is taken as the largest number of vector subparts over the relevant
 179    PHIs and the relevant or live stmts of the loop, and is stored in
 180    LOOP_VINFO_VECT_FACTOR.  Returns true on success and false, with a dump
 181    message when dumping is enabled, if the loop cannot be handled.
 182
 183    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 184    original loop:
 185         for (i=0; i<N; i++){
 186           a[i] = b[i] + c[i];
 187         }
 188
 189    vectorized loop:
 190         for (i=0; i<N; i+=VF){
 191           a[i:VF] = b[i:VF] + c[i:VF];
 192         }
 193 */
 194
 195 static bool
 196 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 197 {
 198   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 199   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 200   int nbbs = loop->num_nodes;
 201   gimple_stmt_iterator si;
 202   unsigned int vectorization_factor = 0;
 203   tree scalar_type;
 204   gimple phi;
 205   tree vectype;
 206   unsigned int nunits;
 207   stmt_vec_info stmt_info;
 208   int i;
 209   HOST_WIDE_INT dummy;
       /* State carried across iterations of the stmt loop below:
          PATTERN_STMT / ANALYZE_PATTERN_STMT request that the next iteration
          look at a pattern stmt instead of the stmt at SI, and
          PATTERN_DEF_SEQ / PATTERN_DEF_SI track the walk over that pattern
          stmt's def sequence.  */
 210   gimple stmt, pattern_stmt = NULL;
 211   gimple_seq pattern_def_seq = NULL;
 212   gimple_stmt_iterator pattern_def_si = gsi_none ();
 213   bool analyze_pattern_stmt = false;
 214
 215   if (dump_enabled_p ())
 216     dump_printf_loc (MSG_NOTE, vect_location,
 217                      "=== vect_determine_vectorization_factor ===\n");
 218
 219   for (i = 0; i < nbbs; i++)
 220     {
 221       basic_block bb = bbs[i];
 222
           /* First the PHIs of BB: give every relevant PHI a vectype derived
              from the type of its result and fold that vectype's number of
              subparts into the running maximum.  */
 223       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 224         {
 225           phi = gsi_stmt (si);
 226           stmt_info = vinfo_for_stmt (phi);
 227           if (dump_enabled_p ())
 228             {
 229               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 230               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 231               dump_printf (MSG_NOTE, "\n");
 232             }
 233
 234           gcc_assert (stmt_info);
 235
 236           if (STMT_VINFO_RELEVANT_P (stmt_info))
 237             {
 238               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 239               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 240
 241               if (dump_enabled_p ())
 242                 {
 243                   dump_printf_loc (MSG_NOTE, vect_location,
 244                                    "get vectype for scalar type:  ");
 245                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 246                   dump_printf (MSG_NOTE, "\n");
 247                 }
 248
 249               vectype = get_vectype_for_scalar_type (scalar_type);
 250               if (!vectype)
 251                 {
 252                   if (dump_enabled_p ())
 253                     {
 254                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 255                                        "not vectorized: unsupported "
 256                                        "data-type ");
 257                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 258                                          scalar_type);
 259                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 260                     }
 261                   return false;
 262                 }
 263               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 264
 265               if (dump_enabled_p ())
 266                 {
 267                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 268                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 269                   dump_printf (MSG_NOTE, "\n");
 270                 }
 271
 272               nunits = TYPE_VECTOR_SUBPARTS (vectype);
                   /* NOTE(review): NUNITS is unsigned but is printed with %d.  */
 273               if (dump_enabled_p ())
 274                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 275                                  nunits);
 276
 277               if (!vectorization_factor
 278                   || (nunits > vectorization_factor))
 279                 vectorization_factor = nunits;
 280             }
 281         }
 282
           /* Then the stmts of BB.  SI is advanced by hand (on the skip paths
              and at the bottom of the body) rather than in the loop header,
              because a single position of SI can be visited several times:
              for the stmt itself, for the relevant or live stmts of its
              pattern stmt's def sequence, and for the pattern stmt.  The loop
              therefore also keeps running while a pattern stmt is pending.  */
 283       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 284         {
 285           tree vf_vectype;
 286
               /* A pattern stmt was scheduled by an earlier visit of this
                  position: examine it instead of the stmt at SI.  */
 287           if (analyze_pattern_stmt)
 288             stmt = pattern_stmt;
 289           else
 290             stmt = gsi_stmt (si);
 291
 292           stmt_info = vinfo_for_stmt (stmt);
 293
 294           if (dump_enabled_p ())
 295             {
 296               dump_printf_loc (MSG_NOTE, vect_location,
 297                                "==> examining statement: ");
 298               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 299               dump_printf (MSG_NOTE, "\n");
 300             }
 301
 302           gcc_assert (stmt_info);
 303
 304           /* Skip stmts which do not need to be vectorized.  */
 305           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 306                && !STMT_VINFO_LIVE_P (stmt_info))
 307               || gimple_clobber_p (stmt))
 308             {
                   /* The stmt itself is not needed, but if it is part of a
                      pattern whose related stmt is relevant or live, analyze
                      that related stmt in its place right away.  */
 309               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 310                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 311                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 312                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 313                 {
 314                   stmt = pattern_stmt;
 315                   stmt_info = vinfo_for_stmt (pattern_stmt);
 316                   if (dump_enabled_p ())
 317                     {
 318                       dump_printf_loc (MSG_NOTE, vect_location,
 319                                        "==> examining pattern statement: ");
 320                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 321                       dump_printf (MSG_NOTE, "\n");
 322                     }
 323                 }
 324               else
 325                 {
 326                   if (dump_enabled_p ())
 327                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 328                   gsi_next (&si);
 329                   continue;
 330                 }
 331             }
               /* The stmt is needed and also has a relevant or live pattern
                  stmt: analyze the stmt now and come back for the pattern stmt
                  on the next iteration (SI is not advanced meanwhile).  */
 332           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 333                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 334                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 335                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 336             analyze_pattern_stmt = true;
 337
 338           /* If a pattern statement has def stmts, analyze them too.  */
 339           if (is_pattern_stmt_p (stmt_info))
 340             {
                   /* First visit of this pattern stmt: fetch its def sequence
                      and start at its head.  Later visits step past the def
                      stmt that was analyzed on the previous iteration.  */
 341               if (pattern_def_seq == NULL)
 342                 {
 343                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 344                   pattern_def_si = gsi_start (pattern_def_seq);
 345                 }
 346               else if (!gsi_end_p (pattern_def_si))
 347                 gsi_next (&pattern_def_si);
 348               if (pattern_def_seq != NULL)
 349                 {
 350                   gimple pattern_def_stmt = NULL;
 351                   stmt_vec_info pattern_def_stmt_info = NULL;
 352
                       /* Find the next def stmt that is relevant or live.  */
 353                   while (!gsi_end_p (pattern_def_si))
 354                     {
 355                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 356                       pattern_def_stmt_info
 357                         = vinfo_for_stmt (pattern_def_stmt);
 358                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 359                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 360                         break;
 361                       gsi_next (&pattern_def_si);
 362                     }
 363
 364                   if (!gsi_end_p (pattern_def_si))
 365                     {
 366                       if (dump_enabled_p ())
 367                         {
 368                           dump_printf_loc (MSG_NOTE, vect_location,
 369                                            "==> examining pattern def stmt: ");
 370                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 371                                             pattern_def_stmt, 0);
 372                           dump_printf (MSG_NOTE, "\n");
 373                         }
 374
                           /* This iteration analyzes the def stmt; the pattern
                              stmt is looked at again on a later iteration.  */
 375                       stmt = pattern_def_stmt;
 376                       stmt_info = pattern_def_stmt_info;
 377                     }
 378                   else
 379                     {
                           /* The def sequence is exhausted: analyze the pattern
                              stmt itself now and stop revisiting it.  */
 380                       pattern_def_si = gsi_none ();
 381                       analyze_pattern_stmt = false;
 382                     }
 383                 }
 384               else
 385                 analyze_pattern_stmt = false;
 386             }
 387
 388           if (gimple_get_lhs (stmt) == NULL_TREE
 389               /* MASK_STORE has no lhs, but is ok.  */
 390               && (!is_gimple_call (stmt)
 391                   || !gimple_call_internal_p (stmt)
 392                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 393             {
 394               if (is_gimple_call (stmt))
 395                 {
 396                   /* Ignore calls with no lhs.  These must be calls to
 397                      #pragma omp simd functions, and what vectorization factor
 398                      it really needs can't be determined until
 399                      vectorizable_simd_clone_call.  */
                       /* Same advance rule as at the bottom of the loop body:
                          move SI only when no pattern work is pending.  */
 400                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 401                     {
 402                       pattern_def_seq = NULL;
 403                       gsi_next (&si);
 404                     }
 405                   continue;
 406                 }
 407               if (dump_enabled_p ())
 408                 {
 409                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 410                                    "not vectorized: irregular stmt.");
 411                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 412                                     0);
 413                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 414                 }
 415               return false;
 416             }
 417
               /* A stmt whose expression type already has a vector mode makes
                  the whole loop unvectorizable here.  */
 418           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 419             {
 420               if (dump_enabled_p ())
 421                 {
 422                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 423                                    "not vectorized: vector stmt in loop:");
 424                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 425                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 426                 }
 427               return false;
 428             }
 429
 430           if (STMT_VINFO_VECTYPE (stmt_info))
 431             {
 432               /* The only case when a vectype had been already set is for stmts
 433                  that contain a dataref, or for "pattern-stmts" (stmts
 434                  generated by the vectorizer to represent/replace a certain
 435                  idiom).  */
 436               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 437                           || is_pattern_stmt_p (stmt_info)
 438                           || !gsi_end_p (pattern_def_si));
 439               vectype = STMT_VINFO_VECTYPE (stmt_info);
 440             }
 441           else
 442             {
                   /* No vectype yet: derive one from the type of the lhs, or,
                      for IFN_MASK_STORE (which has no lhs), from the type of
                      call argument 3, and record it on the stmt.  */
 443               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 444               if (is_gimple_call (stmt)
 445                   && gimple_call_internal_p (stmt)
 446                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 447                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 448               else
 449                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 450               if (dump_enabled_p ())
 451                 {
 452                   dump_printf_loc (MSG_NOTE, vect_location,
 453                                    "get vectype for scalar type:  ");
 454                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 455                   dump_printf (MSG_NOTE, "\n");
 456                 }
 457               vectype = get_vectype_for_scalar_type (scalar_type);
 458               if (!vectype)
 459                 {
 460                   if (dump_enabled_p ())
 461                     {
 462                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 463                                        "not vectorized: unsupported "
 464                                        "data-type ");
 465                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 466                                          scalar_type);
 467                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 468                     }
 469                   return false;
 470                 }
 471
 472               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 473
 474               if (dump_enabled_p ())
 475                 {
 476                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 477                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 478                   dump_printf (MSG_NOTE, "\n");
 479                 }
 480             }
 481
 482           /* The vectorization factor is according to the smallest
 483              scalar type (or the largest vector size, but we only
 484              support one vector size per loop).  */
 485           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 486                                                        &dummy);
 487           if (dump_enabled_p ())
 488             {
 489               dump_printf_loc (MSG_NOTE, vect_location,
 490                                "get vectype for scalar type:  ");
 491               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 492               dump_printf (MSG_NOTE, "\n");
 493             }
 494           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 495           if (!vf_vectype)
 496             {
 497               if (dump_enabled_p ())
 498                 {
 499                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 500                                    "not vectorized: unsupported data-type ");
 501                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 502                                      scalar_type);
 503                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 504                 }
 505               return false;
 506             }
 507
               /* The stmt's own vectype and the vectype of its smallest scalar
                  type must have modes of the same size; otherwise give up.  */
 508           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 509                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 510             {
 511               if (dump_enabled_p ())
 512                 {
 513                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 514                                    "not vectorized: different sized vector "
 515                                    "types in statement, ");
 516                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 517                                      vectype);
 518                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 519                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 520                                      vf_vectype);
 521                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 522                 }
 523               return false;
 524             }
 525
 526           if (dump_enabled_p ())
 527             {
 528               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 529               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 530               dump_printf (MSG_NOTE, "\n");
 531             }
 532
               /* Keep the largest number of subparts seen so far.  */
 533           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 534           if (dump_enabled_p ())
 535             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 536           if (!vectorization_factor
 537               || (nunits > vectorization_factor))
 538             vectorization_factor = nunits;
 539
               /* Move on to the next stmt of BB only once no pattern stmt and
                  no pattern def stmt remain pending for the current position;
                  resetting PATTERN_DEF_SEQ makes the next pattern stmt start
                  its def sequence afresh.  */
 540           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 541             {
 542               pattern_def_seq = NULL;
 543               gsi_next (&si);
 544             }
 545         }
 546     }
 547
 548   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
       /* NOTE(review): VECTORIZATION_FACTOR is unsigned but printed with %d.  */
 549   if (dump_enabled_p ())
 550     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 551                      vectorization_factor);
       /* A factor of 0 (nothing contributed) or 1 is treated as a failure.  */
 552   if (vectorization_factor <= 1)
 553     {
 554       if (dump_enabled_p ())
 555         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 556                          "not vectorized: unsupported data-type\n");
 557       return false;
 558     }
 559   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 560
 561   return true;
 562 }
 563
 564
 565 /* Function vect_is_simple_iv_evolution.
 566
 567    Return true if the evolution of ACCESS_FN in loop number LOOP_NB is
 568    "simple".  FORNOW: a simple evolution of an induction variable in the
 569    loop is considered a polynomial evolution.  Once the evolution part is
 570    known not to be a chrec, *INIT and *STEP are set even if the step is
 571    then rejected.  */
 572
 573 static bool
 574 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 575                              tree * step)
 576 {
 577   tree stride = evolution_part_in_loop_num (access_fn, loop_nb);
 578   tree start;
 579   basic_block def_bb;
 580
 581   /* A missing evolution part, or one that is itself a chrec (polynomial
 582      of degree >= 2), is not "simple".  */
 583   if (stride == NULL_TREE || tree_is_chrec (stride))
 584     return false;
 585
 586   start = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 587
 588   if (dump_enabled_p ())
 589     {
 590       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 591       dump_generic_expr (MSG_NOTE, TDF_SLIM, stride);
 592       dump_printf (MSG_NOTE, ",  init: ");
 593       dump_generic_expr (MSG_NOTE, TDF_SLIM, start);
 594       dump_printf (MSG_NOTE, "\n");
 595     }
 596
 597   *init = start;
 598   *step = stride;
 599
 600   /* Constant steps: any INTEGER_CST, and a REAL_CST only when
 601      flag_associative_math is set.  */
 602   if (TREE_CODE (stride) == INTEGER_CST
 603       || (TREE_CODE (stride) == REAL_CST && flag_associative_math))
 604     return true;
 605
 606   /* An SSA_NAME step must not be defined inside the loop, and must be
 607      integral or, with flag_associative_math, a scalar float.  */
 608   if (TREE_CODE (stride) == SSA_NAME
 609       && !((def_bb = gimple_bb (SSA_NAME_DEF_STMT (stride)))
 610            && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), def_bb))
 611       && (INTEGRAL_TYPE_P (TREE_TYPE (stride))
 612           || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (stride))
 613               && flag_associative_math)))
 614     return true;
 615
 616   /* Any other step is rejected.  */
 617   if (dump_enabled_p ())
 618     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 619                      "step unknown.\n");
 620   return false;
 621 }
 622
 623 /* Function vect_analyze_scalar_cycles_1.
 624
 625    Classify the cross-iteration def-use cycles of scalar variables in LOOP
 626    by walking the PHIs of its header block.  LOOP_VINFO represents the loop
 627    that is being considered for vectorization; that is either LOOP itself
 628    or an outer loop enclosing LOOP.  */
 629
 630 static void
 631 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 632 {
 633   basic_block header = loop->header;
 634   gimple_stmt_iterator phi_it;
 635   auto_vec<gimple, 64> pending;
 636   tree init, step;
 637   bool double_reduc;
 638
 639   if (dump_enabled_p ())
 640     dump_printf_loc (MSG_NOTE, vect_location,
 641                      "=== vect_analyze_scalar_cycles ===\n");
 642
 643   /* Pass 1: find the inductions.  Reduction detection assumes that every
 644      induction is already known, so this pass must stay ahead of pass 2.
 645      PHIs that turn out not to be inductions are queued in PENDING.  */
 646   for (phi_it = gsi_start_phis (header); !gsi_end_p (phi_it);
 647        gsi_next (&phi_it))
 648     {
 649       gimple phi = gsi_stmt (phi_it);
 650       tree result = PHI_RESULT (phi);
 651       stmt_vec_info phi_info = vinfo_for_stmt (phi);
 652       tree access_fn;
 653       bool is_induction;
 654
 655       if (dump_enabled_p ())
 656         {
 657           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 658           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 659           dump_printf (MSG_NOTE, "\n");
 660         }
 661
 662       /* Virtual PHIs are not classified here: the data dependences tied to
 663          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 664       if (virtual_operand_p (result))
 665         continue;
 666
 667       STMT_VINFO_DEF_TYPE (phi_info) = vect_unknown_def_type;
 668
 669       /* Compute the evolution function of the PHI result and record its
 670          evolution part in this loop.  */
 671       access_fn = analyze_scalar_evolution (loop, result);
 672       if (access_fn)
 673         {
 674           STRIP_NOPS (access_fn);
 675           if (dump_enabled_p ())
 676             {
 677               dump_printf_loc (MSG_NOTE, vect_location,
 678                                "Access function of PHI: ");
 679               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 680               dump_printf (MSG_NOTE, "\n");
 681             }
 682           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info)
 683             = evolution_part_in_loop_num (access_fn, loop->num);
 684         }
 685
 686       /* An induction needs a simple evolution; when LOOP is not the loop
 687          described by LOOP_VINFO, the step must also be an INTEGER_CST.  */
 688       is_induction
 689         = (access_fn
 690            && vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 691            && (LOOP_VINFO_LOOP (loop_vinfo) == loop
 692                || TREE_CODE (step) == INTEGER_CST));
 693       if (!is_induction)
 694         {
 695           pending.safe_push (phi);
 696           continue;
 697         }
 698
 699       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info) != NULL_TREE);
 700
 701       if (dump_enabled_p ())
 702         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 703       STMT_VINFO_DEF_TYPE (phi_info) = vect_induction_def;
 704     }
 705
 706
 707   /* Pass 2: classify the queued PHIs as reductions or nested cycles.  */
 708   while (pending.length () != 0)
 709     {
 710       gimple phi = pending.pop ();
 711       tree result = PHI_RESULT (phi);
 712       stmt_vec_info phi_info = vinfo_for_stmt (phi);
 713       bool inner_cycle;
 714       gimple reduc_stmt;
 715
 716       if (dump_enabled_p ())
 717         {
 718           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 719           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 720           dump_printf (MSG_NOTE, "\n");
 721         }
 722
 723       gcc_assert (!virtual_operand_p (result)
 724                   && STMT_VINFO_DEF_TYPE (phi_info) == vect_unknown_def_type);
 725
 726       inner_cycle = (LOOP_VINFO_LOOP (loop_vinfo) != loop);
 727       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !inner_cycle,
 728                                                 &double_reduc);
 729       if (!reduc_stmt)
 730         {
 731           if (dump_enabled_p ())
 732             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 733                              "Unknown def-use cycle pattern.\n");
 734           continue;
 735         }
 736
 737       if (double_reduc)
 738         {
 739           if (dump_enabled_p ())
 740             dump_printf_loc (MSG_NOTE, vect_location,
 741                              "Detected double reduction.\n");
 742           STMT_VINFO_DEF_TYPE (phi_info) = vect_double_reduction_def;
 743           STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
 744             = vect_double_reduction_def;
 745         }
 746       else if (inner_cycle)
 747         {
 748           if (dump_enabled_p ())
 749             dump_printf_loc (MSG_NOTE, vect_location,
 750                              "Detected vectorizable nested cycle.\n");
 751           STMT_VINFO_DEF_TYPE (phi_info) = vect_nested_cycle;
 752           STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
 753             = vect_nested_cycle;
 754         }
 755       else
 756         {
 757           if (dump_enabled_p ())
 758             dump_printf_loc (MSG_NOTE, vect_location, "Detected reduction.\n");
 759           STMT_VINFO_DEF_TYPE (phi_info) = vect_reduction_def;
 760           STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
 761             = vect_reduction_def;
 762           /* Remember the reduction for possible use by loop-aware SLP.  */
 763           LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 764         }
 765     }
 766 }
 767
 768
 769 /* Function vect_analyze_scalar_cycles.
 770
 771    Examine the cross iteration def-use cycles of scalar variables, by
 772    analyzing the loop-header PHIs of scalar variables.  Classify each
 773    cycle as one of the following: invariant, induction, reduction, unknown.
 774    We do that for the loop represented by LOOP_VINFO, and also to its
 775    inner-loop, if exists.
 776    Examples for scalar cycles:
 777
 778    Example1: reduction:
 779
 780               loop1:
 781               for (i=0; i<N; i++)
 782                  sum += a[i];
 783
 784    Example2: induction:
 785
 786               loop2:
 787               for (i=0; i<N; i++)
 788                  a[i] = i;  */
 789
 790 static void
 791 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 792 {
 793   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 794
 795   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 796
 797   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 798      Reductions in such inner-loop therefore have different properties than
 799      the reductions in the nest that gets vectorized:
 800      1. When vectorized, they are executed in the same order as in the original
 801         scalar loop, so we can't change the order of computation when
 802         vectorizing them.
 803      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 804         current checks are too strict.  */
 805
 806   if (loop->inner)
 807     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 808 }
 809
 810
 811 /* Function vect_get_loop_niters.
 812
 813    Determine how many iterations the loop is executed and place it
 814    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 815    in NUMBER_OF_ITERATIONSM1.
 816
 817    Return the loop exit condition.  */
 818
 819 static gimple
 820 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 821                       tree *number_of_iterationsm1)
 822 {
 823   tree niters;
 824
 825   if (dump_enabled_p ())
 826     dump_printf_loc (MSG_NOTE, vect_location,
 827                      "=== get_loop_niters ===\n");
 828
 829   niters = number_of_latch_executions (loop);
 830   *number_of_iterationsm1 = niters;
 831
 832   /* We want the number of loop header executions which is the number
 833      of latch executions plus one.
 834      ???  For UINT_MAX latch executions this number overflows to zero
 835      for loops like do { n++; } while (n != 0);  */
 836   if (niters && !chrec_contains_undetermined (niters))
 837     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 838                           build_int_cst (TREE_TYPE (niters), 1));
 839   *number_of_iterations = niters;
 840
 841   return get_loop_exit_condition (loop);
 842 }
 843
 844
 845 /* Function bb_in_loop_p
 846
 847    Used as predicate for dfs order traversal of the loop bbs.  */
 848
 849 static bool
 850 bb_in_loop_p (const_basic_block bb, const void *data)
 851 {
 852   const struct loop *const loop = (const struct loop *)data;
 853   if (flow_bb_inside_loop_p (loop, bb))
 854     return true;
 855   return false;
 856 }
 857
 858
 859 /* Function new_loop_vec_info.
 860
 861    Create and initialize a new loop_vec_info struct for LOOP, as well as
 862    stmt_vec_info structs for all the stmts in LOOP.  */
 863
 864 static loop_vec_info
 865 new_loop_vec_info (struct loop *loop)
 866 {
 867   loop_vec_info res;
 868   basic_block *bbs;
 869   gimple_stmt_iterator si;
 870   unsigned int i, nbbs;
 871
 872   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 873   LOOP_VINFO_LOOP (res) = loop;
 874
 875   bbs = get_loop_body (loop);
 876
 877   /* Create/Update stmt_info for all stmts in the loop.  */
 878   for (i = 0; i < loop->num_nodes; i++)
 879     {
 880       basic_block bb = bbs[i];
 881
 882       /* BBs in a nested inner-loop will have been already processed (because
 883          we will have called vect_analyze_loop_form for any nested inner-loop).
 884          Therefore, for stmts in an inner-loop we just want to update the
 885          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 886          loop_info of the outer-loop we are currently considering to vectorize
 887          (instead of the loop_info of the inner-loop).
 888          For stmts in other BBs we need to create a stmt_info from scratch.  */
 889       if (bb->loop_father != loop)
 890         {
 891           /* Inner-loop bb.  */
 892           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 893           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 894             {
 895               gimple phi = gsi_stmt (si);
 896               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 897               loop_vec_info inner_loop_vinfo =
 898                 STMT_VINFO_LOOP_VINFO (stmt_info);
 899               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 900               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 901             }
 902           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 903            {
 904               gimple stmt = gsi_stmt (si);
 905               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 906               loop_vec_info inner_loop_vinfo =
 907                  STMT_VINFO_LOOP_VINFO (stmt_info);
 908               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 909               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 910            }
 911         }
 912       else
 913         {
 914           /* bb in current nest.  */
 915           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 916             {
 917               gimple phi = gsi_stmt (si);
 918               gimple_set_uid (phi, 0);
 919               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 920             }
 921
 922           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 923             {
 924               gimple stmt = gsi_stmt (si);
 925               gimple_set_uid (stmt, 0);
 926               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 927             }
 928         }
 929     }
 930
 931   /* CHECKME: We want to visit all BBs before their successors (except for
 932      latch blocks, for which this assertion wouldn't hold).  In the simple
 933      case of the loop forms we allow, a dfs order of the BBs would the same
 934      as reversed postorder traversal, so we are safe.  */
 935
 936    free (bbs);
 937    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 938    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 939                               bbs, loop->num_nodes, loop);
 940    gcc_assert (nbbs == loop->num_nodes);
 941
 942   LOOP_VINFO_BBS (res) = bbs;
 943   LOOP_VINFO_NITERSM1 (res) = NULL;
 944   LOOP_VINFO_NITERS (res) = NULL;
 945   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 946   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 947   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 948   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 949   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 950   LOOP_VINFO_VECT_FACTOR (res) = 0;
 951   LOOP_VINFO_LOOP_NEST (res).create (3);
 952   LOOP_VINFO_DATAREFS (res).create (10);
 953   LOOP_VINFO_DDRS (res).create (10 * 10);
 954   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 955   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 956              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 957   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 958              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 959   LOOP_VINFO_GROUPED_STORES (res).create (10);
 960   LOOP_VINFO_REDUCTIONS (res).create (10);
 961   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 962   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 963   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 964   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 965   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 966   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 967   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 968
 969   return res;
 970 }
 971
 972
 973 /* Function destroy_loop_vec_info.
 974
 975    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 976    stmts in the loop.  */
 977
 978 void
 979 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 980 {
 981   struct loop *loop;
 982   basic_block *bbs;
 983   int nbbs;
 984   gimple_stmt_iterator si;
 985   int j;
 986   vec<slp_instance> slp_instances;
 987   slp_instance instance;
 988   bool swapped;
 989
 990   if (!loop_vinfo)
 991     return;
 992
 993   loop = LOOP_VINFO_LOOP (loop_vinfo);
 994
 995   bbs = LOOP_VINFO_BBS (loop_vinfo);
 996   nbbs = clean_stmts ? loop->num_nodes : 0;
 997   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 998
 999   for (j = 0; j < nbbs; j++)
1000     {
1001       basic_block bb = bbs[j];
1002       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1003         free_stmt_vec_info (gsi_stmt (si));
1004
1005       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1006         {
1007           gimple stmt = gsi_stmt (si);
1008
1009           /* We may have broken canonical form by moving a constant
1010              into RHS1 of a commutative op.  Fix such occurrences.  */
1011           if (swapped && is_gimple_assign (stmt))
1012             {
1013               enum tree_code code = gimple_assign_rhs_code (stmt);
1014
1015               if ((code == PLUS_EXPR
1016                    || code == POINTER_PLUS_EXPR
1017                    || code == MULT_EXPR)
1018                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1019                 swap_ssa_operands (stmt,
1020                                    gimple_assign_rhs1_ptr (stmt),
1021                                    gimple_assign_rhs2_ptr (stmt));
1022             }
1023
1024           /* Free stmt_vec_info.  */
1025           free_stmt_vec_info (stmt);
1026           gsi_next (&si);
1027         }
1028     }
1029
1030   free (LOOP_VINFO_BBS (loop_vinfo));
1031   vect_destroy_datarefs (loop_vinfo, NULL);
1032   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1033   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1034   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1035   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1036   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1037   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1038     vect_free_slp_instance (instance);
1039
1040   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1041   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1042   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1043   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1044
1045   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1046   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1047
1048   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1049
1050   free (loop_vinfo);
1051   loop->aux = NULL;
1052 }
1053
1054
1055 /* Function vect_analyze_loop_1.
1056
1057    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1058    for it. The different analyses will record information in the
1059    loop_vec_info struct.  This is a subset of the analyses applied in
1060    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1061    that is now considered for (outer-loop) vectorization.  */
1062
1063 static loop_vec_info
1064 vect_analyze_loop_1 (struct loop *loop)
1065 {
1066   loop_vec_info loop_vinfo;
1067
1068   if (dump_enabled_p ())
1069     dump_printf_loc (MSG_NOTE, vect_location,
1070                      "===== analyze_loop_nest_1 =====\n");
1071
1072   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1073
1074   loop_vinfo = vect_analyze_loop_form (loop);
1075   if (!loop_vinfo)
1076     {
1077       if (dump_enabled_p ())
1078         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079                          "bad inner-loop form.\n");
1080       return NULL;
1081     }
1082
1083   return loop_vinfo;
1084 }
1085
1086
1087 /* Function vect_analyze_loop_form.
1088
1089    Verify that certain CFG restrictions hold, including:
1090    - the loop has a pre-header
1091    - the loop has a single entry and exit
1092    - the loop exit condition is simple enough, and the number of iterations
1093      can be analyzed (a countable loop).  */
1094
1095 loop_vec_info
1096 vect_analyze_loop_form (struct loop *loop)
1097 {
1098   loop_vec_info loop_vinfo;
1099   gimple loop_cond;
1100   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1101   loop_vec_info inner_loop_vinfo = NULL;
1102
1103   if (dump_enabled_p ())
1104     dump_printf_loc (MSG_NOTE, vect_location,
1105                      "=== vect_analyze_loop_form ===\n");
1106
1107   /* Different restrictions apply when we are considering an inner-most loop,
1108      vs. an outer (nested) loop.
1109      (FORNOW. May want to relax some of these restrictions in the future).  */
1110
1111   if (!loop->inner)
1112     {
1113       /* Inner-most loop.  We currently require that the number of BBs is
1114          exactly 2 (the header and latch).  Vectorizable inner-most loops
1115          look like this:
1116
1117                         (pre-header)
1118                            |
1119                           header <--------+
1120                            | |            |
1121                            | +--> latch --+
1122                            |
1123                         (exit-bb)  */
1124
1125       if (loop->num_nodes != 2)
1126         {
1127           if (dump_enabled_p ())
1128             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129                              "not vectorized: control flow in loop.\n");
1130           return NULL;
1131         }
1132
1133       if (empty_block_p (loop->header))
1134         {
1135           if (dump_enabled_p ())
1136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137                              "not vectorized: empty loop.\n");
1138           return NULL;
1139         }
1140     }
1141   else
1142     {
1143       struct loop *innerloop = loop->inner;
1144       edge entryedge;
1145
1146       /* Nested loop. We currently require that the loop is doubly-nested,
1147          contains a single inner loop, and the number of BBs is exactly 5.
1148          Vectorizable outer-loops look like this:
1149
1150                         (pre-header)
1151                            |
1152                           header <---+
1153                            |         |
1154                           inner-loop |
1155                            |         |
1156                           tail ------+
1157                            |
1158                         (exit-bb)
1159
1160          The inner-loop has the properties expected of inner-most loops
1161          as described above.  */
1162
1163       if ((loop->inner)->inner || (loop->inner)->next)
1164         {
1165           if (dump_enabled_p ())
1166             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167                              "not vectorized: multiple nested loops.\n");
1168           return NULL;
1169         }
1170
1171       /* Analyze the inner-loop.  */
1172       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1173       if (!inner_loop_vinfo)
1174         {
1175           if (dump_enabled_p ())
1176             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177                              "not vectorized: Bad inner loop.\n");
1178           return NULL;
1179         }
1180
1181       if (!expr_invariant_in_loop_p (loop,
1182                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1183         {
1184           if (dump_enabled_p ())
1185             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186                              "not vectorized: inner-loop count not"
1187                              " invariant.\n");
1188           destroy_loop_vec_info (inner_loop_vinfo, true);
1189           return NULL;
1190         }
1191
1192       if (loop->num_nodes != 5)
1193         {
1194           if (dump_enabled_p ())
1195             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196                              "not vectorized: control flow in loop.\n");
1197           destroy_loop_vec_info (inner_loop_vinfo, true);
1198           return NULL;
1199         }
1200
1201       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1202       entryedge = EDGE_PRED (innerloop->header, 0);
1203       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1204         entryedge = EDGE_PRED (innerloop->header, 1);
1205
1206       if (entryedge->src != loop->header
1207           || !single_exit (innerloop)
1208           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1209         {
1210           if (dump_enabled_p ())
1211             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1212                              "not vectorized: unsupported outerloop form.\n");
1213           destroy_loop_vec_info (inner_loop_vinfo, true);
1214           return NULL;
1215         }
1216
1217       if (dump_enabled_p ())
1218         dump_printf_loc (MSG_NOTE, vect_location,
1219                          "Considering outer-loop vectorization.\n");
1220     }
1221
1222   if (!single_exit (loop)
1223       || EDGE_COUNT (loop->header->preds) != 2)
1224     {
1225       if (dump_enabled_p ())
1226         {
1227           if (!single_exit (loop))
1228             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229                              "not vectorized: multiple exits.\n");
1230           else if (EDGE_COUNT (loop->header->preds) != 2)
1231             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232                              "not vectorized: too many incoming edges.\n");
1233         }
1234       if (inner_loop_vinfo)
1235         destroy_loop_vec_info (inner_loop_vinfo, true);
1236       return NULL;
1237     }
1238
1239   /* We assume that the loop exit condition is at the end of the loop. i.e,
1240      that the loop is represented as a do-while (with a proper if-guard
1241      before the loop if needed), where the loop header contains all the
1242      executable statements, and the latch is empty.  */
1243   if (!empty_block_p (loop->latch)
1244       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1245     {
1246       if (dump_enabled_p ())
1247         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248                          "not vectorized: latch block not empty.\n");
1249       if (inner_loop_vinfo)
1250         destroy_loop_vec_info (inner_loop_vinfo, true);
1251       return NULL;
1252     }
1253
1254   /* Make sure there exists a single-predecessor exit bb:  */
1255   if (!single_pred_p (single_exit (loop)->dest))
1256     {
1257       edge e = single_exit (loop);
1258       if (!(e->flags & EDGE_ABNORMAL))
1259         {
1260           split_loop_exit_edge (e);
1261           if (dump_enabled_p ())
1262             dump_printf (MSG_NOTE, "split exit edge.\n");
1263         }
1264       else
1265         {
1266           if (dump_enabled_p ())
1267             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1268                              "not vectorized: abnormal loop exit edge.\n");
1269           if (inner_loop_vinfo)
1270             destroy_loop_vec_info (inner_loop_vinfo, true);
1271           return NULL;
1272         }
1273     }
1274
1275   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1276                                     &number_of_iterationsm1);
1277   if (!loop_cond)
1278     {
1279       if (dump_enabled_p ())
1280         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1281                          "not vectorized: complicated exit condition.\n");
1282       if (inner_loop_vinfo)
1283         destroy_loop_vec_info (inner_loop_vinfo, true);
1284       return NULL;
1285     }
1286
1287   if (!number_of_iterations
1288       || chrec_contains_undetermined (number_of_iterations))
1289     {
1290       if (dump_enabled_p ())
1291         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292                          "not vectorized: number of iterations cannot be "
1293                          "computed.\n");
1294       if (inner_loop_vinfo)
1295         destroy_loop_vec_info (inner_loop_vinfo, true);
1296       return NULL;
1297     }
1298
1299   if (integer_zerop (number_of_iterations))
1300     {
1301       if (dump_enabled_p ())
1302         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303                          "not vectorized: number of iterations = 0.\n");
1304       if (inner_loop_vinfo)
1305         destroy_loop_vec_info (inner_loop_vinfo, true);
1306       return NULL;
1307     }
1308
1309   loop_vinfo = new_loop_vec_info (loop);
1310   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1311   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1312   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1313
1314   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1315     {
1316       if (dump_enabled_p ())
1317         {
1318           dump_printf_loc (MSG_NOTE, vect_location,
1319                            "Symbolic number of iterations is ");
1320           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1321           dump_printf (MSG_NOTE, "\n");
1322         }
1323     }
1324
1325   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1326
1327   /* CHECKME: May want to keep it around it in the future.  */
1328   if (inner_loop_vinfo)
1329     destroy_loop_vec_info (inner_loop_vinfo, false);
1330
1331   gcc_assert (!loop->aux);
1332   loop->aux = loop_vinfo;
1333   return loop_vinfo;
1334 }
1335
1336
1337 /* Function vect_analyze_loop_operations.
1338
1339    Scan the loop stmts and make sure they are all vectorizable.  */
1340
1341 static bool
1342 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1343 {
1344   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1345   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1346   int nbbs = loop->num_nodes;
1347   gimple_stmt_iterator si;
1348   unsigned int vectorization_factor = 0;
1349   int i;
1350   gimple phi;
1351   stmt_vec_info stmt_info;
1352   bool need_to_vectorize = false;
1353   int min_profitable_iters;
1354   int min_scalar_loop_bound;
1355   unsigned int th;
1356   bool only_slp_in_loop = true, ok;
1357   HOST_WIDE_INT max_niter;
1358   HOST_WIDE_INT estimated_niter;
1359   int min_profitable_estimate;
1360
1361   if (dump_enabled_p ())
1362     dump_printf_loc (MSG_NOTE, vect_location,
1363                      "=== vect_analyze_loop_operations ===\n");
1364
1365   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1366   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1367   if (slp)
1368     {
1369       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1370          vectorization factor of the loop is the unrolling factor required by
1371          the SLP instances.  If that unrolling factor is 1, we say, that we
1372          perform pure SLP on loop - cross iteration parallelism is not
1373          exploited.  */
1374       for (i = 0; i < nbbs; i++)
1375         {
1376           basic_block bb = bbs[i];
1377           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1378             {
1379               gimple stmt = gsi_stmt (si);
1380               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1381               gcc_assert (stmt_info);
1382               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1383                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1384                   && !PURE_SLP_STMT (stmt_info))
1385                 /* STMT needs both SLP and loop-based vectorization.  */
1386                 only_slp_in_loop = false;
1387             }
1388         }
1389
1390       if (only_slp_in_loop)
1391         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1392       else
1393         vectorization_factor = least_common_multiple (vectorization_factor,
1394                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1395
1396       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1397       if (dump_enabled_p ())
1398         dump_printf_loc (MSG_NOTE, vect_location,
1399                          "Updating vectorization factor to %d\n",
1400                          vectorization_factor);
1401     }
1402
1403   for (i = 0; i < nbbs; i++)
1404     {
1405       basic_block bb = bbs[i];
1406
1407       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1408         {
1409           phi = gsi_stmt (si);
1410           ok = true;
1411
1412           stmt_info = vinfo_for_stmt (phi);
1413           if (dump_enabled_p ())
1414             {
1415               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1416               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1417               dump_printf (MSG_NOTE, "\n");
1418             }
1419
1420           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1421              (i.e., a phi in the tail of the outer-loop).  */
1422           if (! is_loop_header_bb_p (bb))
1423             {
1424               /* FORNOW: we currently don't support the case that these phis
1425                  are not used in the outerloop (unless it is double reduction,
1426                  i.e., this phi is vect_reduction_def), cause this case
1427                  requires to actually do something here.  */
1428               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1429                    || STMT_VINFO_LIVE_P (stmt_info))
1430                   && STMT_VINFO_DEF_TYPE (stmt_info)
1431                      != vect_double_reduction_def)
1432                 {
1433                   if (dump_enabled_p ())
1434                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435                                      "Unsupported loop-closed phi in "
1436                                      "outer-loop.\n");
1437                   return false;
1438                 }
1439
1440               /* If PHI is used in the outer loop, we check that its operand
1441                  is defined in the inner loop.  */
1442               if (STMT_VINFO_RELEVANT_P (stmt_info))
1443                 {
1444                   tree phi_op;
1445                   gimple op_def_stmt;
1446
1447                   if (gimple_phi_num_args (phi) != 1)
1448                     return false;
1449
1450                   phi_op = PHI_ARG_DEF (phi, 0);
1451                   if (TREE_CODE (phi_op) != SSA_NAME)
1452                     return false;
1453
1454                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1455                   if (gimple_nop_p (op_def_stmt)
1456                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1457                       || !vinfo_for_stmt (op_def_stmt))
1458                     return false;
1459
1460                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1461                         != vect_used_in_outer
1462                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1463                            != vect_used_in_outer_by_reduction)
1464                     return false;
1465                 }
1466
1467               continue;
1468             }
1469
1470           gcc_assert (stmt_info);
1471
1472           if (STMT_VINFO_LIVE_P (stmt_info))
1473             {
1474               /* FORNOW: not yet supported.  */
1475               if (dump_enabled_p ())
1476                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477                                  "not vectorized: value used after loop.\n");
1478               return false;
1479             }
1480
1481           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1482               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1483             {
1484               /* A scalar-dependence cycle that we don't support.  */
1485               if (dump_enabled_p ())
1486                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487                                  "not vectorized: scalar dependence cycle.\n");
1488               return false;
1489             }
1490
1491           if (STMT_VINFO_RELEVANT_P (stmt_info))
1492             {
1493               need_to_vectorize = true;
1494               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1495                 ok = vectorizable_induction (phi, NULL, NULL);
1496             }
1497
1498           if (!ok)
1499             {
1500               if (dump_enabled_p ())
1501                 {
1502                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503                                    "not vectorized: relevant phi not "
1504                                    "supported: ");
1505                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1506                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1507                 }
1508               return false;
1509             }
1510         }
1511
1512       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1513         {
1514           gimple stmt = gsi_stmt (si);
1515           if (!gimple_clobber_p (stmt)
1516               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1517             return false;
1518         }
1519     } /* bbs */
1520
1521   /* All operations in the loop are either irrelevant (deal with loop
1522      control, or dead), or only used outside the loop and can be moved
1523      out of the loop (e.g. invariants, inductions).  The loop can be
1524      optimized away by scalar optimizations.  We're better off not
1525      touching this loop.  */
1526   if (!need_to_vectorize)
1527     {
1528       if (dump_enabled_p ())
1529         dump_printf_loc (MSG_NOTE, vect_location,
1530                          "All the computation can be taken out of the loop.\n");
1531       if (dump_enabled_p ())
1532         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                          "not vectorized: redundant loop. no profit to "
1534                          "vectorize.\n");
1535       return false;
1536     }
1537
1538   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1539     dump_printf_loc (MSG_NOTE, vect_location,
1540                      "vectorization_factor = %d, niters = "
1541                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1542                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1543
1544   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1546       || ((max_niter = max_stmt_executions_int (loop)) != -1
1547           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1548     {
1549       if (dump_enabled_p ())
1550         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1551                          "not vectorized: iteration count too small.\n");
1552       if (dump_enabled_p ())
1553         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1554                          "not vectorized: iteration count smaller than "
1555                          "vectorization factor.\n");
1556       return false;
1557     }
1558
1559   /* Analyze cost.  Decide if worth while to vectorize.  */
1560
1561   /* Once VF is set, SLP costs should be updated since the number of created
1562      vector stmts depends on VF.  */
1563   vect_update_slp_costs_according_to_vf (loop_vinfo);
1564
1565   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1566                                       &min_profitable_estimate);
1567   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1568
1569   if (min_profitable_iters < 0)
1570     {
1571       if (dump_enabled_p ())
1572         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573                          "not vectorized: vectorization not profitable.\n");
1574       if (dump_enabled_p ())
1575         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1576                          "not vectorized: vector version will never be "
1577                          "profitable.\n");
1578       return false;
1579     }
1580
1581   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1582                             * vectorization_factor) - 1);
1583
1584
1585   /* Use the cost model only if it is more conservative than user specified
1586      threshold.  */
1587
1588   th = (unsigned) min_scalar_loop_bound;
1589   if (min_profitable_iters
1590       && (!min_scalar_loop_bound
1591           || min_profitable_iters > min_scalar_loop_bound))
1592     th = (unsigned) min_profitable_iters;
1593
1594   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1595
1596   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1597       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1598     {
1599       if (dump_enabled_p ())
1600         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601                          "not vectorized: vectorization not profitable.\n");
1602       if (dump_enabled_p ())
1603         dump_printf_loc (MSG_NOTE, vect_location,
1604                          "not vectorized: iteration count smaller than user "
1605                          "specified loop bound parameter or minimum profitable "
1606                          "iterations (whichever is more conservative).\n");
1607       return false;
1608     }
1609
1610   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1611       && ((unsigned HOST_WIDE_INT) estimated_niter
1612           <= MAX (th, (unsigned)min_profitable_estimate)))
1613     {
1614       if (dump_enabled_p ())
1615         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1616                          "not vectorized: estimated iteration count too "
1617                          "small.\n");
1618       if (dump_enabled_p ())
1619         dump_printf_loc (MSG_NOTE, vect_location,
1620                          "not vectorized: estimated iteration count smaller "
1621                          "than specified loop bound parameter or minimum "
1622                          "profitable iterations (whichever is more "
1623                          "conservative).\n");
1624       return false;
1625     }
1626
1627   return true;
1628 }
1629
1630
1631 /* Function vect_analyze_loop_2.
1632
1633    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1634    for it.  The different analyses will record information in the
1635    loop_vec_info struct.  */
1636 static bool
1637 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1638 {
1639   bool ok, slp = false;
1640   int max_vf = MAX_VECTORIZATION_FACTOR;
1641   int min_vf = 2;
1642   unsigned int th;
1643   unsigned int n_stmts = 0;
1644
1645   /* Find all data references in the loop (which correspond to vdefs/vuses)
1646      and analyze their evolution in the loop.  Also adjust the minimal
1647      vectorization factor according to the loads and stores.
1648
1649      FORNOW: Handle only simple, array references, which
1650      alignment can be forced, and aligned pointer-references.  */
1651
1652   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1653   if (!ok)
1654     {
1655       if (dump_enabled_p ())
1656         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657                          "bad data references.\n");
1658       return false;
1659     }
1660
1661   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1662      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1663
1664   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1665   if (!ok)
1666     {
1667       if (dump_enabled_p ())
1668         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1669                          "bad data access.\n");
1670       return false;
1671     }
1672
1673   /* Classify all cross-iteration scalar data-flow cycles.
1674      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1675
1676   vect_analyze_scalar_cycles (loop_vinfo);
1677
1678   vect_pattern_recog (loop_vinfo, NULL);
1679
1680   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1681
1682   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1683   if (!ok)
1684     {
1685       if (dump_enabled_p ())
1686         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687                          "unexpected pattern.\n");
1688       return false;
1689     }
1690
1691   /* Analyze data dependences between the data-refs in the loop
1692      and adjust the maximum vectorization factor according to
1693      the dependences.
1694      FORNOW: fail at the first data dependence that we encounter.  */
1695
1696   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1697   if (!ok
1698       || max_vf < min_vf)
1699     {
1700       if (dump_enabled_p ())
1701             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1702                              "bad data dependence.\n");
1703       return false;
1704     }
1705
1706   ok = vect_determine_vectorization_factor (loop_vinfo);
1707   if (!ok)
1708     {
1709       if (dump_enabled_p ())
1710         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1711                          "can't determine vectorization factor.\n");
1712       return false;
1713     }
1714   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1715     {
1716       if (dump_enabled_p ())
1717         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1718                          "bad data dependence.\n");
1719       return false;
1720     }
1721
1722   /* Analyze the alignment of the data-refs in the loop.
1723      Fail if a data reference is found that cannot be vectorized.  */
1724
1725   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1726   if (!ok)
1727     {
1728       if (dump_enabled_p ())
1729         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730                          "bad data alignment.\n");
1731       return false;
1732     }
1733
1734   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1735      It is important to call pruning after vect_analyze_data_ref_accesses,
1736      since we use grouping information gathered by interleaving analysis.  */
1737   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1738   if (!ok)
1739     {
1740       if (dump_enabled_p ())
1741         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1742                          "number of versioning for alias "
1743                          "run-time tests exceeds %d "
1744                          "(--param vect-max-version-for-alias-checks)\n",
1745                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1746       return false;
1747     }
1748
1749   /* This pass will decide on using loop versioning and/or loop peeling in
1750      order to enhance the alignment of data references in the loop.  */
1751
1752   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1753   if (!ok)
1754     {
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1757                          "bad data alignment.\n");
1758       return false;
1759     }
1760
1761   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1762   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1763   if (ok)
1764     {
1765       /* Decide which possible SLP instances to SLP.  */
1766       slp = vect_make_slp_decision (loop_vinfo);
1767
1768       /* Find stmts that need to be both vectorized and SLPed.  */
1769       vect_detect_hybrid_slp (loop_vinfo);
1770     }
1771   else
1772     return false;
1773
1774   /* Scan all the operations in the loop and make sure they are
1775      vectorizable.  */
1776
1777   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1778   if (!ok)
1779     {
1780       if (dump_enabled_p ())
1781         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782                          "bad operation or unsupported loop bound.\n");
1783       return false;
1784     }
1785
1786   /* Decide whether we need to create an epilogue loop to handle
1787      remaining scalar iterations.  */
1788   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1789         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1790        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1791
1792   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1793       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1794     {
1795       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1796                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1797           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1798         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1799     }
1800   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1801            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1802                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1803                /* In case of versioning, check if the maximum number of
1804                   iterations is greater than th.  If they are identical,
1805                   the epilogue is unnecessary.  */
1806                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1807                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1808                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1809                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1810     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1811
1812   /* If an epilogue loop is required make sure we can create one.  */
1813   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1814       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1815     {
1816       if (dump_enabled_p ())
1817         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1818       if (!vect_can_advance_ivs_p (loop_vinfo)
1819           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1820                                            single_exit (LOOP_VINFO_LOOP
1821                                                          (loop_vinfo))))
1822         {
1823           if (dump_enabled_p ())
1824             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1825                              "not vectorized: can't create required "
1826                              "epilog loop\n");
1827           return false;
1828         }
1829     }
1830
1831   return true;
1832 }
1833
1834 /* Function vect_analyze_loop.
1835
1836    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1837    for it.  The different analyses will record information in the
1838    loop_vec_info struct.  */
1839 loop_vec_info
1840 vect_analyze_loop (struct loop *loop)
1841 {
1842   loop_vec_info loop_vinfo;
1843   unsigned int vector_sizes;
1844
1845   /* Autodetect first vector size we try.  */
1846   current_vector_size = 0;
1847   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1848
1849   if (dump_enabled_p ())
1850     dump_printf_loc (MSG_NOTE, vect_location,
1851                      "===== analyze_loop_nest =====\n");
1852
1853   if (loop_outer (loop)
1854       && loop_vec_info_for_loop (loop_outer (loop))
1855       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1856     {
1857       if (dump_enabled_p ())
1858         dump_printf_loc (MSG_NOTE, vect_location,
1859                          "outer-loop already vectorized.\n");
1860       return NULL;
1861     }
1862
1863   while (1)
1864     {
1865       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1866       loop_vinfo = vect_analyze_loop_form (loop);
1867       if (!loop_vinfo)
1868         {
1869           if (dump_enabled_p ())
1870             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1871                              "bad loop form.\n");
1872           return NULL;
1873         }
1874
1875       if (vect_analyze_loop_2 (loop_vinfo))
1876         {
1877           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1878
1879           return loop_vinfo;
1880         }
1881
1882       destroy_loop_vec_info (loop_vinfo, true);
1883
1884       vector_sizes &= ~current_vector_size;
1885       if (vector_sizes == 0
1886           || current_vector_size == 0)
1887         return NULL;
1888
1889       /* Try the next biggest vector size.  */
1890       current_vector_size = 1 << floor_log2 (vector_sizes);
1891       if (dump_enabled_p ())
1892         dump_printf_loc (MSG_NOTE, vect_location,
1893                          "***** Re-trying analysis with "
1894                          "vector size %d\n", current_vector_size);
1895     }
1896 }
1897
1898
1899 /* Function reduction_code_for_scalar_code
1900
1901    Input:
1902    CODE - tree_code of a reduction operations.
1903
1904    Output:
1905    REDUC_CODE - the corresponding tree-code to be used to reduce the
1906       vector of partial results into a single scalar result (which
1907       will also reside in a vector) or ERROR_MARK if the operation is
1908       a supported reduction operation, but does not have such tree-code.
1909
1910    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1911
1912 static bool
1913 reduction_code_for_scalar_code (enum tree_code code,
1914                                 enum tree_code *reduc_code)
1915 {
1916   switch (code)
1917     {
1918       case MAX_EXPR:
1919         *reduc_code = REDUC_MAX_EXPR;
1920         return true;
1921
1922       case MIN_EXPR:
1923         *reduc_code = REDUC_MIN_EXPR;
1924         return true;
1925
1926       case PLUS_EXPR:
1927         *reduc_code = REDUC_PLUS_EXPR;
1928         return true;
1929
1930       case MULT_EXPR:
1931       case MINUS_EXPR:
1932       case BIT_IOR_EXPR:
1933       case BIT_XOR_EXPR:
1934       case BIT_AND_EXPR:
1935         *reduc_code = ERROR_MARK;
1936         return true;
1937
1938       default:
1939        return false;
1940     }
1941 }
1942
1943
1944 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1945    STMT is printed with a message MSG. */
1946
1947 static void
1948 report_vect_op (int msg_type, gimple stmt, const char *msg)
1949 {
1950   dump_printf_loc (msg_type, vect_location, "%s", msg);
1951   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1952   dump_printf (msg_type, "\n");
1953 }
1954
1955
1956 /* Detect SLP reduction of the form:
1957
1958    #a1 = phi <a5, a0>
1959    a2 = operation (a1)
1960    a3 = operation (a2)
1961    a4 = operation (a3)
1962    a5 = operation (a4)
1963
1964    #a = phi <a5>
1965
1966    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1967    FIRST_STMT is the first reduction stmt in the chain
1968    (a2 = operation (a1)).
1969
1970    Return TRUE if a reduction chain was detected.  */
1971
1972 static bool
1973 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1974 {
1975   struct loop *loop = (gimple_bb (phi))->loop_father;
1976   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1977   enum tree_code code;
1978   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1979   stmt_vec_info use_stmt_info, current_stmt_info;
1980   tree lhs;
1981   imm_use_iterator imm_iter;
1982   use_operand_p use_p;
1983   int nloop_uses, size = 0, n_out_of_loop_uses;
1984   bool found = false;
1985
1986   if (loop != vect_loop)
1987     return false;
1988
1989   lhs = PHI_RESULT (phi);
1990   code = gimple_assign_rhs_code (first_stmt);
1991   while (1)
1992     {
1993       nloop_uses = 0;
1994       n_out_of_loop_uses = 0;
1995       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1996         {
1997           gimple use_stmt = USE_STMT (use_p);
1998           if (is_gimple_debug (use_stmt))
1999             continue;
2000
2001           /* Check if we got back to the reduction phi.  */
2002           if (use_stmt == phi)
2003             {
2004               loop_use_stmt = use_stmt;
2005               found = true;
2006               break;
2007             }
2008
2009           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2010             {
2011               if (vinfo_for_stmt (use_stmt)
2012                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2013                 {
2014                   loop_use_stmt = use_stmt;
2015                   nloop_uses++;
2016                 }
2017             }
2018            else
2019              n_out_of_loop_uses++;
2020
2021            /* There are can be either a single use in the loop or two uses in
2022               phi nodes.  */
2023            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2024              return false;
2025         }
2026
2027       if (found)
2028         break;
2029
2030       /* We reached a statement with no loop uses.  */
2031       if (nloop_uses == 0)
2032         return false;
2033
2034       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2035       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2036         return false;
2037
2038       if (!is_gimple_assign (loop_use_stmt)
2039           || code != gimple_assign_rhs_code (loop_use_stmt)
2040           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2041         return false;
2042
2043       /* Insert USE_STMT into reduction chain.  */
2044       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2045       if (current_stmt)
2046         {
2047           current_stmt_info = vinfo_for_stmt (current_stmt);
2048           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2049           GROUP_FIRST_ELEMENT (use_stmt_info)
2050             = GROUP_FIRST_ELEMENT (current_stmt_info);
2051         }
2052       else
2053         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2054
2055       lhs = gimple_assign_lhs (loop_use_stmt);
2056       current_stmt = loop_use_stmt;
2057       size++;
2058    }
2059
2060   if (!found || loop_use_stmt != phi || size < 2)
2061     return false;
2062
2063   /* Swap the operands, if needed, to make the reduction operand be the second
2064      operand.  */
2065   lhs = PHI_RESULT (phi);
2066   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2067   while (next_stmt)
2068     {
2069       if (gimple_assign_rhs2 (next_stmt) == lhs)
2070         {
2071           tree op = gimple_assign_rhs1 (next_stmt);
2072           gimple def_stmt = NULL;
2073
2074           if (TREE_CODE (op) == SSA_NAME)
2075             def_stmt = SSA_NAME_DEF_STMT (op);
2076
2077           /* Check that the other def is either defined in the loop
2078              ("vect_internal_def"), or it's an induction (defined by a
2079              loop-header phi-node).  */
2080           if (def_stmt
2081               && gimple_bb (def_stmt)
2082               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2083               && (is_gimple_assign (def_stmt)
2084                   || is_gimple_call (def_stmt)
2085                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2086                            == vect_induction_def
2087                   || (gimple_code (def_stmt) == GIMPLE_PHI
2088                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2089                                   == vect_internal_def
2090                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2091             {
2092               lhs = gimple_assign_lhs (next_stmt);
2093               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2094               continue;
2095             }
2096
2097           return false;
2098         }
2099       else
2100         {
2101           tree op = gimple_assign_rhs2 (next_stmt);
2102           gimple def_stmt = NULL;
2103
2104           if (TREE_CODE (op) == SSA_NAME)
2105             def_stmt = SSA_NAME_DEF_STMT (op);
2106
2107           /* Check that the other def is either defined in the loop
2108             ("vect_internal_def"), or it's an induction (defined by a
2109             loop-header phi-node).  */
2110           if (def_stmt
2111               && gimple_bb (def_stmt)
2112               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2113               && (is_gimple_assign (def_stmt)
2114                   || is_gimple_call (def_stmt)
2115                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2116                               == vect_induction_def
2117                   || (gimple_code (def_stmt) == GIMPLE_PHI
2118                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2119                                   == vect_internal_def
2120                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2121             {
2122               if (dump_enabled_p ())
2123                 {
2124                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2125                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2126                   dump_printf (MSG_NOTE, "\n");
2127                 }
2128
2129               swap_ssa_operands (next_stmt,
2130                                  gimple_assign_rhs1_ptr (next_stmt),
2131                                  gimple_assign_rhs2_ptr (next_stmt));
2132               update_stmt (next_stmt);
2133
2134               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2135                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2136             }
2137           else
2138             return false;
2139         }
2140
2141       lhs = gimple_assign_lhs (next_stmt);
2142       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2143     }
2144
2145   /* Save the chain for further analysis in SLP detection.  */
2146   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2147   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2148   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2149
2150   return true;
2151 }
2152
2153
2154 /* Function vect_is_simple_reduction_1
2155
2156    (1) Detect a cross-iteration def-use cycle that represents a simple
2157    reduction computation.  We look for the following pattern:
2158
2159    loop_header:
2160      a1 = phi < a0, a2 >
2161      a3 = ...
2162      a2 = operation (a3, a1)
2163
2164    or
2165
2166    a3 = ...
2167    loop_header:
2168      a1 = phi < a0, a2 >
2169      a2 = operation (a3, a1)
2170
2171    such that:
2172    1. operation is commutative and associative and it is safe to
2173       change the order of the computation (if CHECK_REDUCTION is true)
2174    2. no uses for a2 in the loop (a2 is used out of the loop)
2175    3. no uses of a1 in the loop besides the reduction operation
2176    4. no uses of a1 outside the loop.
2177
2178    Conditions 1,4 are tested here.
2179    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2180
2181    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2182    nested cycles, if CHECK_REDUCTION is false.
2183
2184    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2185    reductions:
2186
2187      a1 = phi < a0, a2 >
2188      inner loop (def of a3)
2189      a2 = phi < a3 >
2190
2191    If MODIFY is true it also tries to rework the code in-place to enable
2192    detection of more reduction patterns.  For the time being we rewrite
2193    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2194 */
2195
2196 static gimple
2197 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2198                             bool check_reduction, bool *double_reduc,
2199                             bool modify)
2200 {
2201   struct loop *loop = (gimple_bb (phi))->loop_father;
2202   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2203   edge latch_e = loop_latch_edge (loop);
2204   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2205   gimple def_stmt, def1 = NULL, def2 = NULL;
2206   enum tree_code orig_code, code;
2207   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2208   tree type;
2209   int nloop_uses;
2210   tree name;
2211   imm_use_iterator imm_iter;
2212   use_operand_p use_p;
2213   bool phi_def;
2214
2215   *double_reduc = false;
2216
2217   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2218      otherwise, we assume outer loop vectorization.  */
2219   gcc_assert ((check_reduction && loop == vect_loop)
2220               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2221
2222   name = PHI_RESULT (phi);
2223   /* ???  If there are no uses of the PHI result the inner loop reduction
2224      won't be detected as possibly double-reduction by vectorizable_reduction
2225      because that tries to walk the PHI arg from the preheader edge which
2226      can be constant.  See PR60382.  */
2227   if (has_zero_uses (name))
2228     return NULL;
2229   nloop_uses = 0;
2230   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2231     {
2232       gimple use_stmt = USE_STMT (use_p);
2233       if (is_gimple_debug (use_stmt))
2234         continue;
2235
2236       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2237         {
2238           if (dump_enabled_p ())
2239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240                              "intermediate value used outside loop.\n");
2241
2242           return NULL;
2243         }
2244
2245       if (vinfo_for_stmt (use_stmt)
2246           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2247         nloop_uses++;
2248       if (nloop_uses > 1)
2249         {
2250           if (dump_enabled_p ())
2251             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252                              "reduction used in loop.\n");
2253           return NULL;
2254         }
2255     }
2256
2257   if (TREE_CODE (loop_arg) != SSA_NAME)
2258     {
2259       if (dump_enabled_p ())
2260         {
2261           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262                            "reduction: not ssa_name: ");
2263           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2264           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2265         }
2266       return NULL;
2267     }
2268
2269   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2270   if (!def_stmt)
2271     {
2272       if (dump_enabled_p ())
2273         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274                          "reduction: no def_stmt.\n");
2275       return NULL;
2276     }
2277
2278   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2279     {
2280       if (dump_enabled_p ())
2281         {
2282           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2283           dump_printf (MSG_NOTE, "\n");
2284         }
2285       return NULL;
2286     }
2287
2288   if (is_gimple_assign (def_stmt))
2289     {
2290       name = gimple_assign_lhs (def_stmt);
2291       phi_def = false;
2292     }
2293   else
2294     {
2295       name = PHI_RESULT (def_stmt);
2296       phi_def = true;
2297     }
2298
2299   nloop_uses = 0;
2300   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2301     {
2302       gimple use_stmt = USE_STMT (use_p);
2303       if (is_gimple_debug (use_stmt))
2304         continue;
2305       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2306           && vinfo_for_stmt (use_stmt)
2307           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2308         nloop_uses++;
2309       if (nloop_uses > 1)
2310         {
2311           if (dump_enabled_p ())
2312             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313                              "reduction used in loop.\n");
2314           return NULL;
2315         }
2316     }
2317
2318   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2319      defined in the inner loop.  */
2320   if (phi_def)
2321     {
2322       op1 = PHI_ARG_DEF (def_stmt, 0);
2323
2324       if (gimple_phi_num_args (def_stmt) != 1
2325           || TREE_CODE (op1) != SSA_NAME)
2326         {
2327           if (dump_enabled_p ())
2328             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329                              "unsupported phi node definition.\n");
2330
2331           return NULL;
2332         }
2333
2334       def1 = SSA_NAME_DEF_STMT (op1);
2335       if (gimple_bb (def1)
2336           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2337           && loop->inner
2338           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2339           && is_gimple_assign (def1))
2340         {
2341           if (dump_enabled_p ())
2342             report_vect_op (MSG_NOTE, def_stmt,
2343                             "detected double reduction: ");
2344
2345           *double_reduc = true;
2346           return def_stmt;
2347         }
2348
2349       return NULL;
2350     }
2351
2352   code = orig_code = gimple_assign_rhs_code (def_stmt);
2353
2354   /* We can handle "res -= x[i]", which is non-associative by
2355      simply rewriting this into "res += -x[i]".  Avoid changing
2356      gimple instruction for the first simple tests and only do this
2357      if we're allowed to change code at all.  */
2358   if (code == MINUS_EXPR
2359       && modify
2360       && (op1 = gimple_assign_rhs1 (def_stmt))
2361       && TREE_CODE (op1) == SSA_NAME
2362       && SSA_NAME_DEF_STMT (op1) == phi)
2363     code = PLUS_EXPR;
2364
2365   if (check_reduction
2366       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2367     {
2368       if (dump_enabled_p ())
2369         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370                         "reduction: not commutative/associative: ");
2371       return NULL;
2372     }
2373
2374   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2375     {
2376       if (code != COND_EXPR)
2377         {
2378           if (dump_enabled_p ())
2379             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2380                             "reduction: not binary operation: ");
2381
2382           return NULL;
2383         }
2384
2385       op3 = gimple_assign_rhs1 (def_stmt);
2386       if (COMPARISON_CLASS_P (op3))
2387         {
2388           op4 = TREE_OPERAND (op3, 1);
2389           op3 = TREE_OPERAND (op3, 0);
2390         }
2391
2392       op1 = gimple_assign_rhs2 (def_stmt);
2393       op2 = gimple_assign_rhs3 (def_stmt);
2394
2395       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2396         {
2397           if (dump_enabled_p ())
2398             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2399                             "reduction: uses not ssa_names: ");
2400
2401           return NULL;
2402         }
2403     }
2404   else
2405     {
2406       op1 = gimple_assign_rhs1 (def_stmt);
2407       op2 = gimple_assign_rhs2 (def_stmt);
2408
2409       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2410         {
2411           if (dump_enabled_p ())
2412             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2413                             "reduction: uses not ssa_names: ");
2414
2415           return NULL;
2416         }
2417    }
2418
2419   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2420   if ((TREE_CODE (op1) == SSA_NAME
2421        && !types_compatible_p (type,TREE_TYPE (op1)))
2422       || (TREE_CODE (op2) == SSA_NAME
2423           && !types_compatible_p (type, TREE_TYPE (op2)))
2424       || (op3 && TREE_CODE (op3) == SSA_NAME
2425           && !types_compatible_p (type, TREE_TYPE (op3)))
2426       || (op4 && TREE_CODE (op4) == SSA_NAME
2427           && !types_compatible_p (type, TREE_TYPE (op4))))
2428     {
2429       if (dump_enabled_p ())
2430         {
2431           dump_printf_loc (MSG_NOTE, vect_location,
2432                            "reduction: multiple types: operation type: ");
2433           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2434           dump_printf (MSG_NOTE, ", operands types: ");
2435           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2436                              TREE_TYPE (op1));
2437           dump_printf (MSG_NOTE, ",");
2438           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2439                              TREE_TYPE (op2));
2440           if (op3)
2441             {
2442               dump_printf (MSG_NOTE, ",");
2443               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2444                                  TREE_TYPE (op3));
2445             }
2446
2447           if (op4)
2448             {
2449               dump_printf (MSG_NOTE, ",");
2450               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2451                                  TREE_TYPE (op4));
2452             }
2453           dump_printf (MSG_NOTE, "\n");
2454         }
2455
2456       return NULL;
2457     }
2458
2459   /* Check that it's ok to change the order of the computation.
2460      Generally, when vectorizing a reduction we change the order of the
2461      computation.  This may change the behavior of the program in some
2462      cases, so we need to check that this is ok.  One exception is when
2463      vectorizing an outer-loop: the inner-loop is executed sequentially,
2464      and therefore vectorizing reductions in the inner-loop during
2465      outer-loop vectorization is safe.  */
2466
2467   /* CHECKME: check for !flag_finite_math_only too?  */
2468   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2469       && check_reduction)
2470     {
2471       /* Changing the order of operations changes the semantics.  */
2472       if (dump_enabled_p ())
2473         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2474                         "reduction: unsafe fp math optimization: ");
2475       return NULL;
2476     }
2477   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2478            && check_reduction)
2479     {
2480       /* Changing the order of operations changes the semantics.  */
2481       if (dump_enabled_p ())
2482         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2483                         "reduction: unsafe int math optimization: ");
2484       return NULL;
2485     }
2486   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2487     {
2488       /* Changing the order of operations changes the semantics.  */
2489       if (dump_enabled_p ())
2490         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2491                         "reduction: unsafe fixed-point math optimization: ");
2492       return NULL;
2493     }
2494
2495   /* If we detected "res -= x[i]" earlier, rewrite it into
2496      "res += -x[i]" now.  If this turns out to be useless reassoc
2497      will clean it up again.  */
2498   if (orig_code == MINUS_EXPR)
2499     {
2500       tree rhs = gimple_assign_rhs2 (def_stmt);
2501       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2502       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2503                                                          rhs, NULL);
2504       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2505       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2506                                                           loop_info, NULL));
2507       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2508       gimple_assign_set_rhs2 (def_stmt, negrhs);
2509       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2510       update_stmt (def_stmt);
2511     }
2512
2513   /* Reduction is safe. We're dealing with one of the following:
2514      1) integer arithmetic and no trapv
2515      2) floating point arithmetic, and special flags permit this optimization
2516      3) nested cycle (i.e., outer loop vectorization).  */
2517   if (TREE_CODE (op1) == SSA_NAME)
2518     def1 = SSA_NAME_DEF_STMT (op1);
2519
2520   if (TREE_CODE (op2) == SSA_NAME)
2521     def2 = SSA_NAME_DEF_STMT (op2);
2522
2523   if (code != COND_EXPR
2524       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2525     {
2526       if (dump_enabled_p ())
2527         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2528       return NULL;
2529     }
2530
2531   /* Check that one def is the reduction def, defined by PHI,
2532      the other def is either defined in the loop ("vect_internal_def"),
2533      or it's an induction (defined by a loop-header phi-node).  */
2534
2535   if (def2 && def2 == phi
2536       && (code == COND_EXPR
2537           || !def1 || gimple_nop_p (def1)
2538           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2539           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2540               && (is_gimple_assign (def1)
2541                   || is_gimple_call (def1)
2542                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2543                       == vect_induction_def
2544                   || (gimple_code (def1) == GIMPLE_PHI
2545                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2546                           == vect_internal_def
2547                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2548     {
2549       if (dump_enabled_p ())
2550         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2551       return def_stmt;
2552     }
2553
2554   if (def1 && def1 == phi
2555       && (code == COND_EXPR
2556           || !def2 || gimple_nop_p (def2)
2557           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2558           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2559               && (is_gimple_assign (def2)
2560                   || is_gimple_call (def2)
2561                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2562                       == vect_induction_def
2563                   || (gimple_code (def2) == GIMPLE_PHI
2564                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2565                           == vect_internal_def
2566                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2567     {
2568       if (check_reduction)
2569         {
2570           /* Swap operands (just for simplicity - so that the rest of the code
2571              can assume that the reduction variable is always the last (second)
2572              argument).  */
2573           if (dump_enabled_p ())
2574             report_vect_op (MSG_NOTE, def_stmt,
2575                             "detected reduction: need to swap operands: ");
2576
2577           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2578                              gimple_assign_rhs2_ptr (def_stmt));
2579
2580           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2581             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2582         }
2583       else
2584         {
2585           if (dump_enabled_p ())
2586             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2587         }
2588
2589       return def_stmt;
2590     }
2591
2592   /* Try to find SLP reduction chain.  */
2593   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2594     {
2595       if (dump_enabled_p ())
2596         report_vect_op (MSG_NOTE, def_stmt,
2597                         "reduction: detected reduction chain: ");
2598
2599       return def_stmt;
2600     }
2601
2602   if (dump_enabled_p ())
2603     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2604                     "reduction: unknown pattern: ");
2605
2606   return NULL;
2607 }
2608
2609 /* Wrapper around vect_is_simple_reduction_1 that won't modify code
2610    in-place.  LOOP_INFO, PHI, CHECK_REDUCTION and DOUBLE_REDUC are
        forwarded unchanged and false is passed as the final argument; the
        result of vect_is_simple_reduction_1 is returned as is.  Arguments
        as there.  */
2611
2612 static gimple
2613 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2614                           bool check_reduction, bool *double_reduc)
2615 {
2616   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2617                                      double_reduc, false);
2618 }
2619
2620 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2621    in-place if it enables detection of more reductions (for example the
2622    "res -= x[i]" to "res += -x[i]" rewrite done there).  LOOP_INFO, PHI,
        CHECK_REDUCTION and DOUBLE_REDUC are forwarded unchanged and true is
        passed as the final argument; the result of
        vect_is_simple_reduction_1 is returned as is.  Arguments as there.  */
2623
2624 gimple
2625 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2626                           bool check_reduction, bool *double_reduc)
2627 {
2628   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2629                                      double_reduc, true);
2630 }
2631
2632 /* Calculate the cost of one scalar iteration of the loop.
        Walks every basic block of the loop described by LOOP_VINFO and sums
        a per-stmt cost for each assignment or call that is not skipped
        below: scalar_load or scalar_store for stmts with a data reference,
        scalar_stmt otherwise.  Stmts in blocks whose loop_father is the
        inner loop are weighted by a fixed placeholder factor.  Returns the
        accumulated cost.  */
2633 int
2634 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2635 {
2636   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2637   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2638   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2639   int innerloop_iters, i, stmt_cost;
2640
2641   /* Count statements in scalar loop.  Using this as scalar cost for a single
2642      iteration for now.
2643
2644      TODO: Add outer loop support.
2645
2646      TODO: Consider assigning different costs to different scalar
2647      statements.  */
2648
2649   /* FORNOW: assume a fixed trip count for the inner loop, if any.  */
2650   innerloop_iters = 1;
2651   if (loop->inner)
2652     innerloop_iters = 50; /* FIXME */
2653
2654   for (i = 0; i < nbbs; i++)
2655     {
2656       gimple_stmt_iterator si;
2657       basic_block bb = bbs[i];
2658
2659       if (bb->loop_father == loop->inner)
2660         factor = innerloop_iters;
2661       else
2662         factor = 1;
2663
2664       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2665         {
2666           gimple stmt = gsi_stmt (si);
2667           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2668
2669           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2670             continue;
2671
2672           /* Skip stmts that are not vectorized inside the loop.  */
2673           if (stmt_info
2674               && !STMT_VINFO_RELEVANT_P (stmt_info)
2675               && (!STMT_VINFO_LIVE_P (stmt_info)
2676                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2677               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2678             continue;
2679
               /* Reuse STMT_INFO instead of looking it up again.  The skip
                  test above allows for STMT_INFO being null, so do the same
                  here and cost such a stmt as a plain scalar stmt.  */
2680           if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
2681             {
2682               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2683                 stmt_cost = vect_get_stmt_cost (scalar_load);
2684               else
2685                 stmt_cost = vect_get_stmt_cost (scalar_store);
2686             }
2687           else
2688             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2689
2690           scalar_single_iter_cost += stmt_cost * factor;
2691         }
2692     }
2693   return scalar_single_iter_cost;
2694 }
2695
2696 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.
        The number of epilogue iterations is computed here and stored in
        *PEEL_ITERS_EPILOGUE.  SCALAR_SINGLE_ITER_COST is the cost of one
        scalar iteration; the peeled prologue and epilogue iterations are
        costed as that many scalar_stmt entries, recorded in
        PROLOGUE_COST_VEC and EPILOGUE_COST_VEC respectively.  Returns the
        sum of the values returned by the record_stmt_cost calls made
        here.  */
2697 int
2698 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2699                              int *peel_iters_epilogue,
2700                              int scalar_single_iter_cost,
2701                              stmt_vector_for_cost *prologue_cost_vec,
2702                              stmt_vector_for_cost *epilogue_cost_vec)
2703 {
2704   int retval = 0;
2705   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2706
2707   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2708     {
2709       *peel_iters_epilogue = vf/2;
2710       if (dump_enabled_p ())
2711         dump_printf_loc (MSG_NOTE, vect_location,
2712                          "cost model: epilogue peel iters set to vf/2 "
2713                          "because loop iterations are unknown .\n");
2714
2715       /* If peeled iterations are known but the number of scalar loop
2716          iterations is unknown, count a taken branch per peeled loop.  */
2717       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2718                                  NULL, 0, vect_prologue);
2719     }
2720   else
2721     {
2722       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
           /* Clamp the prologue peel count to NITERS.  */
2723       peel_iters_prologue = niters < peel_iters_prologue ?
2724                             niters : peel_iters_prologue;
2725       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2726       /* If we need to peel for gaps, but no peeling is required, we have to
2727          peel VF iterations.  */
2728       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2729         *peel_iters_epilogue = vf;
2730     }
2731
       /* Cost the peeled iterations linearly in the scalar iteration cost.  */
2732   if (peel_iters_prologue)
2733     retval += record_stmt_cost (prologue_cost_vec,
2734                                 peel_iters_prologue * scalar_single_iter_cost,
2735                                 scalar_stmt, NULL, 0, vect_prologue);
2736   if (*peel_iters_epilogue)
2737     retval += record_stmt_cost (epilogue_cost_vec,
2738                                 *peel_iters_epilogue * scalar_single_iter_cost,
2739                                 scalar_stmt, NULL, 0, vect_epilogue);
2740   return retval;
2741 }
2742
2743 /* Function vect_estimate_min_profitable_iters
2744
2745    Estimate the number of iterations required for the vector version of the
2746    loop to be profitable relative to the cost of the scalar version of the
2747    loop.  Nothing is returned; the results are stored through the pointers:
        *RET_MIN_PROFITABLE_NITERS receives the value dumped below as the
        "Runtime profitability threshold" and *RET_MIN_PROFITABLE_ESTIMATE
        the value dumped as the "Static estimate profitability threshold".
        Both are set to 0 when the cost model is disabled and to -1 when the
        vector version can never be profitable.  */
2748
2749 static void
2750 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2751                                     int *ret_min_profitable_niters,
2752                                     int *ret_min_profitable_estimate)
2753 {
2754   int min_profitable_iters;
2755   int min_profitable_estimate;
2756   int peel_iters_prologue;
2757   int peel_iters_epilogue;
2758   unsigned vec_inside_cost = 0;
2759   int vec_outside_cost = 0;
2760   unsigned vec_prologue_cost = 0;
2761   unsigned vec_epilogue_cost = 0;
2762   int scalar_single_iter_cost = 0;
2763   int scalar_outside_cost = 0;
2764   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2765   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2766   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2767
2768   /* Cost model disabled.  */
2769   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2770     {
2771       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2772       *ret_min_profitable_niters = 0;
2773       *ret_min_profitable_estimate = 0;
2774       return;
2775     }
2776
2777   /* Requires loop versioning tests to handle misalignment.  */
2778   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2779     {
2780       /*  FIXME: Make cost depend on complexity of individual check.  */
2781       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2782       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2783                             vect_prologue);
2784       dump_printf (MSG_NOTE,
2785                    "cost model: Adding cost of checks for loop "
2786                    "versioning to treat misalignment.\n");
2787     }
2788
2789   /* Requires loop versioning with alias checks.  */
2790   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2791     {
2792       /*  FIXME: Make cost depend on complexity of individual check.  */
2793       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2794       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2795                             vect_prologue);
2796       dump_printf (MSG_NOTE,
2797                    "cost model: Adding cost of checks for loop "
2798                    "versioning aliasing.\n");
2799     }
2800
2801   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2802       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2803     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2804                           vect_prologue);
2805
2806   /* Count statements in scalar loop.  Using this as scalar cost for a single
2807      iteration for now.
2808
2809      TODO: Add outer loop support.
2810
2811      TODO: Consider assigning different costs to different scalar
2812      statements.  */
2813
2814   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2815
2816   /* Add additional cost for the peeled instructions in prologue and epilogue
2817      loop.
2818
2819      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2820      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2821
2822      TODO: Build an expression that represents peel_iters for prologue and
2823      epilogue to be used in a run-time test.  */
2824
2825   if (npeel  < 0)
2826     {
2827       peel_iters_prologue = vf/2;
2828       dump_printf (MSG_NOTE, "cost model: "
2829                    "prologue peel iters set to vf/2.\n");
2830
2831       /* If peeling for alignment is unknown, loop bound of main loop becomes
2832          unknown.  */
2833       peel_iters_epilogue = vf/2;
2834       dump_printf (MSG_NOTE, "cost model: "
2835                    "epilogue peel iters set to vf/2 because "
2836                    "peeling for alignment is unknown.\n");
2837
2838       /* If peeled iterations are unknown, count a taken branch and a not taken
2839          branch per peeled loop. Even if scalar loop iterations are known,
2840          vector iterations are not known since peeled prologue iterations are
2841          not known. Hence guards remain the same.  */
2842       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2843                             NULL, 0, vect_prologue);
2844       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2845                             NULL, 0, vect_prologue);
2846       /* FORNOW: Don't attempt to pass individual scalar instructions to
2847          the model; just assume linear cost for scalar iterations.  */
2848       (void) add_stmt_cost (target_cost_data,
2849                             peel_iters_prologue * scalar_single_iter_cost,
2850                             scalar_stmt, NULL, 0, vect_prologue);
2851       (void) add_stmt_cost (target_cost_data,
2852                             peel_iters_epilogue * scalar_single_iter_cost,
2853                             scalar_stmt, NULL, 0, vect_epilogue);
2854     }
2855   else
2856     {
2857       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2858       stmt_info_for_cost *si;
2859       int j;
2860       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2861
2862       prologue_cost_vec.create (2);
2863       epilogue_cost_vec.create (2);
2864       peel_iters_prologue = npeel;
2865
2866       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2867                                           &peel_iters_epilogue,
2868                                           scalar_single_iter_cost,
2869                                           &prologue_cost_vec,
2870                                           &epilogue_cost_vec);
2871
2872       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2873         {
2874           struct _stmt_vec_info *stmt_info
2875             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2876           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2877                                 si->misalign, vect_prologue);
2878         }
2879
2880       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2881         {
2882           struct _stmt_vec_info *stmt_info
2883             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2884           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2885                                 si->misalign, vect_epilogue);
2886         }
2887
2888       prologue_cost_vec.release ();
2889       epilogue_cost_vec.release ();
2890     }
2891
2892   /* FORNOW: The scalar outside cost is incremented in one of the
2893      following ways:
2894
2895      1. The vectorizer checks for alignment and aliasing and generates
2896      a condition that allows dynamic vectorization.  A cost model
2897      check is ANDED with the versioning condition.  Hence scalar code
2898      path now has the added cost of the versioning check.
2899
2900        if (cost > th & versioning_check)
2901          jmp to vector code
2902
2903      Hence run-time scalar is incremented by not-taken branch cost.
2904
2905      2. The vectorizer then checks if a prologue is required.  If the
2906      cost model check was not done before during versioning, it has to
2907      be done before the prologue check.
2908
2909        if (cost <= th)
2910          prologue = scalar_iters
2911        if (prologue == 0)
2912          jmp to vector code
2913        else
2914          execute prologue
2915        if (prologue == num_iters)
2916          go to exit
2917
2918      Hence the run-time scalar cost is incremented by a taken branch,
2919      plus a not-taken branch, plus a taken branch cost.
2920
2921      3. The vectorizer then checks if an epilogue is required.  If the
2922      cost model check was not done before during prologue check, it
2923      has to be done with the epilogue check.
2924
2925        if (prologue == 0)
2926          jmp to vector code
2927        else
2928          execute prologue
2929        if (prologue == num_iters)
2930          go to exit
2931        vector code:
2932          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2933            jmp to epilogue
2934
2935      Hence the run-time scalar cost should be incremented by 2 taken
2936      branches.
2937
2938      TODO: The back end may reorder the BBS's differently and reverse
2939      conditions/branch directions.  Change the estimates below to
2940      something more reasonable.  */
2941
2942   /* If the number of iterations is known and we do not do versioning, we can
2943      decide whether to vectorize at compile time.  Hence the scalar version
2944      does not carry cost model guard costs.  */
2945   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2946       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2947       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2948     {
2949       /* Cost model check occurs at versioning.  */
2950       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2951           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2952         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2953       else
2954         {
2955           /* Cost model check occurs at prologue generation.  */
2956           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2957             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2958               + vect_get_stmt_cost (cond_branch_not_taken);
2959           /* Cost model check occurs at epilogue generation.  */
2960           else
2961             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2962         }
2963     }
2964
2965   /* Complete the target-specific cost calculations.  */
2966   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2967                &vec_inside_cost, &vec_epilogue_cost);
2968
2969   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2970
2971   /* Calculate number of iterations required to make the vector version
2972      profitable, relative to the loop bodies only.  The following condition
2973      must hold true:
2974      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2975      where
2976      SIC = scalar iteration cost, VIC = vector iteration cost,
2977      VOC = vector outside cost, VF = vectorization factor,
2978      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2979      SOC = scalar outside cost for run time cost model check.  */
2980
2981   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2982     {
2983       if (vec_outside_cost <= 0)
2984         min_profitable_iters = 1;
2985       else
2986         {
               /* NOTE(review): vec_inside_cost is unsigned, so the
                  expression below is evaluated in unsigned arithmetic; a
                  numerator that is mathematically negative would wrap before
                  the division.  Confirm this cannot happen here.  */
2987           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2988                                   - vec_inside_cost * peel_iters_prologue
2989                                   - vec_inside_cost * peel_iters_epilogue)
2990                                  / ((scalar_single_iter_cost * vf)
2991                                     - vec_inside_cost);
2992
               /* The division truncates; bump the result if the vector
                  version does not yet win at that iteration count.  */
2993           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2994               <= (((int) vec_inside_cost * min_profitable_iters)
2995                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2996             min_profitable_iters++;
2997         }
2998     }
2999   /* vector version will never be profitable.  */
3000   else
3001     {
3002       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3003         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3004                     "did not happen for a simd loop");
3005
3006       if (dump_enabled_p ())
3007         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3008                          "cost model: the vector iteration cost = %d "
3009                          "divided by the scalar iteration cost = %d "
3010                          "is greater or equal to the vectorization factor = %d"
3011                          ".\n",
3012                          vec_inside_cost, scalar_single_iter_cost, vf);
3013       *ret_min_profitable_niters = -1;
3014       *ret_min_profitable_estimate = -1;
3015       return;
3016     }
3017
3018   if (dump_enabled_p ())
3019     {
3020       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3021       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3022                    vec_inside_cost);
3023       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3024                    vec_prologue_cost);
3025       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3026                    vec_epilogue_cost);
3027       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3028                    scalar_single_iter_cost);
3029       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3030                    scalar_outside_cost);
3031       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3032                    vec_outside_cost);
3033       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3034                    peel_iters_prologue);
3035       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3036                    peel_iters_epilogue);
3037       dump_printf (MSG_NOTE,
3038                    "  Calculated minimum iters for profitability: %d\n",
3039                    min_profitable_iters);
3040       dump_printf (MSG_NOTE, "\n");
3041     }
3042
       /* Never report a threshold below one full vector iteration.  */
3043   min_profitable_iters =
3044         min_profitable_iters < vf ? vf : min_profitable_iters;
3045
3046   /* Because the condition we create is:
3047      if (niters <= min_profitable_iters)
3048        then skip the vectorized loop.  */
3049   min_profitable_iters--;
3050
3051   if (dump_enabled_p ())
3052     dump_printf_loc (MSG_NOTE, vect_location,
3053                      "  Runtime profitability threshold = %d\n",
3054                      min_profitable_iters);
3055
3056   *ret_min_profitable_niters = min_profitable_iters;
3057
3058   /* Calculate number of iterations required to make the vector version
3059      profitable, relative to the loop bodies only.
3060
3061      Non-vectorized variant is SIC * niters and it must win over vector
3062      variant on the expected loop trip count.  The following condition must hold true:
3063      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3064
3065   if (vec_outside_cost <= 0)
3066     min_profitable_estimate = 1;
3067   else
3068     {
3069       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3070                                  - vec_inside_cost * peel_iters_prologue
3071                                  - vec_inside_cost * peel_iters_epilogue)
3072                                  / ((scalar_single_iter_cost * vf)
3073                                    - vec_inside_cost);
3074     }
3075   min_profitable_estimate --;
3076   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
       /* Dump the static estimate itself, not the runtime threshold.  */
3077   if (dump_enabled_p ())
3078     dump_printf_loc (MSG_NOTE, vect_location,
3079                      "  Static estimate profitability threshold = %d\n",
3080                       min_profitable_estimate);
3081
3082   *ret_min_profitable_estimate = min_profitable_estimate;
3083 }
3084
3085
3086 /* TODO: The vect_model_*_cost and vectorizable_* functions are closely
3087    coupled.  Redesign them to avoid maintenance issues.  */
3088
3089 /* Function vect_model_reduction_cost.
3090
3091    Models cost for a reduction operation, including the vector ops
3092    generated within the strip-mine loop, the initial definition before
3093    the loop, and the epilogue code that must be generated.  */
3094
3095 static bool
3096 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3097                            int ncopies)
3098 {
3099   int prologue_cost = 0, epilogue_cost = 0;
3100   enum tree_code code;
3101   optab optab;
3102   tree vectype;
3103   gimple stmt, orig_stmt;
3104   tree reduction_op;
3105   enum machine_mode mode;
3106   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3107   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3108   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3109
3110   /* Cost of reduction op inside loop.  */
3111   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3112                                         stmt_info, 0, vect_body);
3113   stmt = STMT_VINFO_STMT (stmt_info);
3114
3115   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3116     {
3117     case GIMPLE_SINGLE_RHS:
3118       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3119       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3120       break;
3121     case GIMPLE_UNARY_RHS:
3122       reduction_op = gimple_assign_rhs1 (stmt);
3123       break;
3124     case GIMPLE_BINARY_RHS:
3125       reduction_op = gimple_assign_rhs2 (stmt);
3126       break;
3127     case GIMPLE_TERNARY_RHS:
3128       reduction_op = gimple_assign_rhs3 (stmt);
3129       break;
3130     default:
3131       gcc_unreachable ();
3132     }
3133
3134   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3135   if (!vectype)
3136     {
3137       if (dump_enabled_p ())
3138         {
3139           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140                            "unsupported data-type ");
3141           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3142                              TREE_TYPE (reduction_op));
3143           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3144         }
3145       return false;
3146    }
3147
3148   mode = TYPE_MODE (vectype);
3149   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3150
3151   if (!orig_stmt)
3152     orig_stmt = STMT_VINFO_STMT (stmt_info);
3153
3154   code = gimple_assign_rhs_code (orig_stmt);
3155
3156   /* Add in cost for initial definition.  */
3157   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3158                                   stmt_info, 0, vect_prologue);
3159
3160   /* Determine cost of epilogue code.
3161
3162      We have a reduction operator that will reduce the vector in one statement.
3163      Also requires scalar extract.  */
3164
3165   if (!nested_in_vect_loop_p (loop, orig_stmt))
3166     {
3167       if (reduc_code != ERROR_MARK)
3168         {
3169           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3170                                           stmt_info, 0, vect_epilogue);
3171           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3172                                           stmt_info, 0, vect_epilogue);
3173         }
3174       else
3175         {
3176           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3177           tree bitsize =
3178             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3179           int element_bitsize = tree_to_uhwi (bitsize);
3180           int nelements = vec_size_in_bits / element_bitsize;
3181
3182           optab = optab_for_tree_code (code, vectype, optab_default);
3183
3184           /* We have a whole vector shift available.  */
3185           if (VECTOR_MODE_P (mode)
3186               && optab_handler (optab, mode) != CODE_FOR_nothing
3187               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3188             {
3189               /* Final reduction via vector shifts and the reduction operator.
3190                  Also requires scalar extract.  */
3191               epilogue_cost += add_stmt_cost (target_cost_data,
3192                                               exact_log2 (nelements) * 2,
3193                                               vector_stmt, stmt_info, 0,
3194                                               vect_epilogue);
3195               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3196                                               vec_to_scalar, stmt_info, 0,
3197                                               vect_epilogue);
3198             }
3199           else
3200             /* Use extracts and reduction op for final reduction.  For N
3201                elements, we have N extracts and N-1 reduction ops.  */
3202             epilogue_cost += add_stmt_cost (target_cost_data,
3203                                             nelements + nelements - 1,
3204                                             vector_stmt, stmt_info, 0,
3205                                             vect_epilogue);
3206         }
3207     }
3208
3209   if (dump_enabled_p ())
3210     dump_printf (MSG_NOTE,
3211                  "vect_model_reduction_cost: inside_cost = %d, "
3212                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3213                  prologue_cost, epilogue_cost);
3214
3215   return true;
3216 }
3217
3218
3219 /* Function vect_model_induction_cost.
3220
3221    Models cost for induction operations.  */
3222
3223 static void
3224 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3225 {
3226   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3227   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3228   unsigned inside_cost, prologue_cost;
3229
3230   /* loop cost for vec_loop.  */
3231   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3232                                stmt_info, 0, vect_body);
3233
3234   /* prologue cost for vec_init and vec_step.  */
3235   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3236                                  stmt_info, 0, vect_prologue);
3237
3238   if (dump_enabled_p ())
3239     dump_printf_loc (MSG_NOTE, vect_location,
3240                      "vect_model_induction_cost: inside_cost = %d, "
3241                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3242 }
3243
3244
3245 /* Function get_initial_def_for_induction
3246
3247    Input:
3248    STMT - a stmt that performs an induction operation in the loop.
3249    IV_PHI - the initial value of the induction variable
3250
3251    Output:
3252    Return a vector variable, initialized with the first VF values of
3253    the induction variable.  E.g., for an iv with IV_PHI='X' and
3254    evolution S, for a vector of 4 units, we want to return:
3255    [X, X + S, X + 2*S, X + 3*S].  */
3256
3257 static tree
3258 get_initial_def_for_induction (gimple iv_phi)
3259 {
3260   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3261   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3262   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3263   tree vectype;
3264   int nunits;
3265   edge pe = loop_preheader_edge (loop);
3266   struct loop *iv_loop;
3267   basic_block new_bb;
3268   tree new_vec, vec_init, vec_step, t;
3269   tree new_var;
3270   tree new_name;
3271   gimple init_stmt, induction_phi, new_stmt;
3272   tree induc_def, vec_def, vec_dest;
3273   tree init_expr, step_expr;
3274   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3275   int i;
3276   int ncopies;
3277   tree expr;
3278   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3279   bool nested_in_vect_loop = false;
3280   gimple_seq stmts = NULL;
3281   imm_use_iterator imm_iter;
3282   use_operand_p use_p;
3283   gimple exit_phi;
3284   edge latch_e;
3285   tree loop_arg;
3286   gimple_stmt_iterator si;
3287   basic_block bb = gimple_bb (iv_phi);
3288   tree stepvectype;
3289   tree resvectype;
3290
3291   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3292   if (nested_in_vect_loop_p (loop, iv_phi))
3293     {
3294       nested_in_vect_loop = true;
3295       iv_loop = loop->inner;
3296     }
3297   else
3298     iv_loop = loop;
3299   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3300
3301   latch_e = loop_latch_edge (iv_loop);
3302   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3303
3304   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3305   gcc_assert (step_expr != NULL_TREE);
3306
3307   pe = loop_preheader_edge (iv_loop);
3308   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3309                                      loop_preheader_edge (iv_loop));
3310
3311   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3312   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3313   gcc_assert (vectype);
3314   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3315   ncopies = vf / nunits;
3316
3317   gcc_assert (phi_info);
3318   gcc_assert (ncopies >= 1);
3319
3320   /* Convert the step to the desired type.  */
3321   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3322                                                   step_expr),
3323                                     &stmts, true, NULL_TREE);
3324   if (stmts)
3325     {
3326       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3327       gcc_assert (!new_bb);
3328     }
3329
3330   /* Find the first insertion point in the BB.  */
3331   si = gsi_after_labels (bb);
3332
3333   /* Create the vector that holds the initial_value of the induction.  */
3334   if (nested_in_vect_loop)
3335     {
3336       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3337          been created during vectorization of previous stmts.  We obtain it
3338          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3339       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3340       /* If the initial value is not of proper type, convert it.  */
3341       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3342         {
3343           new_stmt = gimple_build_assign_with_ops
3344               (VIEW_CONVERT_EXPR,
3345                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3346                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3347           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3348           gimple_assign_set_lhs (new_stmt, vec_init);
3349           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3350                                                  new_stmt);
3351           gcc_assert (!new_bb);
3352           set_vinfo_for_stmt (new_stmt,
3353                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3354         }
3355     }
3356   else
3357     {
3358       vec<constructor_elt, va_gc> *v;
3359
3360       /* iv_loop is the loop to be vectorized. Create:
3361          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3362       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3363                                        vect_scalar_var, "var_");
3364       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3365                                                      init_expr),
3366                                        &stmts, false, new_var);
3367       if (stmts)
3368         {
3369           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3370           gcc_assert (!new_bb);
3371         }
3372
3373       vec_alloc (v, nunits);
3374       bool constant_p = is_gimple_min_invariant (new_name);
3375       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3376       for (i = 1; i < nunits; i++)
3377         {
3378           /* Create: new_name_i = new_name + step_expr  */
3379           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3380                                   new_name, step_expr);
3381           if (!is_gimple_min_invariant (new_name))
3382             {
3383               init_stmt = gimple_build_assign (new_var, new_name);
3384               new_name = make_ssa_name (new_var, init_stmt);
3385               gimple_assign_set_lhs (init_stmt, new_name);
3386               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3387               gcc_assert (!new_bb);
3388               if (dump_enabled_p ())
3389                 {
3390                   dump_printf_loc (MSG_NOTE, vect_location,
3391                                    "created new init_stmt: ");
3392                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3393                   dump_printf (MSG_NOTE, "\n");
3394                 }
3395               constant_p = false;
3396             }
3397           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3398         }
3399       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3400       if (constant_p)
3401         new_vec = build_vector_from_ctor (vectype, v);
3402       else
3403         new_vec = build_constructor (vectype, v);
3404       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3405     }
3406
3407
3408   /* Create the vector that holds the step of the induction.  */
3409   if (nested_in_vect_loop)
3410     /* iv_loop is nested in the loop to be vectorized. Generate:
3411        vec_step = [S, S, S, S]  */
3412     new_name = step_expr;
3413   else
3414     {
3415       /* iv_loop is the loop to be vectorized. Generate:
3416           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3417       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3418         {
3419           expr = build_int_cst (integer_type_node, vf);
3420           expr = fold_convert (TREE_TYPE (step_expr), expr);
3421         }
3422       else
3423         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3424       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3425                               expr, step_expr);
3426       if (TREE_CODE (step_expr) == SSA_NAME)
3427         new_name = vect_init_vector (iv_phi, new_name,
3428                                      TREE_TYPE (step_expr), NULL);
3429     }
3430
3431   t = unshare_expr (new_name);
3432   gcc_assert (CONSTANT_CLASS_P (new_name)
3433               || TREE_CODE (new_name) == SSA_NAME);
3434   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3435   gcc_assert (stepvectype);
3436   new_vec = build_vector_from_val (stepvectype, t);
3437   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3438
3439
3440   /* Create the following def-use cycle:
3441      loop prolog:
3442          vec_init = ...
3443          vec_step = ...
3444      loop:
3445          vec_iv = PHI <vec_init, vec_loop>
3446          ...
3447          STMT
3448          ...
3449          vec_loop = vec_iv + vec_step;  */
3450
3451   /* Create the induction-phi that defines the induction-operand.  */
3452   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3453   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3454   set_vinfo_for_stmt (induction_phi,
3455                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3456   induc_def = PHI_RESULT (induction_phi);
3457
3458   /* Create the iv update inside the loop  */
3459   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3460                                            induc_def, vec_step);
3461   vec_def = make_ssa_name (vec_dest, new_stmt);
3462   gimple_assign_set_lhs (new_stmt, vec_def);
3463   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3464   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3465                                                    NULL));
3466
3467   /* Set the arguments of the phi node:  */
3468   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3469   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3470                UNKNOWN_LOCATION);
3471
3472
3473   /* In case that vectorization factor (VF) is bigger than the number
3474      of elements that we can fit in a vectype (nunits), we have to generate
3475      more than one vector stmt - i.e - we need to "unroll" the
3476      vector stmt by a factor VF/nunits.  For more details see documentation
3477      in vectorizable_operation.  */
3478
3479   if (ncopies > 1)
3480     {
3481       stmt_vec_info prev_stmt_vinfo;
3482       /* FORNOW. This restriction should be relaxed.  */
3483       gcc_assert (!nested_in_vect_loop);
3484
3485       /* Create the vector that holds the step of the induction.  */
3486       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3487         {
3488           expr = build_int_cst (integer_type_node, nunits);
3489           expr = fold_convert (TREE_TYPE (step_expr), expr);
3490         }
3491       else
3492         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3493       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3494                               expr, step_expr);
3495       if (TREE_CODE (step_expr) == SSA_NAME)
3496         new_name = vect_init_vector (iv_phi, new_name,
3497                                      TREE_TYPE (step_expr), NULL);
3498       t = unshare_expr (new_name);
3499       gcc_assert (CONSTANT_CLASS_P (new_name)
3500                   || TREE_CODE (new_name) == SSA_NAME);
3501       new_vec = build_vector_from_val (stepvectype, t);
3502       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3503
3504       vec_def = induc_def;
3505       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3506       for (i = 1; i < ncopies; i++)
3507         {
3508           /* vec_i = vec_prev + vec_step  */
3509           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3510                                                    vec_def, vec_step);
3511           vec_def = make_ssa_name (vec_dest, new_stmt);
3512           gimple_assign_set_lhs (new_stmt, vec_def);
3513
3514           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3515           if (!useless_type_conversion_p (resvectype, vectype))
3516             {
3517               new_stmt = gimple_build_assign_with_ops
3518                   (VIEW_CONVERT_EXPR,
3519                    vect_get_new_vect_var (resvectype, vect_simple_var,
3520                                           "vec_iv_"),
3521                    build1 (VIEW_CONVERT_EXPR, resvectype,
3522                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3523               gimple_assign_set_lhs (new_stmt,
3524                                      make_ssa_name
3525                                        (gimple_assign_lhs (new_stmt), new_stmt));
3526               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3527             }
3528           set_vinfo_for_stmt (new_stmt,
3529                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3530           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3531           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3532         }
3533     }
3534
3535   if (nested_in_vect_loop)
3536     {
3537       /* Find the loop-closed exit-phi of the induction, and record
3538          the final vector of induction results:  */
3539       exit_phi = NULL;
3540       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3541         {
3542           gimple use_stmt = USE_STMT (use_p);
3543           if (is_gimple_debug (use_stmt))
3544             continue;
3545
3546           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3547             {
3548               exit_phi = use_stmt;
3549               break;
3550             }
3551         }
3552       if (exit_phi)
3553         {
3554           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3555           /* FORNOW. Currently not supporting the case that an inner-loop induction
3556              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3557           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3558                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3559
3560           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3561           if (dump_enabled_p ())
3562             {
3563               dump_printf_loc (MSG_NOTE, vect_location,
3564                                "vector of inductions after inner-loop:");
3565               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3566               dump_printf (MSG_NOTE, "\n");
3567             }
3568         }
3569     }
3570
3571
3572   if (dump_enabled_p ())
3573     {
3574       dump_printf_loc (MSG_NOTE, vect_location,
3575                        "transform induction: created def-use cycle: ");
3576       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3577       dump_printf (MSG_NOTE, "\n");
3578       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3579                         SSA_NAME_DEF_STMT (vec_def), 0);
3580       dump_printf (MSG_NOTE, "\n");
3581     }
3582
3583   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3584   if (!useless_type_conversion_p (resvectype, vectype))
3585     {
3586       new_stmt = gimple_build_assign_with_ops
3587          (VIEW_CONVERT_EXPR,
3588           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3589           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3590       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3591       gimple_assign_set_lhs (new_stmt, induc_def);
3592       si = gsi_after_labels (bb);
3593       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3594       set_vinfo_for_stmt (new_stmt,
3595                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3596       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3597         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3598     }
3599
3600   return induc_def;
3601 }
3602
3603
3604 /* Function get_initial_def_for_reduction
3605
3606    Input:
3607    STMT - a stmt that performs a reduction operation in the loop.
3608    INIT_VAL - the initial value of the reduction variable
3609
3610    Output:
3611    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3612         of the reduction (used for adjusting the epilog - see below).
3613    Return a vector variable, initialized according to the operation that STMT
3614         performs. This vector will be used as the initial value of the
3615         vector of partial results.
3616
3617    Option1 (adjust in epilog): Initialize the vector as follows:
3618      add/bit or/xor:    [0,0,...,0,0]
3619      mult/bit and:      [1,1,...,1,1]
3620      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3621    and when necessary (e.g. add/mult case) let the caller know
3622    that it needs to adjust the result by init_val.
3623
3624    Option2: Initialize the vector as follows:
3625      add/bit or/xor:    [init_val,0,0,...,0]
3626      mult/bit and:      [init_val,1,1,...,1]
3627      min/max/cond_expr: [init_val,init_val,...,init_val]
3628    and no adjustments are needed.
3629
3630    For example, for the following code:
3631
3632    s = init_val;
3633    for (i=0;i<n;i++)
3634      s = s + a[i];
3635
3636    STMT is 's = s + a[i]', and the reduction variable is 's'.
3637    For a vector of 4 units, we want to return either [0,0,0,init_val],
3638    or [0,0,0,0] and let the caller know that it needs to adjust
3639    the result at the end by 'init_val'.
3640
3641    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3642    initialization vector is simpler (same element in all entries), if
3643    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3644
3645    A cost model should help decide between these two schemes.  */
3646
3647 tree
3648 get_initial_def_for_reduction (gimple stmt, tree init_val,
3649                                tree *adjustment_def)
3650 {
3651   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3652   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3653   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3654   tree scalar_type = TREE_TYPE (init_val);
3655   tree vectype = get_vectype_for_scalar_type (scalar_type);
3656   int nunits;
3657   enum tree_code code = gimple_assign_rhs_code (stmt);
3658   tree def_for_init;
3659   tree init_def;
3660   tree *elts;
3661   int i;
3662   bool nested_in_vect_loop = false;
3663   tree init_value;
3664   REAL_VALUE_TYPE real_init_val = dconst0;
3665   int int_init_val = 0;
3666   gimple def_stmt = NULL;
3667
3668   gcc_assert (vectype);
3669   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3670
3671   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3672               || SCALAR_FLOAT_TYPE_P (scalar_type));
3673
3674   if (nested_in_vect_loop_p (loop, stmt))
3675     nested_in_vect_loop = true;
3676   else
3677     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3678
3679   /* In case of double reduction we only create a vector variable to be put
3680      in the reduction phi node.  The actual statement creation is done in
3681      vect_create_epilog_for_reduction.  */
3682   if (adjustment_def && nested_in_vect_loop
3683       && TREE_CODE (init_val) == SSA_NAME
3684       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3685       && gimple_code (def_stmt) == GIMPLE_PHI
3686       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3687       && vinfo_for_stmt (def_stmt)
3688       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3689           == vect_double_reduction_def)
3690     {
3691       *adjustment_def = NULL;
3692       return vect_create_destination_var (init_val, vectype);
3693     }
3694
3695   if (TREE_CONSTANT (init_val))
3696     {
3697       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3698         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3699       else
3700         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3701     }
3702   else
3703     init_value = init_val;
3704
3705   switch (code)
3706     {
3707       case WIDEN_SUM_EXPR:
3708       case DOT_PROD_EXPR:
3709       case SAD_EXPR:
3710       case PLUS_EXPR:
3711       case MINUS_EXPR:
3712       case BIT_IOR_EXPR:
3713       case BIT_XOR_EXPR:
3714       case MULT_EXPR:
3715       case BIT_AND_EXPR:
3716         /* ADJUSMENT_DEF is NULL when called from
3717            vect_create_epilog_for_reduction to vectorize double reduction.  */
3718         if (adjustment_def)
3719           {
3720             if (nested_in_vect_loop)
3721               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3722                                                               NULL);
3723             else
3724               *adjustment_def = init_val;
3725           }
3726
3727         if (code == MULT_EXPR)
3728           {
3729             real_init_val = dconst1;
3730             int_init_val = 1;
3731           }
3732
3733         if (code == BIT_AND_EXPR)
3734           int_init_val = -1;
3735
3736         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3737           def_for_init = build_real (scalar_type, real_init_val);
3738         else
3739           def_for_init = build_int_cst (scalar_type, int_init_val);
3740
3741         /* Create a vector of '0' or '1' except the first element.  */
3742         elts = XALLOCAVEC (tree, nunits);
3743         for (i = nunits - 2; i >= 0; --i)
3744           elts[i + 1] = def_for_init;
3745
3746         /* Option1: the first element is '0' or '1' as well.  */
3747         if (adjustment_def)
3748           {
3749             elts[0] = def_for_init;
3750             init_def = build_vector (vectype, elts);
3751             break;
3752           }
3753
3754         /* Option2: the first element is INIT_VAL.  */
3755         elts[0] = init_val;
3756         if (TREE_CONSTANT (init_val))
3757           init_def = build_vector (vectype, elts);
3758         else
3759           {
3760             vec<constructor_elt, va_gc> *v;
3761             vec_alloc (v, nunits);
3762             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3763             for (i = 1; i < nunits; ++i)
3764               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3765             init_def = build_constructor (vectype, v);
3766           }
3767
3768         break;
3769
3770       case MIN_EXPR:
3771       case MAX_EXPR:
3772       case COND_EXPR:
3773         if (adjustment_def)
3774           {
3775             *adjustment_def = NULL_TREE;
3776             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3777             break;
3778           }
3779
3780         init_def = build_vector_from_val (vectype, init_value);
3781         break;
3782
3783       default:
3784         gcc_unreachable ();
3785     }
3786
3787   return init_def;
3788 }
3789
3790
3791 /* Function vect_create_epilog_for_reduction
3792
3793    Create code at the loop-epilog to finalize the result of a reduction
3794    computation.
3795
3796    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3797      reduction statements.
3798    STMT is the scalar reduction stmt that is being vectorized.
3799    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3800      number of elements that we can fit in a vectype (nunits).  In this case
3801      we have to generate more than one vector stmt - i.e - we need to "unroll"
3802      the vector stmt by a factor VF/nunits.  For more details see documentation
3803      in vectorizable_operation.
3804    REDUC_CODE is the tree-code for the epilog reduction.
3805    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3806      computation.
3807    REDUC_INDEX is the index of the operand in the right hand side of the
3808      statement that is defined by REDUCTION_PHI.
3809    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3810    SLP_NODE is an SLP node containing a group of reduction statements. The
3811      first one in this group is STMT.
3812
3813    This function:
3814    1. Creates the reduction def-use cycles: sets the arguments for
3815       REDUCTION_PHIS:
3816       The loop-entry argument is the vectorized initial-value of the reduction.
3817       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3818       sums.
3819    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3820       by applying the operation specified by REDUC_CODE if available, or by
3821       other means (whole-vector shifts or a scalar loop).
3822       The function also creates a new phi node at the loop exit to preserve
3823       loop-closed form, as illustrated below.
3824
3825      The flow at the entry to this function:
3826
3827         loop:
3828           vec_def = phi <null, null>            # REDUCTION_PHI
3829           VECT_DEF = vector_stmt                # vectorized form of STMT
3830           s_loop = scalar_stmt                  # (scalar) STMT
3831         loop_exit:
3832           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3833           use <s_out0>
3834           use <s_out0>
3835
3836      The above is transformed by this function into:
3837
3838         loop:
3839           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3840           VECT_DEF = vector_stmt                # vectorized form of STMT
3841           s_loop = scalar_stmt                  # (scalar) STMT
3842         loop_exit:
3843           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3844           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3845           v_out2 = reduce <v_out1>
3846           s_out3 = extract_field <v_out2, 0>
3847           s_out4 = adjust_result <s_out3>
3848           use <s_out4>
3849           use <s_out4>
3850 */
3851
3852 static void
3853 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3854                                   int ncopies, enum tree_code reduc_code,
3855                                   vec<gimple> reduction_phis,
3856                                   int reduc_index, bool double_reduc,
3857                                   slp_tree slp_node)
3858 {
3859   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3860   stmt_vec_info prev_phi_info;
3861   tree vectype;
3862   enum machine_mode mode;
3863   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3864   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3865   basic_block exit_bb;
3866   tree scalar_dest;
3867   tree scalar_type;
3868   gimple new_phi = NULL, phi;
3869   gimple_stmt_iterator exit_gsi;
3870   tree vec_dest;
3871   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3872   gimple epilog_stmt = NULL;
3873   enum tree_code code = gimple_assign_rhs_code (stmt);
3874   gimple exit_phi;
3875   tree bitsize, bitpos;
3876   tree adjustment_def = NULL;
3877   tree vec_initial_def = NULL;
3878   tree reduction_op, expr, def;
3879   tree orig_name, scalar_result;
3880   imm_use_iterator imm_iter, phi_imm_iter;
3881   use_operand_p use_p, phi_use_p;
3882   bool extract_scalar_result = false;
3883   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3884   bool nested_in_vect_loop = false;
3885   auto_vec<gimple> new_phis;
3886   auto_vec<gimple> inner_phis;
3887   enum vect_def_type dt = vect_unknown_def_type;
3888   int j, i;
3889   auto_vec<tree> scalar_results;
3890   unsigned int group_size = 1, k, ratio;
3891   auto_vec<tree> vec_initial_defs;
3892   auto_vec<gimple> phis;
3893   bool slp_reduc = false;
3894   tree new_phi_result;
3895   gimple inner_phi = NULL;
3896
3897   if (slp_node)
3898     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3899
3900   if (nested_in_vect_loop_p (loop, stmt))
3901     {
3902       outer_loop = loop;
3903       loop = loop->inner;
3904       nested_in_vect_loop = true;
3905       gcc_assert (!slp_node);
3906     }
3907
3908   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3909     {
3910     case GIMPLE_SINGLE_RHS:
3911       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3912                   == ternary_op);
3913       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3914       break;
3915     case GIMPLE_UNARY_RHS:
3916       reduction_op = gimple_assign_rhs1 (stmt);
3917       break;
3918     case GIMPLE_BINARY_RHS:
3919       reduction_op = reduc_index ?
3920                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3921       break;
3922     case GIMPLE_TERNARY_RHS:
3923       reduction_op = gimple_op (stmt, reduc_index + 1);
3924       break;
3925     default:
3926       gcc_unreachable ();
3927     }
3928
3929   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3930   gcc_assert (vectype);
3931   mode = TYPE_MODE (vectype);
3932
3933   /* 1. Create the reduction def-use cycle:
3934      Set the arguments of REDUCTION_PHIS, i.e., transform
3935
3936         loop:
3937           vec_def = phi <null, null>            # REDUCTION_PHI
3938           VECT_DEF = vector_stmt                # vectorized form of STMT
3939           ...
3940
3941      into:
3942
3943         loop:
3944           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3945           VECT_DEF = vector_stmt                # vectorized form of STMT
3946           ...
3947
3948      (in case of SLP, do it for all the phis). */
3949
3950   /* Get the loop-entry arguments.  */
3951   if (slp_node)
3952     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3953                        NULL, slp_node, reduc_index);
3954   else
3955     {
3956       vec_initial_defs.create (1);
3957      /* For the case of reduction, vect_get_vec_def_for_operand returns
3958         the scalar def before the loop, that defines the initial value
3959         of the reduction variable.  */
3960       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3961                                                       &adjustment_def);
3962       vec_initial_defs.quick_push (vec_initial_def);
3963     }
3964
3965   /* Set phi nodes arguments.  */
3966   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3967     {
3968       tree vec_init_def, def;
3969       gimple_seq stmts;
3970       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
3971                                            true, NULL_TREE);
3972       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3973       def = vect_defs[i];
3974       for (j = 0; j < ncopies; j++)
3975         {
3976           /* Set the loop-entry arg of the reduction-phi.  */
3977           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3978                        UNKNOWN_LOCATION);
3979
3980           /* Set the loop-latch arg for the reduction-phi.  */
3981           if (j > 0)
3982             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3983
3984           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3985
3986           if (dump_enabled_p ())
3987             {
3988               dump_printf_loc (MSG_NOTE, vect_location,
3989                                "transform reduction: created def-use cycle: ");
3990               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3991               dump_printf (MSG_NOTE, "\n");
3992               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3993               dump_printf (MSG_NOTE, "\n");
3994             }
3995
3996           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3997         }
3998     }
3999
4000   /* 2. Create epilog code.
4001         The reduction epilog code operates across the elements of the vector
4002         of partial results computed by the vectorized loop.
4003         The reduction epilog code consists of:
4004
4005         step 1: compute the scalar result in a vector (v_out2)
4006         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4007         step 3: adjust the scalar result (s_out3) if needed.
4008
4009         Step 1 can be accomplished using one of the following three schemes:
4010           (scheme 1) using reduc_code, if available.
4011           (scheme 2) using whole-vector shifts, if available.
4012           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4013                      combined.
4014
4015           The overall epilog code looks like this:
4016
4017           s_out0 = phi <s_loop>         # original EXIT_PHI
4018           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4019           v_out2 = reduce <v_out1>              # step 1
4020           s_out3 = extract_field <v_out2, 0>    # step 2
4021           s_out4 = adjust_result <s_out3>       # step 3
4022
4023           (step 3 is optional, and steps 1 and 2 may be combined).
4024           Lastly, the uses of s_out0 are replaced by s_out4.  */
4025
4026
4027   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4028          v_out1 = phi <VECT_DEF>
4029          Store them in NEW_PHIS.  */
4030
4031   exit_bb = single_exit (loop)->dest;
4032   prev_phi_info = NULL;
4033   new_phis.create (vect_defs.length ());
4034   FOR_EACH_VEC_ELT (vect_defs, i, def)
4035     {
4036       for (j = 0; j < ncopies; j++)
4037         {
4038           tree new_def = copy_ssa_name (def, NULL);
4039           phi = create_phi_node (new_def, exit_bb);
4040           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4041           if (j == 0)
4042             new_phis.quick_push (phi);
4043           else
4044             {
4045               def = vect_get_vec_def_for_stmt_copy (dt, def);
4046               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4047             }
4048
4049           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4050           prev_phi_info = vinfo_for_stmt (phi);
4051         }
4052     }
4053
4054   /* The epilogue is created for the outer-loop, i.e., for the loop being
4055      vectorized.  Create exit phis for the outer loop.  */
4056   if (double_reduc)
4057     {
4058       loop = outer_loop;
4059       exit_bb = single_exit (loop)->dest;
4060       inner_phis.create (vect_defs.length ());
4061       FOR_EACH_VEC_ELT (new_phis, i, phi)
4062         {
4063           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4064           gimple outer_phi = create_phi_node (new_result, exit_bb);
4065           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4066                            PHI_RESULT (phi));
4067           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4068                                                             loop_vinfo, NULL));
4069           inner_phis.quick_push (phi);
4070           new_phis[i] = outer_phi;
4071           prev_phi_info = vinfo_for_stmt (outer_phi);
4072           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4073             {
4074               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4075               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4076               outer_phi = create_phi_node (new_result, exit_bb);
4077               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4078                                PHI_RESULT (phi));
4079               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4080                                                         loop_vinfo, NULL));
4081               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4082               prev_phi_info = vinfo_for_stmt (outer_phi);
4083             }
4084         }
4085     }
4086
4087   exit_gsi = gsi_after_labels (exit_bb);
4088
4089   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4090          (i.e. when reduc_code is not available) and in the final adjustment
4091          code (if needed).  Also get the original scalar reduction variable as
4092          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4093          represents a reduction pattern), the tree-code and scalar-def are
4094          taken from the original stmt that the pattern-stmt (STMT) replaces.
4095          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4096          are taken from STMT.  */
4097
4098   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4099   if (!orig_stmt)
4100     {
4101       /* Regular reduction  */
4102       orig_stmt = stmt;
4103     }
4104   else
4105     {
4106       /* Reduction pattern  */
4107       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4108       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4109       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4110     }
4111
4112   code = gimple_assign_rhs_code (orig_stmt);
4113   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4114      partial results are added and not subtracted.  */
4115   if (code == MINUS_EXPR)
4116     code = PLUS_EXPR;
4117
4118   scalar_dest = gimple_assign_lhs (orig_stmt);
4119   scalar_type = TREE_TYPE (scalar_dest);
4120   scalar_results.create (group_size);
4121   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4122   bitsize = TYPE_SIZE (scalar_type);
4123
4124   /* In case this is a reduction in an inner-loop while vectorizing an outer
4125      loop - we don't need to extract a single scalar result at the end of the
4126      inner-loop (unless it is double reduction, i.e., the use of reduction is
4127      outside the outer-loop).  The final vector of partial results will be used
4128      in the vectorized outer-loop, or reduced to a scalar result at the end of
4129      the outer-loop.  */
4130   if (nested_in_vect_loop && !double_reduc)
4131     goto vect_finalize_reduction;
4132
4133   /* SLP reduction without reduction chain, e.g.,
4134      # a1 = phi <a2, a0>
4135      # b1 = phi <b2, b0>
4136      a2 = operation (a1)
4137      b2 = operation (b1)  */
4138   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4139
4140   /* In case of reduction chain, e.g.,
4141      # a1 = phi <a3, a0>
4142      a2 = operation (a1)
4143      a3 = operation (a2),
4144
4145      we may end up with more than one vector result.  Here we reduce them to
4146      one vector.  */
4147   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4148     {
4149       tree first_vect = PHI_RESULT (new_phis[0]);
4150       tree tmp;
4151       gimple new_vec_stmt = NULL;
4152
4153       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4154       for (k = 1; k < new_phis.length (); k++)
4155         {
4156           gimple next_phi = new_phis[k];
4157           tree second_vect = PHI_RESULT (next_phi);
4158
4159           tmp = build2 (code, vectype,  first_vect, second_vect);
4160           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4161           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4162           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4163           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4164         }
4165
4166       new_phi_result = first_vect;
4167       if (new_vec_stmt)
4168         {
4169           new_phis.truncate (0);
4170           new_phis.safe_push (new_vec_stmt);
4171         }
4172     }
4173   else
4174     new_phi_result = PHI_RESULT (new_phis[0]);
4175
4176   /* 2.3 Create the reduction code, using one of the three schemes described
4177          above. In SLP we simply need to extract all the elements from the
4178          vector (without reducing them), so we use scalar shifts.  */
4179   if (reduc_code != ERROR_MARK && !slp_reduc)
4180     {
4181       tree tmp;
4182
4183       /*** Case 1:  Create:
4184            v_out2 = reduc_expr <v_out1>  */
4185
4186       if (dump_enabled_p ())
4187         dump_printf_loc (MSG_NOTE, vect_location,
4188                          "Reduce using direct vector reduction.\n");
4189
4190       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4191       tmp = build1 (reduc_code, vectype, new_phi_result);
4192       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4193       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4194       gimple_assign_set_lhs (epilog_stmt, new_temp);
4195       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4196
4197       extract_scalar_result = true;
4198     }
4199   else
4200     {
4201       enum tree_code shift_code = ERROR_MARK;
4202       bool have_whole_vector_shift = true;
4203       int bit_offset;
4204       int element_bitsize = tree_to_uhwi (bitsize);
4205       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4206       tree vec_temp;
4207
4208       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4209         shift_code = VEC_RSHIFT_EXPR;
4210       else
4211         have_whole_vector_shift = false;
4212
4213       /* Regardless of whether we have a whole vector shift, if we're
4214          emulating the operation via tree-vect-generic, we don't want
4215          to use it.  Only the first round of the reduction is likely
4216          to still be profitable via emulation.  */
4217       /* ??? It might be better to emit a reduction tree code here, so that
4218          tree-vect-generic can expand the first round via bit tricks.  */
4219       if (!VECTOR_MODE_P (mode))
4220         have_whole_vector_shift = false;
4221       else
4222         {
4223           optab optab = optab_for_tree_code (code, vectype, optab_default);
4224           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4225             have_whole_vector_shift = false;
4226         }
4227
4228       if (have_whole_vector_shift && !slp_reduc)
4229         {
4230           /*** Case 2: Create:
4231              for (offset = VS/2; offset >= element_size; offset/=2)
4232                 {
4233                   Create:  va' = vec_shift <va, offset>
4234                   Create:  va = vop <va, va'>
4235                 }  */
4236
4237           if (dump_enabled_p ())
4238             dump_printf_loc (MSG_NOTE, vect_location,
4239                              "Reduce using vector shifts\n");
4240
4241           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4242           new_temp = new_phi_result;
4243           for (bit_offset = vec_size_in_bits/2;
4244                bit_offset >= element_bitsize;
4245                bit_offset /= 2)
4246             {
4247               tree bitpos = size_int (bit_offset);
4248
4249               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4250                                                vec_dest, new_temp, bitpos);
4251               new_name = make_ssa_name (vec_dest, epilog_stmt);
4252               gimple_assign_set_lhs (epilog_stmt, new_name);
4253               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4254
4255               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4256                                                           new_name, new_temp);
4257               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4258               gimple_assign_set_lhs (epilog_stmt, new_temp);
4259               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4260             }
4261
4262           extract_scalar_result = true;
4263         }
4264       else
4265         {
4266           tree rhs;
4267
4268           /*** Case 3: Create:
4269              s = extract_field <v_out2, 0>
4270              for (offset = element_size;
4271                   offset < vector_size;
4272                   offset += element_size;)
4273                {
4274                  Create:  s' = extract_field <v_out2, offset>
4275                  Create:  s = op <s, s'>  // For non SLP cases
4276                }  */
4277
4278           if (dump_enabled_p ())
4279             dump_printf_loc (MSG_NOTE, vect_location,
4280                              "Reduce using scalar code.\n");
4281
4282           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4283           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4284             {
4285               if (gimple_code (new_phi) == GIMPLE_PHI)
4286                 vec_temp = PHI_RESULT (new_phi);
4287               else
4288                 vec_temp = gimple_assign_lhs (new_phi);
4289               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4290                             bitsize_zero_node);
4291               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4292               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4293               gimple_assign_set_lhs (epilog_stmt, new_temp);
4294               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4295
4296               /* In SLP we don't need to apply reduction operation, so we just
4297                  collect s' values in SCALAR_RESULTS.  */
4298               if (slp_reduc)
4299                 scalar_results.safe_push (new_temp);
4300
4301               for (bit_offset = element_bitsize;
4302                    bit_offset < vec_size_in_bits;
4303                    bit_offset += element_bitsize)
4304                 {
4305                   tree bitpos = bitsize_int (bit_offset);
4306                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4307                                      bitsize, bitpos);
4308
4309                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4310                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4311                   gimple_assign_set_lhs (epilog_stmt, new_name);
4312                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4313
4314                   if (slp_reduc)
4315                     {
4316                       /* In SLP we don't need to apply reduction operation, so
4317                          we just collect s' values in SCALAR_RESULTS.  */
4318                       new_temp = new_name;
4319                       scalar_results.safe_push (new_name);
4320                     }
4321                   else
4322                     {
4323                       epilog_stmt = gimple_build_assign_with_ops (code,
4324                                           new_scalar_dest, new_name, new_temp);
4325                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4326                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4327                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4328                     }
4329                 }
4330             }
4331
4332           /* The only case where we need to reduce scalar results in SLP, is
4333              unrolling.  If the size of SCALAR_RESULTS is greater than
4334              GROUP_SIZE, we reduce them combining elements modulo
4335              GROUP_SIZE.  */
4336           if (slp_reduc)
4337             {
4338               tree res, first_res, new_res;
4339               gimple new_stmt;
4340
4341               /* Reduce multiple scalar results in case of SLP unrolling.  */
4342               for (j = group_size; scalar_results.iterate (j, &res);
4343                    j++)
4344                 {
4345                   first_res = scalar_results[j % group_size];
4346                   new_stmt = gimple_build_assign_with_ops (code,
4347                                               new_scalar_dest, first_res, res);
4348                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4349                   gimple_assign_set_lhs (new_stmt, new_res);
4350                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4351                   scalar_results[j % group_size] = new_res;
4352                 }
4353             }
4354           else
4355             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4356             scalar_results.safe_push (new_temp);
4357
4358           extract_scalar_result = false;
4359         }
4360     }
4361
4362   /* 2.4  Extract the final scalar result.  Create:
4363           s_out3 = extract_field <v_out2, bitpos>  */
4364
4365   if (extract_scalar_result)
4366     {
4367       tree rhs;
4368
4369       if (dump_enabled_p ())
4370         dump_printf_loc (MSG_NOTE, vect_location,
4371                          "extract scalar result\n");
4372
4373       if (BYTES_BIG_ENDIAN)
4374         bitpos = size_binop (MULT_EXPR,
4375                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4376                              TYPE_SIZE (scalar_type));
4377       else
4378         bitpos = bitsize_zero_node;
4379
4380       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4381       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4382       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4383       gimple_assign_set_lhs (epilog_stmt, new_temp);
4384       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4385       scalar_results.safe_push (new_temp);
4386     }
4387
4388 vect_finalize_reduction:
4389
4390   if (double_reduc)
4391     loop = loop->inner;
4392
4393   /* 2.5 Adjust the final result by the initial value of the reduction
4394          variable. (When such adjustment is not needed, then
4395          'adjustment_def' is zero).  For example, if code is PLUS we create:
4396          new_temp = loop_exit_def + adjustment_def  */
4397
4398   if (adjustment_def)
4399     {
4400       gcc_assert (!slp_reduc);
4401       if (nested_in_vect_loop)
4402         {
4403           new_phi = new_phis[0];
4404           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4405           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4406           new_dest = vect_create_destination_var (scalar_dest, vectype);
4407         }
4408       else
4409         {
4410           new_temp = scalar_results[0];
4411           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4412           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4413           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4414         }
4415
4416       epilog_stmt = gimple_build_assign (new_dest, expr);
4417       new_temp = make_ssa_name (new_dest, epilog_stmt);
4418       gimple_assign_set_lhs (epilog_stmt, new_temp);
4419       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4420       if (nested_in_vect_loop)
4421         {
4422           set_vinfo_for_stmt (epilog_stmt,
4423                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4424                                                  NULL));
4425           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4426                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4427
4428           if (!double_reduc)
4429             scalar_results.quick_push (new_temp);
4430           else
4431             scalar_results[0] = new_temp;
4432         }
4433       else
4434         scalar_results[0] = new_temp;
4435
4436       new_phis[0] = epilog_stmt;
4437     }
4438
4439   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4440           phis with new adjusted scalar results, i.e., replace use <s_out0>
4441           with use <s_out4>.
4442
4443      Transform:
4444         loop_exit:
4445           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4446           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4447           v_out2 = reduce <v_out1>
4448           s_out3 = extract_field <v_out2, 0>
4449           s_out4 = adjust_result <s_out3>
4450           use <s_out0>
4451           use <s_out0>
4452
4453      into:
4454
4455         loop_exit:
4456           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4457           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4458           v_out2 = reduce <v_out1>
4459           s_out3 = extract_field <v_out2, 0>
4460           s_out4 = adjust_result <s_out3>
4461           use <s_out4>
4462           use <s_out4> */
4463
4464
4465   /* In SLP reduction chain we reduce vector results into one vector if
4466      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4467      the last stmt in the reduction chain, since we are looking for the loop
4468      exit phi node.  */
4469   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4470     {
4471       scalar_dest = gimple_assign_lhs (
4472                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4473       group_size = 1;
4474     }
4475
4476   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4477      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4478      need to match SCALAR_RESULTS with corresponding statements.  The first
4479      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4480      the first vector stmt, etc.
4481      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4482   if (group_size > new_phis.length ())
4483     {
4484       ratio = group_size / new_phis.length ();
4485       gcc_assert (!(group_size % new_phis.length ()));
4486     }
4487   else
4488     ratio = 1;
4489
4490   for (k = 0; k < group_size; k++)
4491     {
4492       if (k % ratio == 0)
4493         {
4494           epilog_stmt = new_phis[k / ratio];
4495           reduction_phi = reduction_phis[k / ratio];
4496           if (double_reduc)
4497             inner_phi = inner_phis[k / ratio];
4498         }
4499
4500       if (slp_reduc)
4501         {
4502           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4503
4504           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4505           /* SLP statements can't participate in patterns.  */
4506           gcc_assert (!orig_stmt);
4507           scalar_dest = gimple_assign_lhs (current_stmt);
4508         }
4509
4510       phis.create (3);
4511       /* Find the loop-closed-use at the loop exit of the original scalar
4512          result.  (The reduction result is expected to have two immediate uses -
4513          one at the latch block, and one at the loop exit).  */
4514       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4515         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4516             && !is_gimple_debug (USE_STMT (use_p)))
4517           phis.safe_push (USE_STMT (use_p));
4518
4519       /* While we expect to have found an exit_phi because of loop-closed-ssa
4520          form we can end up without one if the scalar cycle is dead.  */
4521
4522       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4523         {
4524           if (outer_loop)
4525             {
4526               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4527               gimple vect_phi;
4528
4529               /* FORNOW. Currently not supporting the case that an inner-loop
4530                  reduction is not used in the outer-loop (but only outside the
4531                  outer-loop), unless it is double reduction.  */
4532               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4533                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4534                           || double_reduc);
4535
4536               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4537               if (!double_reduc
4538                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4539                       != vect_double_reduction_def)
4540                 continue;
4541
4542               /* Handle double reduction:
4543
4544                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4545                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4546                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4547                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4548
4549                  At that point the regular reduction (stmt2 and stmt3) is
4550                  already vectorized, as well as the exit phi node, stmt4.
4551                  Here we vectorize the phi node of double reduction, stmt1, and
4552                  update all relevant statements.  */
4553
4554               /* Go through all the uses of s2 to find double reduction phi
4555                  node, i.e., stmt1 above.  */
4556               orig_name = PHI_RESULT (exit_phi);
4557               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4558                 {
4559                   stmt_vec_info use_stmt_vinfo;
4560                   stmt_vec_info new_phi_vinfo;
4561                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4562                   basic_block bb = gimple_bb (use_stmt);
4563                   gimple use;
4564
4565                   /* Check that USE_STMT is really double reduction phi
4566                      node.  */
4567                   if (gimple_code (use_stmt) != GIMPLE_PHI
4568                       || gimple_phi_num_args (use_stmt) != 2
4569                       || bb->loop_father != outer_loop)
4570                     continue;
4571                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4572                   if (!use_stmt_vinfo
4573                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4574                           != vect_double_reduction_def)
4575                     continue;
4576
4577                   /* Create vector phi node for double reduction:
4578                      vs1 = phi <vs0, vs2>
4579                      vs1 was created previously in this function by a call to
4580                        vect_get_vec_def_for_operand and is stored in
4581                        vec_initial_def;
4582                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4583                      vs0 is created here.  */
4584
4585                   /* Create vector phi node.  */
4586                   vect_phi = create_phi_node (vec_initial_def, bb);
4587                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4588                                     loop_vec_info_for_loop (outer_loop), NULL);
4589                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4590
4591                   /* Create vs0 - initial def of the double reduction phi.  */
4592                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4593                                              loop_preheader_edge (outer_loop));
4594                   init_def = get_initial_def_for_reduction (stmt,
4595                                                           preheader_arg, NULL);
4596                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4597                                                     vectype, NULL);
4598
4599                   /* Update phi node arguments with vs0 and vs2.  */
4600                   add_phi_arg (vect_phi, vect_phi_init,
4601                                loop_preheader_edge (outer_loop),
4602                                UNKNOWN_LOCATION);
4603                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4604                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4605                   if (dump_enabled_p ())
4606                     {
4607                       dump_printf_loc (MSG_NOTE, vect_location,
4608                                        "created double reduction phi node: ");
4609                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4610                       dump_printf (MSG_NOTE, "\n");
4611                     }
4612
4613                   vect_phi_res = PHI_RESULT (vect_phi);
4614
4615                   /* Replace the use, i.e., set the correct vs1 in the regular
4616                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4617                      loop is redundant.  */
4618                   use = reduction_phi;
4619                   for (j = 0; j < ncopies; j++)
4620                     {
4621                       edge pr_edge = loop_preheader_edge (loop);
4622                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4623                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4624                     }
4625                 }
4626             }
4627         }
4628
4629       phis.release ();
4630       if (nested_in_vect_loop)
4631         {
4632           if (double_reduc)
4633             loop = outer_loop;
4634           else
4635             continue;
4636         }
4637
4638       phis.create (3);
4639       /* Find the loop-closed-use at the loop exit of the original scalar
4640          result.  (The reduction result is expected to have two immediate uses,
4641          one at the latch block, and one at the loop exit).  For double
4642          reductions we are looking for exit phis of the outer loop.  */
4643       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4644         {
4645           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4646             {
4647               if (!is_gimple_debug (USE_STMT (use_p)))
4648                 phis.safe_push (USE_STMT (use_p));
4649             }
4650           else
4651             {
4652               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4653                 {
4654                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4655
4656                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4657                     {
4658                       if (!flow_bb_inside_loop_p (loop,
4659                                              gimple_bb (USE_STMT (phi_use_p)))
4660                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4661                         phis.safe_push (USE_STMT (phi_use_p));
4662                     }
4663                 }
4664             }
4665         }
4666
4667       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4668         {
4669           /* Replace the uses:  */
4670           orig_name = PHI_RESULT (exit_phi);
4671           scalar_result = scalar_results[k];
4672           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4673             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4674               SET_USE (use_p, scalar_result);
4675         }
4676
4677       phis.release ();
4678     }
4679 }
4680
4681
4682 /* Function vectorizable_reduction.
4683
4684    Check if STMT performs a reduction operation that can be vectorized.
4685    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4686    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4687    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        When VEC_STMT is NULL only the analysis part runs: the function
        consults vect_model_reduction_cost (a false result rejects STMT) and
        on success sets STMT_VINFO_TYPE to reduc_vec_info_type.
        SLP_NODE, when non-NULL, is the SLP tree node whose
        SLP_TREE_VEC_STMTS receives the generated vector stmts; COND_EXPR
        reductions are rejected when SLP_NODE is given.
4688
4689    This function also handles reduction idioms (patterns) that have been
4690    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4691    of this form:
4692      X = pattern_expr (arg0, arg1, ..., X)
4693    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4694    sequence that had been detected and replaced by the pattern-stmt (STMT).
4695
4696    In some cases of reduction patterns, the type of the reduction variable X is
4697    different than the type of the other arguments of STMT.
4698    In such cases, the vectype that is used when transforming STMT into a vector
4699    stmt is different than the vectype that is used to determine the
4700    vectorization factor, because it consists of a different number of elements
4701    than the actual number of elements that are being operated upon in parallel.
4702
4703    For example, consider an accumulation of shorts into an int accumulator.
4704    On some targets it's possible to vectorize this pattern operating on 8
4705    shorts at a time (hence, the vectype for purposes of determining the
4706    vectorization factor should be V8HI); on the other hand, the vectype that
4707    is used to create the vector form is actually V4SI (the type of the result).
4708
4709    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4710    indicates what is the actual level of parallelism (V8HI in the example), so
4711    that the right vectorization factor would be derived.  This vectype
4712    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4713    be used to create the vectorized stmt.  The right vectype for the vectorized
4714    stmt is obtained from the type of the result X:
4715         get_vectype_for_scalar_type (TREE_TYPE (X))
4716
4717    This means that, contrary to "regular" reductions (or "regular" stmts in
4718    general), the following equation:
4719       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4720    does *NOT* necessarily hold for reduction patterns.  */
4721
4722 bool
4723 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4724                         gimple *vec_stmt, slp_tree slp_node)
4725 {
4726   tree vec_dest;
4727   tree scalar_dest;
4728   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4729   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4730   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4731   tree vectype_in = NULL_TREE;
4732   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4733   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4734   enum tree_code code, orig_code, epilog_reduc_code;
4735   enum machine_mode vec_mode;
4736   int op_type;
4737   optab optab, reduc_optab;
4738   tree new_temp = NULL_TREE;
4739   tree def;
4740   gimple def_stmt;
4741   enum vect_def_type dt;
4742   gimple new_phi = NULL;
4743   tree scalar_type;
4744   bool is_simple_use;
4745   gimple orig_stmt;
4746   stmt_vec_info orig_stmt_info;
4747   tree expr = NULL_TREE;
4748   int i;
4749   int ncopies;
4750   int epilog_copies;
4751   stmt_vec_info prev_stmt_info, prev_phi_info;
4752   bool single_defuse_cycle = false;
4753   tree reduc_def = NULL_TREE;
4754   gimple new_stmt = NULL;
4755   int j;
4756   tree ops[3];
4757   bool nested_cycle = false, found_nested_cycle_def = false;
4758   gimple reduc_def_stmt = NULL;
4759   /* The default is that the reduction variable is the last in statement.  */
4760   int reduc_index = 2;
4761   bool double_reduc = false, dummy;
4762   basic_block def_bb;
4763   struct loop * def_stmt_loop, *outer_loop = NULL;
4764   tree def_arg;
4765   gimple def_arg_stmt;
4766   auto_vec<tree> vec_oprnds0;
4767   auto_vec<tree> vec_oprnds1;
4768   auto_vec<tree> vect_defs;
4769   auto_vec<gimple> phis;
4770   int vec_num;
4771   tree def0, def1, tem, op0, op1 = NULL_TREE;
4772
4773   /* In case of reduction chain we switch to the first stmt in the chain, but
4774      we don't update STMT_INFO, since only the last stmt is marked as reduction
4775      and has reduction properties.  */
4776   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4777     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4778
       /* If nested_in_vect_loop_p holds, switch LOOP to its inner loop and keep
          the original loop in OUTER_LOOP (used below when detecting double
          reductions).  */
4779   if (nested_in_vect_loop_p (loop, stmt))
4780     {
4781       outer_loop = loop;
4782       loop = loop->inner;
4783       nested_cycle = true;
4784     }
4785
4786   /* 1. Is vectorizable reduction?  */
4787   /* Not supportable if the reduction variable is used in the loop, unless
4788      it's a reduction chain.  */
4789   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4790       && !GROUP_FIRST_ELEMENT (stmt_info))
4791     return false;
4792
4793   /* Reductions that are not used even in an enclosing outer-loop,
4794      are expected to be "live" (used out of the loop).  */
4795   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4796       && !STMT_VINFO_LIVE_P (stmt_info))
4797     return false;
4798
4799   /* Make sure it was already recognized as a reduction computation.  */
4800   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4801       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4802     return false;
4803
4804   /* 2. Has this been recognized as a reduction pattern?
4805
4806      Check if STMT represents a pattern that has been recognized
4807      in earlier analysis stages.  For stmts that represent a pattern,
4808      the STMT_VINFO_RELATED_STMT field records the last stmt in
4809      the original sequence that constitutes the pattern.  */
4810
4811   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4812   if (orig_stmt)
4813     {
4814       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4815       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4816       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4817     }
4818
4819   /* 3. Check the operands of the operation.  The first operands are defined
4820         inside the loop body. The last operand is the reduction variable,
4821         which is defined by the loop-header-phi.  */
4822
4823   gcc_assert (is_gimple_assign (stmt));
4824
4825   /* Flatten RHS.  */
4826   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4827     {
4828     case GIMPLE_SINGLE_RHS:
4829       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4830       if (op_type == ternary_op)
4831         {
4832           tree rhs = gimple_assign_rhs1 (stmt);
4833           ops[0] = TREE_OPERAND (rhs, 0);
4834           ops[1] = TREE_OPERAND (rhs, 1);
4835           ops[2] = TREE_OPERAND (rhs, 2);
4836           code = TREE_CODE (rhs);
4837         }
4838       else
4839         return false;
4840       break;
4841
4842     case GIMPLE_BINARY_RHS:
4843       code = gimple_assign_rhs_code (stmt);
4844       op_type = TREE_CODE_LENGTH (code);
4845       gcc_assert (op_type == binary_op);
4846       ops[0] = gimple_assign_rhs1 (stmt);
4847       ops[1] = gimple_assign_rhs2 (stmt);
4848       break;
4849
4850     case GIMPLE_TERNARY_RHS:
4851       code = gimple_assign_rhs_code (stmt);
4852       op_type = TREE_CODE_LENGTH (code);
4853       gcc_assert (op_type == ternary_op);
4854       ops[0] = gimple_assign_rhs1 (stmt);
4855       ops[1] = gimple_assign_rhs2 (stmt);
4856       ops[2] = gimple_assign_rhs3 (stmt);
4857       break;
4858
4859     case GIMPLE_UNARY_RHS:
4860       return false;
4861
4862     default:
4863       gcc_unreachable ();
4864     }
4865
4866   if (code == COND_EXPR && slp_node)
4867     return false;
4868
4869   scalar_dest = gimple_assign_lhs (stmt);
4870   scalar_type = TREE_TYPE (scalar_dest);
4871   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4872       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4873     return false;
4874
4875   /* Do not try to vectorize bit-precision reductions.  */
4876   if ((TYPE_PRECISION (scalar_type)
4877        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4878     return false;
4879
4880   /* All uses but the last are expected to be defined in the loop.
4881      The last use is the reduction variable.  In case of nested cycle this
4882      assumption is not true: we use reduc_index to record the index of the
4883      reduction variable.  */
4884   for (i = 0; i < op_type - 1; i++)
4885     {
4886       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4887       if (i == 0 && code == COND_EXPR)
4888         continue;
4889
4890       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4891                                             &def_stmt, &def, &dt, &tem);
           /* Keep the first non-NULL vectype reported for an operand.  */
4892       if (!vectype_in)
4893         vectype_in = tem;
4894       gcc_assert (is_simple_use);
4895
4896       if (dt != vect_internal_def
4897           && dt != vect_external_def
4898           && dt != vect_constant_def
4899           && dt != vect_induction_def
4900           && !(dt == vect_nested_cycle && nested_cycle))
4901         return false;
4902
4903       if (dt == vect_nested_cycle)
4904         {
4905           found_nested_cycle_def = true;
4906           reduc_def_stmt = def_stmt;
4907           reduc_index = i;
4908         }
4909     }
4910
       /* The loop above always exits with I == OP_TYPE - 1, so OPS[I] is the
          last operand of STMT.  */
4911   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4912                                         &def_stmt, &def, &dt, &tem);
4913   if (!vectype_in)
4914     vectype_in = tem;
4915   gcc_assert (is_simple_use);
4916   if (!(dt == vect_reduction_def
4917         || dt == vect_nested_cycle
4918         || ((dt == vect_internal_def || dt == vect_external_def
4919              || dt == vect_constant_def || dt == vect_induction_def)
4920             && nested_cycle && found_nested_cycle_def)))
4921     {
4922       /* For pattern recognized stmts, orig_stmt might be a reduction,
4923          but some helper statements for the pattern might not, or
4924          might be COND_EXPRs with reduction uses in the condition.  */
4925       gcc_assert (orig_stmt);
4926       return false;
4927     }
4928   if (!found_nested_cycle_def)
4929     reduc_def_stmt = def_stmt;
4930
4931   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4932   if (orig_stmt)
4933     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4934                                                        reduc_def_stmt,
4935                                                        !nested_cycle,
4936                                                        &dummy));
4937   else
4938     {
4939       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4940                                              !nested_cycle, &dummy);
4941       /* We changed STMT to be the first stmt in reduction chain, hence we
4942          check that in this case the first element in the chain is STMT.  */
4943       gcc_assert (stmt == tmp
4944                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4945     }
4946
4947   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4948     return false;
4949
       /* NCOPIES is the number of vector stmts generated for STMT below: one
          under SLP, otherwise VF divided by the element count of VECTYPE_IN.  */
4950   if (slp_node || PURE_SLP_STMT (stmt_info))
4951     ncopies = 1;
4952   else
4953     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4954                / TYPE_VECTOR_SUBPARTS (vectype_in));
4955
4956   gcc_assert (ncopies >= 1);
4957
4958   vec_mode = TYPE_MODE (vectype_in);
4959
4960   if (code == COND_EXPR)
4961     {
4962       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4963         {
4964           if (dump_enabled_p ())
4965             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4966                              "unsupported condition in reduction\n");
4967
4968             return false;
4969         }
4970     }
4971   else
4972     {
4973       /* 4. Supportable by target?  */
4974
4975       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4976           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4977         {
4978           /* Shifts and rotates are only supported by vectorizable_shifts,
4979              not vectorizable_reduction.  */
4980           if (dump_enabled_p ())
4981             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4982                              "unsupported shift or rotation.\n");
4983           return false;
4984         }
4985
4986       /* 4.1. check support for the operation in the loop  */
4987       optab = optab_for_tree_code (code, vectype_in, optab_default);
4988       if (!optab)
4989         {
4990           if (dump_enabled_p ())
4991             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4992                              "no optab.\n");
4993
4994           return false;
4995         }
4996
4997       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4998         {
4999           if (dump_enabled_p ())
5000             dump_printf (MSG_NOTE, "op not supported by target.\n");
5001
               /* Without a target handler, continue only if the vector mode is
                  exactly one word wide and VF is at least
                  vect_min_worthwhile_factor (CODE).  */
5002           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5003               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5004                   < vect_min_worthwhile_factor (code))
5005             return false;
5006
5007           if (dump_enabled_p ())
5008             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5009         }
5010
5011       /* Worthwhile without SIMD support?  */
5012       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5013           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5014              < vect_min_worthwhile_factor (code))
5015         {
5016           if (dump_enabled_p ())
5017             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5018                              "not worthwhile without SIMD support.\n");
5019
5020           return false;
5021         }
5022     }
5023
5024   /* 4.2. Check support for the epilog operation.
5025
5026           If STMT represents a reduction pattern, then the type of the
5027           reduction variable may be different than the type of the rest
5028           of the arguments.  For example, consider the case of accumulation
5029           of shorts into an int accumulator; The original code:
5030                         S1: int_a = (int) short_a;
5031           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5032
5033           was replaced with:
5034                         STMT: int_acc = widen_sum <short_a, int_acc>
5035
5036           This means that:
5037           1. The tree-code that is used to create the vector operation in the
5038              epilog code (that reduces the partial results) is not the
5039              tree-code of STMT, but is rather the tree-code of the original
5040              stmt from the pattern that STMT is replacing.  I.e, in the example
5041              above we want to use 'widen_sum' in the loop, but 'plus' in the
5042              epilog.
5043           2. The type (mode) we use to check available target support
5044              for the vector operation to be created in the *epilog*, is
5045              determined by the type of the reduction variable (in the example
5046              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5047              However the type (mode) we use to check available target support
5048              for the vector operation to be created *inside the loop*, is
5049              determined by the type of the other arguments to STMT (in the
5050              example we'd check this: optab_handler (widen_sum_optab,
5051              vect_short_mode)).
5052
5053           This is contrary to "regular" reductions, in which the types of all
5054           the arguments are the same as the type of the reduction variable.
5055           For "regular" reductions we can therefore use the same vector type
5056           (and also the same tree-code) when generating the epilog code and
5057           when generating the code inside the loop.  */
5058
5059   if (orig_stmt)
5060     {
5061       /* This is a reduction pattern: get the vectype from the type of the
5062          reduction variable, and get the tree-code from orig_stmt.  */
5063       orig_code = gimple_assign_rhs_code (orig_stmt);
5064       gcc_assert (vectype_out);
5065       vec_mode = TYPE_MODE (vectype_out);
5066     }
5067   else
5068     {
5069       /* Regular reduction: the same vectype and tree-code that are used for
5070          the vector code inside the loop can be used for the epilog code. */
5071       orig_code = code;
5072     }
5073
5074   if (nested_cycle)
5075     {
5076       def_bb = gimple_bb (reduc_def_stmt);
5077       def_stmt_loop = def_bb->loop_father;
5078       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5079                                        loop_preheader_edge (def_stmt_loop));
5080       if (TREE_CODE (def_arg) == SSA_NAME
5081           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5082           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5083           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5084           && vinfo_for_stmt (def_arg_stmt)
5085           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5086               == vect_double_reduction_def)
5087         double_reduc = true;
5088     }
5089
5090   epilog_reduc_code = ERROR_MARK;
5091   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5092     {
5093       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5094                                          optab_default);
5095       if (!reduc_optab)
5096         {
5097           if (dump_enabled_p ())
5098             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5099                              "no optab for reduction.\n");
5100
5101           epilog_reduc_code = ERROR_MARK;
5102         }
5103
5104       if (reduc_optab
5105           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5106         {
5107           if (dump_enabled_p ())
5108             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5109                              "reduc op not supported by target.\n");
5110
5111           epilog_reduc_code = ERROR_MARK;
5112         }
5113     }
5114   else
5115     {
5116       if (!nested_cycle || double_reduc)
5117         {
5118           if (dump_enabled_p ())
5119             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5120                              "no reduc code for scalar code.\n");
5121
5122           return false;
5123         }
5124     }
5125
5126   if (double_reduc && ncopies > 1)
5127     {
5128       if (dump_enabled_p ())
5129         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5130                          "multiple types in double reduction\n");
5131
5132       return false;
5133     }
5134
5135   /* In case of widening multiplication by a constant, we update the type
5136      of the constant to be the type of the other operand.  We check that the
5137      constant fits the type in the pattern recognition pass.  */
5138   if (code == DOT_PROD_EXPR
5139       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5140     {
5141       if (TREE_CODE (ops[0]) == INTEGER_CST)
5142         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5143       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5144         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5145       else
5146         {
5147           if (dump_enabled_p ())
5148             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5149                              "invalid types in dot-prod\n");
5150
5151           return false;
5152         }
5153     }
5154
5155   if (!vec_stmt) /* transformation not required.  */
5156     {
5157       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5158         return false;
5159       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5160       return true;
5161     }
5162
5163   /** Transform.  **/
5164
5165   if (dump_enabled_p ())
5166     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5167
5168   /* FORNOW: Multiple types are not supported for condition.  */
5169   if (code == COND_EXPR)
5170     gcc_assert (ncopies == 1);
5171
5172   /* Create the destination vector  */
5173   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5174
5175   /* In case the vectorization factor (VF) is bigger than the number
5176      of elements that we can fit in a vectype (nunits), we have to generate
5177      more than one vector stmt - i.e - we need to "unroll" the
5178      vector stmt by a factor VF/nunits.  For more details see documentation
5179      in vectorizable_operation.  */
5180
5181   /* If the reduction is used in an outer loop we need to generate
5182      VF intermediate results, like so (e.g. for ncopies=2):
5183         r0 = phi (init, r0)
5184         r1 = phi (init, r1)
5185         r0 = x0 + r0;
5186         r1 = x1 + r1;
5187     (i.e. we generate VF results in 2 registers).
5188     In this case we have a separate def-use cycle for each copy, and therefore
5189     for each copy we get the vector def for the reduction variable from the
5190     respective phi node created for this copy.
5191
5192     Otherwise (the reduction is unused in the loop nest), we can combine
5193     together intermediate results, like so (e.g. for ncopies=2):
5194         r = phi (init, r)
5195         r = x0 + r;
5196         r = x1 + r;
5197    (i.e. we generate VF/2 results in a single register).
5198    In this case for each copy we get the vector def for the reduction variable
5199    from the vectorized reduction operation generated in the previous iteration.
5200   */
5201
5202   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5203     {
5204       single_defuse_cycle = true;
5205       epilog_copies = 1;
5206     }
5207   else
5208     epilog_copies = ncopies;
5209
5210   prev_stmt_info = NULL;
5211   prev_phi_info = NULL;
5212   if (slp_node)
5213     {
5214       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5215       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5216                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5217     }
5218   else
5219     {
5220       vec_num = 1;
5221       vec_oprnds0.create (1);
5222       if (op_type == ternary_op)
5223         vec_oprnds1.create (1);
5224     }
5225
5226   phis.create (vec_num);
5227   vect_defs.create (vec_num);
5228   if (!slp_node)
5229     vect_defs.quick_push (NULL_TREE);
5230
5231   for (j = 0; j < ncopies; j++)
5232     {
5233       if (j == 0 || !single_defuse_cycle)
5234         {
5235           for (i = 0; i < vec_num; i++)
5236             {
5237               /* Create the reduction-phi that defines the reduction
5238                  operand.  */
5239               new_phi = create_phi_node (vec_dest, loop->header);
5240               set_vinfo_for_stmt (new_phi,
5241                                   new_stmt_vec_info (new_phi, loop_vinfo,
5242                                                      NULL));
                    /* PHIS holds the phis of the first copy only, or every phi
                       when doing SLP.  */
5243                if (j == 0 || slp_node)
5244                  phis.quick_push (new_phi);
5245             }
5246         }
5247
5248       if (code == COND_EXPR)
5249         {
5250           gcc_assert (!slp_node);
5251           vectorizable_condition (stmt, gsi, vec_stmt,
5252                                   PHI_RESULT (phis[0]),
5253                                   reduc_index, NULL);
5254           /* Multiple types are not supported for condition.  */
5255           break;
5256         }
5257
5258       /* Handle uses.  */
5259       if (j == 0)
5260         {
               /* !REDUC_INDEX is 1 when the reduction operand is OPS[0] and 0
                  otherwise, so OP0 is never the reduction operand.
                  NOTE(review): for a ternary op with REDUC_INDEX == 1, OP1 is
                  set to OPS[1], i.e. the reduction operand itself -- confirm
                  that combination cannot reach this point.  */
5261           op0 = ops[!reduc_index];
5262           if (op_type == ternary_op)
5263             {
5264               if (reduc_index == 0)
5265                 op1 = ops[2];
5266               else
5267                 op1 = ops[1];
5268             }
5269
5270           if (slp_node)
5271             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5272                                slp_node, -1);
5273           else
5274             {
5275               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5276                                                             stmt, NULL);
5277               vec_oprnds0.quick_push (loop_vec_def0);
5278               if (op_type == ternary_op)
5279                {
5280                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5281                                                                NULL);
5282                  vec_oprnds1.quick_push (loop_vec_def1);
5283                }
5284             }
5285         }
5286       else
5287         {
5288           if (!slp_node)
5289             {
                   /* These locals shadow the function-level DT and DUMMY.  */
5290               enum vect_def_type dt;
5291               gimple dummy_stmt;
5292               tree dummy;
5293
5294               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5295                                   &dummy_stmt, &dummy, &dt);
5296               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5297                                                               loop_vec_def0);
5298               vec_oprnds0[0] = loop_vec_def0;
5299               if (op_type == ternary_op)
5300                 {
5301                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5302                                       &dummy, &dt);
5303                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5304                                                                 loop_vec_def1);
5305                   vec_oprnds1[0] = loop_vec_def1;
5306                 }
5307             }
5308
5309           if (single_defuse_cycle)
5310             reduc_def = gimple_assign_lhs (new_stmt);
5311
5312           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5313         }
5314
5315       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5316         {
5317           if (slp_node)
5318             reduc_def = PHI_RESULT (phis[i]);
5319           else
5320             {
5321               if (!single_defuse_cycle || j == 0)
5322                 reduc_def = PHI_RESULT (new_phi);
5323             }
5324
5325           def1 = ((op_type == ternary_op)
5326                   ? vec_oprnds1[i] : NULL);
5327           if (op_type == binary_op)
5328             {
5329               if (reduc_index == 0)
5330                 expr = build2 (code, vectype_out, reduc_def, def0);
5331               else
5332                 expr = build2 (code, vectype_out, def0, reduc_def);
5333             }
5334           else
5335             {
5336               if (reduc_index == 0)
5337                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5338               else
5339                 {
5340                   if (reduc_index == 1)
5341                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5342                   else
5343                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5344                 }
5345             }
5346
5347           new_stmt = gimple_build_assign (vec_dest, expr);
5348           new_temp = make_ssa_name (vec_dest, new_stmt);
5349           gimple_assign_set_lhs (new_stmt, new_temp);
5350           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5351
5352           if (slp_node)
5353             {
5354               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5355               vect_defs.quick_push (new_temp);
5356             }
5357           else
5358             vect_defs[0] = new_temp;
5359         }
5360
5361       if (slp_node)
5362         continue;
5363
5364       if (j == 0)
5365         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5366       else
5367         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5368
5369       prev_stmt_info = vinfo_for_stmt (new_stmt);
5370       prev_phi_info = vinfo_for_stmt (new_phi);
5371     }
5372
5373   /* Finalize the reduction-phi (set its arguments) and create the
5374      epilog reduction code.  */
5375   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5376     {
           /* Hand the epilog the def of *VEC_STMT, which the loop above set to
              the first copy (for COND_EXPR it is presumably filled in by
              vectorizable_condition -- TODO confirm).  */
5377       new_temp = gimple_assign_lhs (*vec_stmt);
5378       vect_defs[0] = new_temp;
5379     }
5380
5381   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5382                                     epilog_reduc_code, phis, reduc_index,
5383                                     double_reduc, slp_node);
5384
5385   return true;
5386 }
5387
5388 /* Function vect_min_worthwhile_factor.
5389
5390    Return the smallest vectorization factor at which performing the
5391    operation CODE on generic vectors is considered worthwhile.  Codes
5392    with no known threshold yield INT_MAX.  */
5393 int
5394 vect_min_worthwhile_factor (enum tree_code code)
5395 {
5396   /* Additive operations.  */
5397   if (code == PLUS_EXPR
5398       || code == MINUS_EXPR
5399       || code == NEGATE_EXPR)
5400     return 4;
5401
5402   /* Bitwise operations.  */
5403   if (code == BIT_AND_EXPR
5404       || code == BIT_IOR_EXPR
5405       || code == BIT_XOR_EXPR
5406       || code == BIT_NOT_EXPR)
5407     return 2;
5408
5409   /* Every other code: no finite factor is reported, so callers comparing
5410      the vectorization factor against this value always see it as too small.  */
5411   return INT_MAX;
5412 }
5413
5414
5415 /* Function vectorizable_induction
5416
5417    Check if PHI performs an induction computation that can be vectorized.
5418    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5419    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5420    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        GSI is not used.  When VEC_STMT is NULL only the analysis part runs: the
        statement is tagged as induc_vec_info_type and its cost is modelled.  */
5421
5422 bool
5423 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5424                         gimple *vec_stmt)
5425 {
5426   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5427   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5428   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5429   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5430   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* Number of vector copies needed to cover one vectorized iteration:
          the vectorization factor divided by the elements per vector.  */
5431   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5432   tree vec_def;
5433
5434   gcc_assert (ncopies >= 1);
5435   /* FORNOW. These restrictions should be relaxed.  */
5436   if (nested_in_vect_loop_p (loop, phi))
5437     {
5438       imm_use_iterator imm_iter;
5439       use_operand_p use_p;
5440       gimple exit_phi;
5441       edge latch_e;
5442       tree loop_arg;
5443
           /* An inner-loop induction is only handled with a single copy.  */
5444       if (ncopies > 1)
5445         {
5446           if (dump_enabled_p ())
5447             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5448                              "multiple types in nested loop.\n");
5449           return false;
5450         }
5451
           /* Look at the value PHI receives over the inner loop's latch edge
              and find its first non-debug use located outside the inner
              loop; that use is recorded as EXIT_PHI.  */
5452       exit_phi = NULL;
5453       latch_e = loop_latch_edge (loop->inner);
5454       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5455       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5456         {
5457           gimple use_stmt = USE_STMT (use_p);
5458           if (is_gimple_debug (use_stmt))
5459             continue;
5460
5461           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5462             {
5463               exit_phi = use_stmt;
5464               break;
5465             }
5466         }
           /* Such a use is accepted only when it is relevant and not live;
              anything else makes the induction unsupported.  */
5467       if (exit_phi)
5468         {
5469           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5470           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5471                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5472             {
5473               if (dump_enabled_p ())
5474                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5475                                  "inner-loop induction only used outside "
5476                                  "of the outer vectorized loop.\n");
5477               return false;
5478             }
5479         }
5480     }
5481
5482   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5483     return false;
5484
5485   /* FORNOW: SLP not supported.  */
5486   if (STMT_SLP_TYPE (stmt_info))
5487     return false;
5488
5489   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5490
5491   if (gimple_code (phi) != GIMPLE_PHI)
5492     return false;
5493
5494   if (!vec_stmt) /* transformation not required.  */
5495     {
           /* Analysis only: record the statement kind and account for its
              cost.  */
5496       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5497       if (dump_enabled_p ())
5498         dump_printf_loc (MSG_NOTE, vect_location,
5499                          "=== vectorizable_induction ===\n");
5500       vect_model_induction_cost (stmt_info, ncopies);
5501       return true;
5502     }
5503
5504   /** Transform.  **/
5505
5506   if (dump_enabled_p ())
5507     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5508
       /* The vectorized statement handed back is the one defining the SSA
          name returned by get_initial_def_for_induction.  */
5509   vec_def = get_initial_def_for_induction (phi);
5510   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5511   return true;
5512 }
5513
5514 /* Function vectorizable_live_operation.
5515
5516    STMT computes a value that is used outside the loop.  Check if
5517    it can be supported.

        Two forms are accepted.  The first is a call to the internal function
        IFN_GOMP_SIMD_LANE that has a result, whose first argument is an SSA
        name based on LOOP->simduid, and whose result is used by a PHI in the
        destination block of the loop's single exit; when VEC_STMT is
        non-NULL that PHI argument is replaced by the constant VF - 1.  The
        second is an assignment to an SSA name, not nested in an inner loop,
        all of whose operands are external or constant definitions; it needs
        no transformation.  Reductions and everything else return FALSE.
        GSI is not used.  */
5518
5519 bool
5520 vectorizable_live_operation (gimple stmt,
5521                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5522                              gimple *vec_stmt)
5523 {
5524   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5525   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5526   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5527   int i;
5528   int op_type;
5529   tree op;
5530   tree def;
5531   gimple def_stmt;
5532   enum vect_def_type dt;
5533   enum tree_code code;
5534   enum gimple_rhs_class rhs_class;
5535
5536   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5537
5538   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5539     return false;
5540
5541   if (!is_gimple_assign (stmt))
5542     {
5543       if (gimple_call_internal_p (stmt)
5544           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5545           && gimple_call_lhs (stmt)
5546           && loop->simduid
5547           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5548           && loop->simduid
5549              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5550         {
5551           edge e = single_exit (loop);
5552           basic_block merge_bb = e->dest;
5553           imm_use_iterator imm_iter;
5554           use_operand_p use_p;
5555           tree lhs = gimple_call_lhs (stmt);
5556
               /* Find a PHI in the exit destination block that uses the
                  call's result; the first one found decides the answer.  */
5557           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5558             {
5559               gimple use_stmt = USE_STMT (use_p);
5560               if (gimple_code (use_stmt) == GIMPLE_PHI
5561                   && gimple_bb (use_stmt) == merge_bb)
5562                 {
5563                   if (vec_stmt)
5564                     {
                           /* Transform: the value seen after the loop
                              becomes the constant VF - 1.  */
5565                       tree vfm1
5566                         = build_int_cst (unsigned_type_node,
5567                                          loop_vinfo->vectorization_factor - 1);
5568                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5569                     }
5570                   return true;
5571                 }
5572             }
5573         }
5574
5575       return false;
5576     }
5577
5578   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5579     return false;
5580
5581   /* FORNOW. CHECKME. */
5582   if (nested_in_vect_loop_p (loop, stmt))
5583     return false;
5584
5585   code = gimple_assign_rhs_code (stmt);
5586   op_type = TREE_CODE_LENGTH (code);
5587   rhs_class = get_gimple_rhs_class (code);
5588   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5589   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5590
5591   /* FORNOW: support only if all uses are invariant.  This means
5592      that the scalar operations can remain in place, unvectorized.
5593      The original last scalar value that they compute will be used.  */
5594
5595   for (i = 0; i < op_type; i++)
5596     {
           /* For a single-operand RHS the operands are those of the RHS
              tree itself; otherwise they are the statement's operands
              after the LHS.  */
5597       if (rhs_class == GIMPLE_SINGLE_RHS)
5598         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5599       else
5600         op = gimple_op (stmt, i + 1);
           /* An absent operand has no definition to classify.  Skipping it
              explicitly also guarantees that DT is only examined after
              vect_is_simple_use has been called for this operand, instead
              of being read while stale or uninitialized.  */
5601       if (!op)
5602         continue;
5603       if (!vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5604                                &dt))
5605         {
5606           if (dump_enabled_p ())
5607             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5608                              "use not simple.\n");
5609           return false;
5610         }
           /* Only external and constant definitions are accepted.  */
5611       if (dt != vect_external_def && dt != vect_constant_def)
5612         return false;
5613     }
5614
5615   /* No transformation is required for the cases we currently support.  */
5616   return true;
5617 }
5618
5619 /* Kill debug uses that would otherwise refer to names from LOOP: for
5620    every SSA name that STMT (a statement or PHI) defines, reset the
5621    value of each debug bind that uses it from a block outside LOOP.  */
5622
5623 static void
5624 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5625 {
5626   def_operand_p def;
5627   ssa_op_iter def_iter;
5628   gimple use_stmt;
5629   imm_use_iterator use_iter;
5630
5631   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, def_iter, SSA_OP_DEF)
5632     {
5633       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def))
5634         {
5635           /* Non-debug users are left alone, and so are debug
5636              statements that live inside LOOP.  */
5637           if (!is_gimple_debug (use_stmt)
5638               || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
5639             continue;
5640
5641           /* A debug statement reaching this point is expected to be
5642              a debug bind; any other kind hits gcc_unreachable.  */
5643           if (!gimple_debug_bind_p (use_stmt))
5644             gcc_unreachable ();
5645
5646           if (dump_enabled_p ())
5647             dump_printf_loc (MSG_NOTE, vect_location,
5648                              "killing debug use\n");
5649
5650           /* Reset the bound value, then run update_stmt on the
5651              modified debug statement.  */
5652           gimple_debug_bind_reset_value (use_stmt);
5653           update_stmt (use_stmt);
5654         }
5655     }
5656 }
5657
5658
5659 /* Return a tree for the loop's number of iterations.  When the count
5660    is not an INTEGER_CST, any statements needed to compute it are
5661    inserted on the loop preheader edge.  */
5662 static tree
5663 vect_build_loop_niters (loop_vec_info loop_vinfo)
5664 {
5665   tree niters_expr = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5666
5667   /* A constant count is returned as is; nothing is emitted.  */
5668   if (TREE_CODE (niters_expr) == INTEGER_CST)
5669     return niters_expr;
5670
5671   gimple_seq seq = NULL;
5672   edge preheader = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5673   tree tmp = create_tmp_var (TREE_TYPE (niters_expr), "niters");
5674   tree niters_val = force_gimple_operand (niters_expr, &seq, false, tmp);
5675
5676   /* Gimplification may not have produced any statement.  */
5677   if (seq)
5678     gsi_insert_seq_on_edge_immediate (preheader, seq);
5679
5680   return niters_val;
5681 }
5682
5683
5684 /* This function generates the following statements:
5685
5686    ni_name = number of iterations loop executes
5687    ratio = ni_name / vf
5688    ratio_mult_vf_name = ratio * vf
5689
5690    and places them on the loop preheader edge.

        NI_NAME is the iteration count to start from.  The ratio is stored
        through RATIO_NAME_PTR.  RATIO_MULT_VF_NAME_PTR may be NULL, in which
        case ratio * vf is not built at all.  When the loop is peeled for
        gaps, one iteration is subtracted from NI_NAME before the ratio is
        computed.  */
5691
5692 static void
5693 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5694                                  tree ni_name,
5695                                  tree *ratio_mult_vf_name_ptr,
5696                                  tree *ratio_name_ptr)
5697 {
5698   tree ni_minus_gap_name;
5699   tree var;
5700   tree ratio_name;
5701   tree ratio_mult_vf_name;
5702   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5703   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5704   tree log_vf;
5705
       /* Division and multiplication by VF are done with shifts by
          log2 (vf).  */
5706   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5707
5708   /* If epilogue loop is required because of data accesses with gaps, we
5709      subtract one iteration from the total number of iterations here for
5710      correct calculation of RATIO.  */
5711   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5712     {
5713       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5714                                        ni_name,
5715                                        build_one_cst (TREE_TYPE (ni_name)));
5716       if (!is_gimple_val (ni_minus_gap_name))
5717         {
5718           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
               /* A statement sequence, declared with the sequence type as
                  in vect_build_loop_niters.  */
5719           gimple_seq stmts = NULL;
5720           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5721                                                     true, var);
5722           gsi_insert_seq_on_edge_immediate (pe, stmts);
5723         }
5724     }
5725   else
5726     ni_minus_gap_name = ni_name;
5727
5728   /* Create: ratio = ni >> log2(vf) */
5729   /* ???  As we have ni == number of latch executions + 1, ni could
5730      have overflown to zero.  So avoid computing ratio based on ni
5731      but compute it using the fact that we know ratio will be at least
5732      one, thus via (ni - vf) >> log2(vf) + 1.  */
5733   ratio_name
5734     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5735                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5736                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5737                                              ni_minus_gap_name,
5738                                              build_int_cst
5739                                                (TREE_TYPE (ni_name), vf)),
5740                                 log_vf),
5741                    build_int_cst (TREE_TYPE (ni_name), 1));
       /* Emit statements only if folding did not already yield a gimple
          value.  */
5742   if (!is_gimple_val (ratio_name))
5743     {
5744       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5745       gimple_seq stmts = NULL;
5746       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5747       gsi_insert_seq_on_edge_immediate (pe, stmts);
5748     }
5749   *ratio_name_ptr = ratio_name;
5750
5751   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5752
5753   if (ratio_mult_vf_name_ptr)
5754     {
5755       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5756                                         ratio_name, log_vf);
5757       if (!is_gimple_val (ratio_mult_vf_name))
5758         {
5759           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5760           gimple_seq stmts = NULL;
5761           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5762                                                      true, var);
5763           gsi_insert_seq_on_edge_immediate (pe, stmts);
5764         }
5765       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5766     }
5767
5768   return;
5769 }
5770
5771
5772 /* Function vect_transform_loop.
5773
5774    The analysis phase has determined that the loop is vectorizable.
5775    Vectorize the loop - create vectorized stmts to replace the scalar
5776    stmts in the loop, and update the loop exit condition.

        In order, this function: versions the loop when alignment or alias
        versioning is required; peels for alignment; peels for the iteration
        count and/or gaps while computing RATIO; transforms the PHIs and
        statements of every block of the loop; and finally makes the loop
        iterate RATIO times and divides its profile and iteration bounds by
        the vectorization factor.  */
5777
5778 void
5779 vect_transform_loop (loop_vec_info loop_vinfo)
5780 {
5781   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5782   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5783   int nbbs = loop->num_nodes;
5784   gimple_stmt_iterator si;
5785   int i;
5786   tree ratio = NULL;
5787   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5788   bool grouped_store;
5789   bool slp_scheduled = false;
5790   gimple stmt, pattern_stmt;
5791   gimple_seq pattern_def_seq = NULL;
5792   gimple_stmt_iterator pattern_def_si = gsi_none ();
5793   bool transform_pattern_stmt = false;
5794   bool check_profitability = false;
5795   int th;
5796   /* Record number of iterations before we started tampering with the profile. */
5797   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5798
5799   if (dump_enabled_p ())
5800     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5801
5802   /* If profile is imprecise, we have chance to fix it up.  */
5803   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5804     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5805
5806   /* Use the more conservative vectorization threshold.  If the number
5807      of iterations is constant assume the cost check has been performed
5808      by our caller.  If the threshold makes all loops profitable that
5809      run at least the vectorization factor number of times checking
5810      is pointless, too.  */
5811   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5812   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5813       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5814     {
5815       if (dump_enabled_p ())
5816         dump_printf_loc (MSG_NOTE, vect_location,
5817                          "Profitability threshold is %d loop iterations.\n",
5818                          th);
5819       check_profitability = true;
5820     }
5821
5822   /* Version the loop first, if required, so the profitability check
5823      comes first.  */
5824
       /* CHECK_PROFITABILITY is handed to the first of versioning, alignment
          peeling and loop-bound peeling that runs; the first two clear it
          afterwards so it is not passed on again.  */
5825   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5826       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5827     {
5828       vect_loop_versioning (loop_vinfo, th, check_profitability);
5829       check_profitability = false;
5830     }
5831
       /* Build the iteration count and remember it as the unchanged count
          before any peeling adjusts it.  */
5832   tree ni_name = vect_build_loop_niters (loop_vinfo);
5833   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5834
5835   /* Peel the loop if there are data refs with unknown alignment.
5836      Only one data ref with unknown store is allowed.  */
5837
5838   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5839     {
5840       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5841                                      th, check_profitability);
5842       check_profitability = false;
5843       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5844          be re-computed.  */
5845       ni_name = NULL_TREE;
5846     }
5847
5848   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5849      compile time constant), or it is a constant that doesn't divide by the
5850      vectorization factor, then an epilog loop needs to be created.
5851      We therefore duplicate the loop: the original loop will be vectorized,
5852      and will compute the first (n/VF) iterations.  The second copy of the loop
5853      will remain scalar and will compute the remaining (n%VF) iterations.
5854      (VF is the vectorization factor).  */
5855
5856   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5857       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5858     {
5859       tree ratio_mult_vf;
5860       if (!ni_name)
5861         ni_name = vect_build_loop_niters (loop_vinfo);
5862       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5863                                        &ratio);
5864       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5865                                       th, check_profitability);
5866     }
       /* No epilog peeling and a known constant count: RATIO is simply that
          count divided by the vectorization factor.  */
5867   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5868     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5869                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5870   else
5871     {
           /* No epilog peeling and a symbolic count: only RATIO is needed,
              so no ratio * vf is requested.  */
5872       if (!ni_name)
5873         ni_name = vect_build_loop_niters (loop_vinfo);
5874       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5875     }
5876
5877   /* 1) Make sure the loop header has exactly two entries
5878      2) Make sure we have a preheader basic block.  */
5879
5880   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5881
5882   split_edge (loop_preheader_edge (loop));
5883
5884   /* FORNOW: the vectorizer supports only loops whose body consists
5885      of one basic block (header + empty latch). When the vectorizer will
5886      support more involved loop forms, the order by which the BBs are
5887      traversed needs to be reconsidered.  */
5888
5889   for (i = 0; i < nbbs; i++)
5890     {
5891       basic_block bb = bbs[i];
5892       stmt_vec_info stmt_info;
5893       gimple phi;
5894
           /* First pass over the block: its PHIs.  Only PHIs whose
              definition type is vect_induction_def are transformed here.  */
5895       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5896         {
5897           phi = gsi_stmt (si);
5898           if (dump_enabled_p ())
5899             {
5900               dump_printf_loc (MSG_NOTE, vect_location,
5901                                "------>vectorizing phi: ");
5902               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5903               dump_printf (MSG_NOTE, "\n");
5904             }
5905           stmt_info = vinfo_for_stmt (phi);
5906           if (!stmt_info)
5907             continue;
5908
5909           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5910             vect_loop_kill_debug_uses (loop, phi);
5911
5912           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5913               && !STMT_VINFO_LIVE_P (stmt_info))
5914             continue;
5915
               /* Purely informational: note when the PHI's vector type does
                  not hold exactly VF elements.  */
5916           if (STMT_VINFO_VECTYPE (stmt_info)
5917               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5918                   != (unsigned HOST_WIDE_INT) vectorization_factor)
5919               && dump_enabled_p ())
5920             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5921
5922           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5923             {
5924               if (dump_enabled_p ())
5925                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5926               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5927             }
5928         }
5929
           /* Second pass over the block: its statements.  SI is advanced
              explicitly inside the body; it stays put while a pattern
              statement or its definition sequence is still pending, which
              is why the loop also continues while TRANSFORM_PATTERN_STMT is
              set.  */
5930       pattern_stmt = NULL;
5931       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5932         {
5933           bool is_store;
5934
5935           if (transform_pattern_stmt)
5936             stmt = pattern_stmt;
5937           else
5938             {
5939               stmt = gsi_stmt (si);
5940               /* During vectorization remove existing clobber stmts.  */
5941               if (gimple_clobber_p (stmt))
5942                 {
5943                   unlink_stmt_vdef (stmt);
5944                   gsi_remove (&si, true);
5945                   release_defs (stmt);
5946                   continue;
5947                 }
5948             }
5949
5950           if (dump_enabled_p ())
5951             {
5952               dump_printf_loc (MSG_NOTE, vect_location,
5953                                "------>vectorizing statement: ");
5954               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5955               dump_printf (MSG_NOTE, "\n");
5956             }
5957
5958           stmt_info = vinfo_for_stmt (stmt);
5959
5960           /* vector stmts created in the outer-loop during vectorization of
5961              stmts in an inner-loop may not have a stmt_info, and do not
5962              need to be vectorized.  */
5963           if (!stmt_info)
5964             {
5965               gsi_next (&si);
5966               continue;
5967             }
5968
5969           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5970             vect_loop_kill_debug_uses (loop, stmt);
5971
               /* A statement that is neither relevant nor live is skipped,
                  unless it is part of a pattern whose related statement is
                  relevant or live: then that related statement is
                  vectorized in its place.  */
5972           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5973               && !STMT_VINFO_LIVE_P (stmt_info))
5974             {
5975               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5976                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5977                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5978                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5979                 {
5980                   stmt = pattern_stmt;
5981                   stmt_info = vinfo_for_stmt (stmt);
5982                 }
5983               else
5984                 {
5985                   gsi_next (&si);
5986                   continue;
5987                 }
5988             }
               /* The statement itself is relevant or live and also has a
                  relevant or live pattern statement: handle this statement
                  now and the pattern statement on the next iteration.  */
5989           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5990                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5991                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5992                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5993             transform_pattern_stmt = true;
5994
5995           /* If pattern statement has def stmts, vectorize them too.  */
5996           if (is_pattern_stmt_p (stmt_info))
5997             {
                   /* Start walking the definition sequence on the first
                      visit, or step past the entry handled last time.  */
5998               if (pattern_def_seq == NULL)
5999                 {
6000                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6001                   pattern_def_si = gsi_start (pattern_def_seq);
6002                 }
6003               else if (!gsi_end_p (pattern_def_si))
6004                 gsi_next (&pattern_def_si);
6005               if (pattern_def_seq != NULL)
6006                 {
6007                   gimple pattern_def_stmt = NULL;
6008                   stmt_vec_info pattern_def_stmt_info = NULL;
6009
                       /* Advance to the next relevant or live definition
                          statement, if any.  */
6010                   while (!gsi_end_p (pattern_def_si))
6011                     {
6012                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6013                       pattern_def_stmt_info
6014                         = vinfo_for_stmt (pattern_def_stmt);
6015                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6016                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6017                         break;
6018                       gsi_next (&pattern_def_si);
6019                     }
6020
6021                   if (!gsi_end_p (pattern_def_si))
6022                     {
6023                       if (dump_enabled_p ())
6024                         {
6025                           dump_printf_loc (MSG_NOTE, vect_location,
6026                                            "==> vectorizing pattern def "
6027                                            "stmt: ");
6028                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6029                                             pattern_def_stmt, 0);
6030                           dump_printf (MSG_NOTE, "\n");
6031                         }
6032
6033                       stmt = pattern_def_stmt;
6034                       stmt_info = pattern_def_stmt_info;
6035                     }
6036                   else
6037                     {
                           /* Definition sequence exhausted: the pattern
                              statement itself is vectorized now.  */
6038                       pattern_def_si = gsi_none ();
6039                       transform_pattern_stmt = false;
6040                     }
6041                 }
6042               else
6043                 transform_pattern_stmt = false;
6044             }
6045
6046           if (STMT_VINFO_VECTYPE (stmt_info))
6047             {
6048               unsigned int nunits
6049                 = (unsigned int)
6050                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6051               if (!STMT_SLP_TYPE (stmt_info)
6052                   && nunits != (unsigned int) vectorization_factor
6053                   && dump_enabled_p ())
6054                   /* For SLP VF is set according to unrolling factor, and not
6055                      to vector size, hence for SLP this print is not valid.  */
6056                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6057             }
6058
6059           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6060              reached.  */
6061           if (STMT_SLP_TYPE (stmt_info))
6062             {
6063               if (!slp_scheduled)
6064                 {
6065                   slp_scheduled = true;
6066
6067                   if (dump_enabled_p ())
6068                     dump_printf_loc (MSG_NOTE, vect_location,
6069                                      "=== scheduling SLP instances ===\n");
6070
6071                   vect_schedule_slp (loop_vinfo, NULL);
6072                 }
6073
6074               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6075               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6076                 {
                       /* Nothing more to do for this statement; move on
                          unless pattern work is still pending.  */
6077                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6078                     {
6079                       pattern_def_seq = NULL;
6080                       gsi_next (&si);
6081                     }
6082                   continue;
6083                 }
6084             }
6085
6086           /* -------- vectorize statement ------------ */
6087           if (dump_enabled_p ())
6088             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6089
6090           grouped_store = false;
6091           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6092           if (is_store)
6093             {
6094               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6095                 {
6096                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6097                      interleaving chain was completed - free all the stores in
6098                      the chain.  */
6099                   gsi_next (&si);
6100                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6101                 }
6102               else
6103                 {
6104                   /* Free the attached stmt_vec_info and remove the stmt.  */
6105                   gimple store = gsi_stmt (si);
6106                   free_stmt_vec_info (store);
6107                   unlink_stmt_vdef (store);
6108                   gsi_remove (&si, true);
6109                   release_defs (store);
6110                 }
6111
6112               /* Stores can only appear at the end of pattern statements.  */
6113               gcc_assert (!transform_pattern_stmt);
6114               pattern_def_seq = NULL;
6115             }
6116           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6117             {
6118               pattern_def_seq = NULL;
6119               gsi_next (&si);
6120             }
6121         }                       /* stmts in BB */
6122     }                           /* BBs in loop */
6123
       /* Make the loop run RATIO times, the count computed above.  */
6124   slpeel_make_loop_iterate_ntimes (loop, ratio);
6125
6126   /* Reduce loop iterations by the vectorization factor.  */
6127   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6128                       expected_iterations / vectorization_factor);
       /* The recorded upper bound, and the estimate when there is one, are
          divided by VF; with peeling for gaps a non-zero result is lowered
          by one more.  */
6129   loop->nb_iterations_upper_bound
6130     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6131   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6132       && loop->nb_iterations_upper_bound != 0)
6133     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6134   if (loop->any_estimate)
6135     {
6136       loop->nb_iterations_estimate
6137         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6138        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6139            && loop->nb_iterations_estimate != 0)
6140          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6141     }
6142
6143   if (dump_enabled_p ())
6144     {
6145       dump_printf_loc (MSG_NOTE, vect_location,
6146                        "LOOP VECTORIZED\n");
6147       if (loop->inner)
6148         dump_printf_loc (MSG_NOTE, vect_location,
6149                          "OUTER LOOP VECTORIZED\n");
6150       dump_printf (MSG_NOTE, "\n");
6151     }
6152 }