gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "gimple-pretty-print.h"
  31 #include "tree-flow.h"
  32 #include "tree-pass.h"
  33 #include "cfgloop.h"
  34 #include "expr.h"
  35 #include "recog.h"
  36 #include "optabs.h"
  37 #include "params.h"
  38 #include "diagnostic-core.h"
  39 #include "tree-chrec.h"
  40 #include "tree-scalar-evolution.h"
  41 #include "tree-vectorizer.h"
  42 #include "target.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs").  These two types of data require different handling both
  77    during analysis and transformation.  The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFs whose base is an array DECL
  79    (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized.  It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S).  This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 127    Targets that can support different sizes of vectors, for now will need
 128    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 129    flexibility will be added in the future.
 130
 131         Since we only vectorize operations whose vector form can be
 132    expressed using existing tree codes, to verify that an operation is
 133    supported, the vectorizer checks the relevant optab at the relevant
 134    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 135    the value found is CODE_FOR_nothing, then there's no target support, and
 136    we can't vectorize the stmt.
 137
 138    For additional information on this project see:
 139    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 140 */
 141
 142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 143
 144 /* Function vect_determine_vectorization_factor
 145
 146    Determine the vectorization factor (VF).  VF is the number of data elements
 147    that are operated upon in parallel in a single iteration of the vectorized
 148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 150    elements can fit in a single vector register.
 151
 152    Stmts may operate on types of different sizes: each relevant stmt
 153    contributes the unit count of the vector type for its smallest scalar
 154    type, and VF is the largest such count.  Analysis fails if a vector type
 155    is unavailable, a stmt mixes vector sizes, or the resulting VF is <= 1.
 156
 157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 158    original loop:
 159         for (i=0; i<N; i++){
 160           a[i] = b[i] + c[i];
 161         }
 162
 163    vectorized loop:
 164         for (i=0; i<N; i+=VF){
 165           a[i:VF] = b[i:VF] + c[i:VF];
 166         }

        On success, store VF in LOOP_VINFO_VECT_FACTOR (LOOP_VINFO) and return
        true.  As a side effect, STMT_VINFO_VECTYPE is set for every analyzed
        PHI and stmt that did not already have one.  Return false (after
        dumping the reason, when dumping is enabled) if the loop cannot be
        handled.
 167 */
 168
 169 static bool
 170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 171 {
 172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 174   int nbbs = loop->num_nodes;
 175   gimple_stmt_iterator si;
 176   unsigned int vectorization_factor = 0;
 177   tree scalar_type;
 178   gimple phi;
 179   tree vectype;
 180   unsigned int nunits;
 181   stmt_vec_info stmt_info;
 182   int i;
 183   HOST_WIDE_INT dummy;
       /* PATTERN_STMT, PATTERN_DEF_SEQ, PATTERN_DEF_SI and ANALYZE_PATTERN_STMT
          carry state across iterations of the stmt loop below, so that a
          stmt's pattern stmt and its pattern def stmts are examined before SI
          advances.  */
 184   gimple stmt, pattern_stmt = NULL;
 185   gimple_seq pattern_def_seq = NULL;
 186   gimple_stmt_iterator pattern_def_si = gsi_none ();
 187   bool analyze_pattern_stmt = false;
 188
 189   if (dump_enabled_p ())
 190     dump_printf_loc (MSG_NOTE, vect_location,
 191                      "=== vect_determine_vectorization_factor ===");
 192
 193   for (i = 0; i < nbbs; i++)
 194     {
 195       basic_block bb = bbs[i];
 196
           /* Give every relevant PHI a vectype derived from the type of its
              result, and fold its unit count into the running VF.  */
 197       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 198         {
 199           phi = gsi_stmt (si);
 200           stmt_info = vinfo_for_stmt (phi);
 201           if (dump_enabled_p ())
 202             {
 203               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 204               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 205             }
 206
 207           gcc_assert (stmt_info);
 208
 209           if (STMT_VINFO_RELEVANT_P (stmt_info))
 210             {
 211               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 212               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 213
 214               if (dump_enabled_p ())
 215                 {
 216                   dump_printf_loc (MSG_NOTE, vect_location,
 217                                    "get vectype for scalar type:  ");
 218                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 219                 }
 220
 221               vectype = get_vectype_for_scalar_type (scalar_type);
 222               if (!vectype)
 223                 {
 224                   if (dump_enabled_p ())
 225                     {
 226                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 227                                        "not vectorized: unsupported "
 228                                        "data-type ");
 229                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 230                                          scalar_type);
 231                     }
 232                   return false;
 233                 }
 234               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 235
 236               if (dump_enabled_p ())
 237                 {
 238                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 239                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 240                 }
 241
 242               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 243               if (dump_enabled_p ())
 244                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
 245
 246               if (!vectorization_factor
 247                   || (nunits > vectorization_factor))
 248                 vectorization_factor = nunits;
 249             }
 250         }
 251
           /* Walk the stmts of BB.  SI is advanced explicitly inside the body
              rather than in the loop header, and the loop keeps running past
              the end of the block while a pattern stmt is still pending.  */
 252       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 253         {
 254           tree vf_vectype;
 255
 256           if (analyze_pattern_stmt)
 257             stmt = pattern_stmt;
 258           else
 259             stmt = gsi_stmt (si);
 260
 261           stmt_info = vinfo_for_stmt (stmt);
 262
 263           if (dump_enabled_p ())
 264             {
 265               dump_printf_loc (MSG_NOTE, vect_location,
 266                                "==> examining statement: ");
 267               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 268             }
 269
 270           gcc_assert (stmt_info);
 271
 272           /* Skip stmts which do not need to be vectorized.  */
 273           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 274               && !STMT_VINFO_LIVE_P (stmt_info))
 275             {
                   /* STMT itself is not needed, but its related pattern stmt
                      may be; if so, analyze that one in its place.  */
 276               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 277                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 278                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 279                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 280                 {
 281                   stmt = pattern_stmt;
 282                   stmt_info = vinfo_for_stmt (pattern_stmt);
 283                   if (dump_enabled_p ())
 284                     {
 285                       dump_printf_loc (MSG_NOTE, vect_location,
 286                                        "==> examining pattern statement: ");
 287                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288                     }
 289                 }
 290               else
 291                 {
 292                   if (dump_enabled_p ())
 293                     dump_printf_loc (MSG_NOTE, vect_location, "skip.");
 294                   gsi_next (&si);
 295                   continue;
 296                 }
 297             }
 298           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 299                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 300                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 301                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                 /* STMT itself is analyzed in this iteration; the flag makes
                    the next iteration pick up PATTERN_STMT (see the top of the
                    loop body) before SI is advanced.  */
 302             analyze_pattern_stmt = true;
 303
 304           /* If a pattern statement has def stmts, analyze them too.  */
 305           if (is_pattern_stmt_p (stmt_info))
 306             {
 307               if (pattern_def_seq == NULL)
 308                 {
 309                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 310                   pattern_def_si = gsi_start (pattern_def_seq);
 311                 }
 312               else if (!gsi_end_p (pattern_def_si))
 313                 gsi_next (&pattern_def_si);
 314               if (pattern_def_seq != NULL)
 315                 {
 316                   gimple pattern_def_stmt = NULL;
 317                   stmt_vec_info pattern_def_stmt_info = NULL;
 318
                       /* Find the next def stmt in the sequence that is
                          relevant or live.  */
 319                   while (!gsi_end_p (pattern_def_si))
 320                     {
 321                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 322                       pattern_def_stmt_info
 323                         = vinfo_for_stmt (pattern_def_stmt);
 324                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 325                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 326                         break;
 327                       gsi_next (&pattern_def_si);
 328                     }
 329
 330                   if (!gsi_end_p (pattern_def_si))
 331                     {
 332                       if (dump_enabled_p ())
 333                         {
 334                           dump_printf_loc (MSG_NOTE, vect_location,
 335                                            "==> examining pattern def stmt: ");
 336                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 337                                             pattern_def_stmt, 0);
 338                         }
 339
 340                       stmt = pattern_def_stmt;
 341                       stmt_info = pattern_def_stmt_info;
 342                     }
 343                   else
 344                     {
                           /* Sequence exhausted: the pattern stmt itself is
                              analyzed in this iteration.  */
 345                       pattern_def_si = gsi_none ();
 346                       analyze_pattern_stmt = false;
 347                     }
 348                 }
 349               else
 350                 analyze_pattern_stmt = false;
 351             }
 352
 353           if (gimple_get_lhs (stmt) == NULL_TREE)
 354             {
 355               if (dump_enabled_p ())
 356                 {
 357                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 358                                    "not vectorized: irregular stmt.");
 359                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 360                                     0);
 361                 }
 362               return false;
 363             }
 364
 365           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 366             {
 367               if (dump_enabled_p ())
 368                 {
 369                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 370                                    "not vectorized: vector stmt in loop:");
 371                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 372                 }
 373               return false;
 374             }
 375
 376           if (STMT_VINFO_VECTYPE (stmt_info))
 377             {
 378               /* The only case when a vectype had been already set is for stmts
 379                  that contain a dataref, or for "pattern-stmts" (stmts
 380                  generated by the vectorizer to represent/replace a certain
 381                  idiom).  */
 382               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 383                           || is_pattern_stmt_p (stmt_info)
 384                           || !gsi_end_p (pattern_def_si));
 385               vectype = STMT_VINFO_VECTYPE (stmt_info);
 386             }
 387           else
 388             {
                   /* No vectype recorded yet: derive one from the type of the
                      stmt's lhs and record it.  */
 389               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 390               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_NOTE, vect_location,
 394                                    "get vectype for scalar type:  ");
 395                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 396                 }
 397               vectype = get_vectype_for_scalar_type (scalar_type);
 398               if (!vectype)
 399                 {
 400                   if (dump_enabled_p ())
 401                     {
 402                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 403                                        "not vectorized: unsupported "
 404                                        "data-type ");
 405                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 406                                          scalar_type);
 407                     }
 408                   return false;
 409                 }
 410
 411               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 412             }
 413
 414           /* The vectorization factor is according to the smallest
 415              scalar type (or the largest vector size, but we only
 416              support one vector size per loop).  */
 417           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 418                                                        &dummy);
 419           if (dump_enabled_p ())
 420             {
 421               dump_printf_loc (MSG_NOTE, vect_location,
 422                                "get vectype for scalar type:  ");
 423               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 424             }
 425           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 426           if (!vf_vectype)
 427             {
 428               if (dump_enabled_p ())
 429                 {
 430                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 431                                    "not vectorized: unsupported data-type ");
 432                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 433                                      scalar_type);
 434                 }
 435               return false;
 436             }
 437
               /* VECTYPE (the stmt's own vector type) and VF_VECTYPE (the
                  vector type for its smallest scalar type) must have modes of
                  the same size.  */
 438           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 439                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 440             {
 441               if (dump_enabled_p ())
 442                 {
 443                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 444                                    "not vectorized: different sized vector "
 445                                    "types in statement, ");
 446                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 447                                      vectype);
 448                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 449                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 450                                      vf_vectype);
 451                 }
 452               return false;
 453             }
 454
 455           if (dump_enabled_p ())
 456             {
 457               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 458               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 459             }
 460
               /* Keep the largest unit count seen so far as the VF.  */
 461           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 462           if (dump_enabled_p ())
 463             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
 464           if (!vectorization_factor
 465               || (nunits > vectorization_factor))
 466             vectorization_factor = nunits;
 467
               /* Move SI to the next stmt only when no pattern stmt is pending
                  and PATTERN_DEF_SI is at its end; otherwise the same position
                  is revisited on the next iteration.  */
 468           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 469             {
 470               pattern_def_seq = NULL;
 471               gsi_next (&si);
 472             }
 473         }
 474     }
 475
 476   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 477   if (dump_enabled_p ())
 478     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
 479                      vectorization_factor);
       /* This also rejects the case where no PHI or stmt contributed, which
          leaves the factor at its initial value of 0.  */
 480   if (vectorization_factor <= 1)
 481     {
 482       if (dump_enabled_p ())
 483         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 484                          "not vectorized: unsupported data-type");
 485       return false;
 486     }
 487   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 488
 489   return true;
 490 }
 491
 492
 493 /* Function vect_is_simple_iv_evolution.
 494
 495    FORNOW: A simple evolution of an induction variables in the loop is
 496    considered a polynomial evolution with constant step.  */
 497
 498 static bool
 499 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 500                              tree * step)
 501 {
 502   tree init_expr;
 503   tree step_expr;
 504   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 505
 506   /* When there is no evolution in this loop, the evolution function
 507      is not "simple".  */
 508   if (evolution_part == NULL_TREE)
 509     return false;
 510
 511   /* When the evolution is a polynomial of degree >= 2
 512      the evolution function is not "simple".  */
 513   if (tree_is_chrec (evolution_part))
 514     return false;
 515
 516   step_expr = evolution_part;
 517   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 518
 519   if (dump_enabled_p ())
 520     {
 521       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 522       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 523       dump_printf (MSG_NOTE, ",  init: ");
 524       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 525     }
 526
 527   *init = init_expr;
 528   *step = step_expr;
 529
 530   if (TREE_CODE (step_expr) != INTEGER_CST)
 531     {
 532       if (dump_enabled_p ())
 533         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                          "step unknown.");
 535       return false;
 536     }
 537
 538   return true;
 539 }
 540
 541 /* Function vect_analyze_scalar_cycles_1.
 542
 543    Examine the cross iteration def-use cycles of scalar variables
 544    in LOOP.  LOOP_VINFO represents the loop that is now being
 545    considered for vectorization (can be LOOP, or an outer-loop
 546    enclosing LOOP).  */
 547
 548 static void
 549 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 550 {
 551   basic_block bb = loop->header;
 552   tree dumy;
 553   vec<gimple> worklist;
 554   worklist.create (64);
 555   gimple_stmt_iterator gsi;
 556   bool double_reduc;
 557
 558   if (dump_enabled_p ())
 559     dump_printf_loc (MSG_NOTE, vect_location,
 560                      "=== vect_analyze_scalar_cycles ===");
 561
 562   /* First - identify all inductions.  Reduction detection assumes that all the
 563      inductions have been identified, therefore, this order must not be
 564      changed.  */
 565   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 566     {
 567       gimple phi = gsi_stmt (gsi);
 568       tree access_fn = NULL;
 569       tree def = PHI_RESULT (phi);
 570       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 571
 572       if (dump_enabled_p ())
 573         {
 574           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 575           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 576         }
 577
 578       /* Skip virtual phi's.  The data dependences that are associated with
 579          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 580       if (virtual_operand_p (def))
 581         continue;
 582
 583       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 584
 585       /* Analyze the evolution function.  */
 586       access_fn = analyze_scalar_evolution (loop, def);
 587       if (access_fn)
 588         {
 589           STRIP_NOPS (access_fn);
 590           if (dump_enabled_p ())
 591             {
 592               dump_printf_loc (MSG_NOTE, vect_location,
 593                                "Access function of PHI: ");
 594               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 595             }
 596           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 597             = evolution_part_in_loop_num (access_fn, loop->num);
 598         }
 599
 600       if (!access_fn
 601           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 602         {
 603           worklist.safe_push (phi);
 604           continue;
 605         }
 606
 607       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
 611       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 612     }
 613
 614
 615   /* Second - identify all reductions and nested cycles.  */
 616   while (worklist.length () > 0)
 617     {
 618       gimple phi = worklist.pop ();
 619       tree def = PHI_RESULT (phi);
 620       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 621       gimple reduc_stmt;
 622       bool nested_cycle;
 623
 624       if (dump_enabled_p ())
 625         {
 626           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 627           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 628         }
 629
 630       gcc_assert (!virtual_operand_p (def)
 631                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 632
 633       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 634       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 635                                                 &double_reduc);
 636       if (reduc_stmt)
 637         {
 638           if (double_reduc)
 639             {
 640               if (dump_enabled_p ())
 641                 dump_printf_loc (MSG_NOTE, vect_location,
 642                                  "Detected double reduction.");
 643
 644               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 645               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 646                                                     vect_double_reduction_def;
 647             }
 648           else
 649             {
 650               if (nested_cycle)
 651                 {
 652                   if (dump_enabled_p ())
 653                     dump_printf_loc (MSG_NOTE, vect_location,
 654                                      "Detected vectorizable nested cycle.");
 655
 656                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 657                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 658                                                              vect_nested_cycle;
 659                 }
 660               else
 661                 {
 662                   if (dump_enabled_p ())
 663                     dump_printf_loc (MSG_NOTE, vect_location,
 664                                      "Detected reduction.");
 665
 666                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 667                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 668                                                            vect_reduction_def;
 669                   /* Store the reduction cycles for possible vectorization in
 670                      loop-aware SLP.  */
 671                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 672                 }
 673             }
 674         }
 675       else
 676         if (dump_enabled_p ())
 677           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 678                            "Unknown def-use cycle pattern.");
 679     }
 680
 681   worklist.release ();
 682 }
 683
 684
 685 /* Function vect_analyze_scalar_cycles.
 686
 687    Examine the cross iteration def-use cycles of scalar variables, by
 688    analyzing the loop-header PHIs of scalar variables.  Classify each
 689    cycle as one of the following: invariant, induction, reduction, unknown.
 690    We do that for the loop represented by LOOP_VINFO, and also to its
 691    inner-loop, if exists.
 692    Examples for scalar cycles:
 693
 694    Example1: reduction:
 695
 696               loop1:
 697               for (i=0; i<N; i++)
 698                  sum += a[i];
 699
 700    Example2: induction:
 701
 702               loop2:
 703               for (i=0; i<N; i++)
 704                  a[i] = i;  */
 705
 706 static void
 707 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 708 {
 709   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 710
 711   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 712
 713   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 714      Reductions in such inner-loop therefore have different properties than
 715      the reductions in the nest that gets vectorized:
 716      1. When vectorized, they are executed in the same order as in the original
 717         scalar loop, so we can't change the order of computation when
 718         vectorizing them.
 719      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 720         current checks are too strict.  */
 721
 722   if (loop->inner)
 723     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 724 }
 725
 726 /* Function vect_get_loop_niters.
 727
 728    Determine how many iterations the loop is executed.
 729    If an expression that represents the number of iterations
 730    can be constructed, place it in NUMBER_OF_ITERATIONS.
 731    Return the loop exit condition.  */
 732
 733 static gimple
 734 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 735 {
 736   tree niters;
 737
 738   if (dump_enabled_p ())
 739     dump_printf_loc (MSG_NOTE, vect_location,
 740                      "=== get_loop_niters ===");
 741   niters = number_of_exit_cond_executions (loop);
 742
 743   if (niters != NULL_TREE
 744       && niters != chrec_dont_know)
 745     {
 746       *number_of_iterations = niters;
 747
 748       if (dump_enabled_p ())
 749         {
 750           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
 751           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
 752         }
 753     }
 754
 755   return get_loop_exit_condition (loop);
 756 }
 757
 758
 759 /* Function bb_in_loop_p
 760
 761    Used as predicate for dfs order traversal of the loop bbs.  */
 762
 763 static bool
 764 bb_in_loop_p (const_basic_block bb, const void *data)
 765 {
 766   const struct loop *const loop = (const struct loop *)data;
 767   if (flow_bb_inside_loop_p (loop, bb))
 768     return true;
 769   return false;
 770 }
 771
 772
 773 /* Function new_loop_vec_info.
 774    Create and initialize a new loop_vec_info struct for LOOP, as well as
 775    stmt_vec_info structs for all the stmts in LOOP.  Stmts of a nested
 776    inner-loop keep their stmt_vec_info; it is re-pointed at the new struct.  */
 777 /* Returns the new struct, whose BBS array holds LOOP's blocks in DFS order.  */
 778 static loop_vec_info
 779 new_loop_vec_info (struct loop *loop)
 780 {
 781   loop_vec_info res;
 782   basic_block *bbs;
 783   gimple_stmt_iterator si;
 784   unsigned int i, nbbs;
 785
 786   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 787   LOOP_VINFO_LOOP (res) = loop;
 788   /* This first array only drives the stmt walk; it is freed and replaced below.  */
 789   bbs = get_loop_body (loop);
 790
 791   /* Create/Update stmt_info for all stmts in the loop.  */
 792   for (i = 0; i < loop->num_nodes; i++)
 793     {
 794       basic_block bb = bbs[i];
 795
 796       /* BBs in a nested inner-loop will have been already processed (because
 797          we will have called vect_analyze_loop_form for any nested inner-loop).
 798          Therefore, for stmts in an inner-loop we just want to update the
 799          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 800          loop_info of the outer-loop we are currently considering to vectorize
 801          (instead of the loop_info of the inner-loop).
 802          For stmts in other BBs we need to create a stmt_info from scratch.  */
 803       if (bb->loop_father != loop)
 804         {
 805           /* Inner-loop bb.  */
 806           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 807           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 808             {
 809               gimple phi = gsi_stmt (si);
 810               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 811               loop_vec_info inner_loop_vinfo =
 812                 STMT_VINFO_LOOP_VINFO (stmt_info);
 813               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 814               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 815             }
 816           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 817            {
 818               gimple stmt = gsi_stmt (si);
 819               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 820               loop_vec_info inner_loop_vinfo =
 821                  STMT_VINFO_LOOP_VINFO (stmt_info);
 822               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 823               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 824            }
 825         }
 826       else
 827         {
 828           /* bb in current nest.  */
 829           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 830             {
 831               gimple phi = gsi_stmt (si);
 832               gimple_set_uid (phi, 0);
 833               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 834             }
 835
 836           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 837             {
 838               gimple stmt = gsi_stmt (si);
 839               gimple_set_uid (stmt, 0);
 840               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 841             }
 842         }
 843     }
 844
 845   /* CHECKME: We want to visit all BBs before their successors (except for
 846      latch blocks, for which this assertion wouldn't hold).  In the simple
 847      case of the loop forms we allow, a dfs order of the BBs would be the
 848      same as reversed postorder traversal, so we are safe.  */
 849
 850    free (bbs);
 851    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 852    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 853                               bbs, loop->num_nodes, loop);
 854    gcc_assert (nbbs == loop->num_nodes);
 855   /* The DFS-ordered array is now owned by RES; set up the remaining fields.  */
 856   LOOP_VINFO_BBS (res) = bbs;
 857   LOOP_VINFO_NITERS (res) = NULL;
 858   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 859   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 860   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 861   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 862   LOOP_VINFO_VECT_FACTOR (res) = 0;
 863   LOOP_VINFO_LOOP_NEST (res).create (3);
 864   LOOP_VINFO_DATAREFS (res).create (10);
 865   LOOP_VINFO_DDRS (res).create (10 * 10);
 866   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 867   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 868              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 869   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 870              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 871   LOOP_VINFO_GROUPED_STORES (res).create (10);
 872   LOOP_VINFO_REDUCTIONS (res).create (10);
 873   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 874   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 875   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 876   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 877   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 878   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 879   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 880
 881   return res;
 882 }
 883
 884
 885 /* Function destroy_loop_vec_info.
 886    Free LOOP_VINFO struct.  If CLEAN_STMTS, also free the stmt_vec_info
 887    structs of all the stmts in the loop (restoring swapped operands first);
 888    otherwise the stmts are left alone.  A NULL LOOP_VINFO is a no-op.  */
 889 /* In both cases LOOP_VINFO's loop gets its aux field reset to NULL.  */
 890 void
 891 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 892 {
 893   struct loop *loop;
 894   basic_block *bbs;
 895   int nbbs;
 896   gimple_stmt_iterator si;
 897   int j;
 898   vec<slp_instance> slp_instances;
 899   slp_instance instance;
 900   bool swapped;
 901
 902   if (!loop_vinfo)
 903     return;
 904
 905   loop = LOOP_VINFO_LOOP (loop_vinfo);
 906
 907   bbs = LOOP_VINFO_BBS (loop_vinfo);
 908   nbbs = loop->num_nodes;
 909   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 910   /* Short path: free the struct but do not touch any stmt_vec_info.  */
 911   if (!clean_stmts)
 912     {
 913       free (LOOP_VINFO_BBS (loop_vinfo));
 914       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 915       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 916       LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 917       LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 918       LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 919       /* NOTE(review): vecs, htab and cost data released further below are kept here -- confirm intent.  */
 920       free (loop_vinfo);
 921       loop->aux = NULL;
 922       return;
 923     }
 924   /* Full path: walk every block and free the stmt_vec_info of each phi and stmt.  */
 925   for (j = 0; j < nbbs; j++)
 926     {
 927       basic_block bb = bbs[j];
 928       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 929         free_stmt_vec_info (gsi_stmt (si));
 930
 931       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 932         {
 933           gimple stmt = gsi_stmt (si);
 934
 935           /* We may have broken canonical form by moving a constant
 936              into RHS1 of a commutative op.  Fix such occurrences.  */
 937           if (swapped && is_gimple_assign (stmt))
 938             {
 939               enum tree_code code = gimple_assign_rhs_code (stmt);
 940
 941               if ((code == PLUS_EXPR
 942                    || code == POINTER_PLUS_EXPR
 943                    || code == MULT_EXPR)
 944                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 945                 swap_tree_operands (stmt,
 946                                     gimple_assign_rhs1_ptr (stmt),
 947                                     gimple_assign_rhs2_ptr (stmt));
 948             }
 949
 950           /* Free stmt_vec_info.  */
 951           free_stmt_vec_info (stmt);
 952           gsi_next (&si);
 953         }
 954     }
 955   /* Release everything hanging off LOOP_VINFO, then the struct itself.  */
 956   free (LOOP_VINFO_BBS (loop_vinfo));
 957   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 958   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 959   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 960   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 961   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 962   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 963   FOR_EACH_VEC_ELT (slp_instances, j, instance)
 964     vect_free_slp_instance (instance);
 965
 966   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 967   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
 968   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
 969   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
 970
 971   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 972     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 973
 974   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
 975
 976   free (loop_vinfo);
 977   loop->aux = NULL;
 978 }
 979
 980
 981 /* Function vect_analyze_loop_1.
 982
 983    Build a loop_vec_info for LOOP by running only the loop-form analysis
 984    (vect_analyze_loop_form).  This is a subset of the analyses applied in
 985    vect_analyze_loop, meant for an inner-loop nested in the loop that is
 986    now considered for (outer-loop) vectorization.  Returns NULL when the
 987    form of LOOP is rejected.  */
 988
 989 static loop_vec_info
 990 vect_analyze_loop_1 (struct loop *loop)
 991 {
 992   loop_vec_info inner_vinfo;
 993
 994   if (dump_enabled_p ())
 995     dump_printf_loc (MSG_NOTE, vect_location,
 996                      "===== analyze_loop_nest_1 =====");
 997
 998   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
 999   inner_vinfo = vect_analyze_loop_form (loop);
1000
1001   /* Only a rejected loop form needs to be reported.  */
1002   if (!inner_vinfo && dump_enabled_p ())
1003     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1004                      "bad inner-loop form.");
1005
1006   /* INNER_VINFO is NULL exactly when vect_analyze_loop_form rejected LOOP,
1007      so the same value serves as both the success and the failure
1008      result.  */
1009   return inner_vinfo;
1010 }
1011
1012
1013 /* Function vect_analyze_loop_form.
1014
1015    Verify that certain CFG restrictions hold, including:
1016    - the loop has a pre-header
1017    - the loop has a single entry and exit
1018    - the loop exit condition is simple enough, and the number of iterations
1019      can be analyzed (a countable loop).  */
1020 /* Returns a new loop_vec_info, also stored in LOOP->aux, or NULL on failure.  */
1021 loop_vec_info
1022 vect_analyze_loop_form (struct loop *loop)
1023 {
1024   loop_vec_info loop_vinfo;
1025   gimple loop_cond;
1026   tree number_of_iterations = NULL;
1027   loop_vec_info inner_loop_vinfo = NULL;
1028
1029   if (dump_enabled_p ())
1030     dump_printf_loc (MSG_NOTE, vect_location,
1031                      "=== vect_analyze_loop_form ===");
1032
1033   /* Different restrictions apply when we are considering an inner-most loop,
1034      vs. an outer (nested) loop.
1035      (FORNOW. May want to relax some of these restrictions in the future).  */
1036
1037   if (!loop->inner)
1038     {
1039       /* Inner-most loop.  We currently require that the number of BBs is
1040          exactly 2 (the header and latch).  Vectorizable inner-most loops
1041          look like this:
1042
1043                         (pre-header)
1044                            |
1045                           header <--------+
1046                            | |            |
1047                            | +--> latch --+
1048                            |
1049                         (exit-bb)  */
1050
1051       if (loop->num_nodes != 2)
1052         {
1053           if (dump_enabled_p ())
1054             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                              "not vectorized: control flow in loop.");
1056           return NULL;
1057         }
1058
1059       if (empty_block_p (loop->header))
1060     {
1061           if (dump_enabled_p ())
1062             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063                              "not vectorized: empty loop.");
1064       return NULL;
1065     }
1066     }
1067   else
1068     {
1069       struct loop *innerloop = loop->inner;
1070       edge entryedge;
1071
1072       /* Nested loop. We currently require that the loop is doubly-nested,
1073          contains a single inner loop, and the number of BBs is exactly 5.
1074          Vectorizable outer-loops look like this:
1075
1076                         (pre-header)
1077                            |
1078                           header <---+
1079                            |         |
1080                           inner-loop |
1081                            |         |
1082                           tail ------+
1083                            |
1084                         (exit-bb)
1085
1086          The inner-loop has the properties expected of inner-most loops
1087          as described above.  */
1088
1089       if ((loop->inner)->inner || (loop->inner)->next)
1090         {
1091           if (dump_enabled_p ())
1092             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1093                              "not vectorized: multiple nested loops.");
1094           return NULL;
1095         }
1096
1097       /* Analyze the inner-loop.  */
1098       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1099       if (!inner_loop_vinfo)
1100         {
1101           if (dump_enabled_p ())
1102             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103                              "not vectorized: Bad inner loop.");
1104           return NULL;
1105         }
1106
1107       if (!expr_invariant_in_loop_p (loop,
1108                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1109         {
1110           if (dump_enabled_p ())
1111             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112                              "not vectorized: inner-loop count not invariant.");
1113           destroy_loop_vec_info (inner_loop_vinfo, true);
1114           return NULL;
1115         }
1116
1117       if (loop->num_nodes != 5)
1118         {
1119           if (dump_enabled_p ())
1120             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1121                              "not vectorized: control flow in loop.");
1122           destroy_loop_vec_info (inner_loop_vinfo, true);
1123           return NULL;
1124         }
1125
1126       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1127       entryedge = EDGE_PRED (innerloop->header, 0);
1128       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1129         entryedge = EDGE_PRED (innerloop->header, 1);
1130
1131       if (entryedge->src != loop->header
1132           || !single_exit (innerloop)
1133           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1134         {
1135           if (dump_enabled_p ())
1136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137                              "not vectorized: unsupported outerloop form.");
1138           destroy_loop_vec_info (inner_loop_vinfo, true);
1139           return NULL;
1140         }
1141
1142       if (dump_enabled_p ())
1143         dump_printf_loc (MSG_NOTE, vect_location,
1144                          "Considering outer-loop vectorization.");
1145     }
1146
1147   if (!single_exit (loop)
1148       || EDGE_COUNT (loop->header->preds) != 2)
1149     {
1150       if (dump_enabled_p ())
1151         {
1152           if (!single_exit (loop))
1153             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1154                              "not vectorized: multiple exits.");
1155           else if (EDGE_COUNT (loop->header->preds) != 2)
1156             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157                              "not vectorized: too many incoming edges.");
1158         }
1159       if (inner_loop_vinfo)
1160         destroy_loop_vec_info (inner_loop_vinfo, true);
1161       return NULL;
1162     }
1163
1164   /* We assume that the loop exit condition is at the end of the loop. i.e,
1165      that the loop is represented as a do-while (with a proper if-guard
1166      before the loop if needed), where the loop header contains all the
1167      executable statements, and the latch is empty.  */
1168   if (!empty_block_p (loop->latch)
1169       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1170     {
1171       if (dump_enabled_p ())
1172         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1173                          "not vectorized: latch block not empty.");
1174       if (inner_loop_vinfo)
1175         destroy_loop_vec_info (inner_loop_vinfo, true);
1176       return NULL;
1177     }
1178
1179   /* Make sure there exists a single-predecessor exit bb:  */
1180   if (!single_pred_p (single_exit (loop)->dest))
1181     {
1182       edge e = single_exit (loop);
1183       if (!(e->flags & EDGE_ABNORMAL))
1184         {
1185           split_loop_exit_edge (e);
1186           if (dump_enabled_p ())
1187             dump_printf (MSG_NOTE, "split exit edge.");
1188         }
1189       else
1190         {
1191           if (dump_enabled_p ())
1192             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193                              "not vectorized: abnormal loop exit edge.");
1194           if (inner_loop_vinfo)
1195             destroy_loop_vec_info (inner_loop_vinfo, true);
1196           return NULL;
1197         }
1198     }
1199
1200   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1201   if (!loop_cond)
1202     {
1203       if (dump_enabled_p ())
1204         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1205                          "not vectorized: complicated exit condition.");
1206       if (inner_loop_vinfo)
1207         destroy_loop_vec_info (inner_loop_vinfo, true);
1208       return NULL;
1209     }
1210
1211   if (!number_of_iterations)
1212     {
1213       if (dump_enabled_p ())
1214         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215                          "not vectorized: number of iterations cannot be "
1216                          "computed.");
1217       if (inner_loop_vinfo)
1218         destroy_loop_vec_info (inner_loop_vinfo, true);
1219       return NULL;
1220     }
1221
1222   if (chrec_contains_undetermined (number_of_iterations))
1223     {
1224       if (dump_enabled_p ())
1225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1226                              "Infinite number of iterations.");
1227       if (inner_loop_vinfo)
1228         destroy_loop_vec_info (inner_loop_vinfo, true);
1229       return NULL;
1230     }
1231
1232   if (!NITERS_KNOWN_P (number_of_iterations))
1233     {
1234       if (dump_enabled_p ())
1235         {
1236           dump_printf_loc (MSG_NOTE, vect_location,
1237                            "Symbolic number of iterations is ");
1238           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1239         }
1240     }
1241   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1242     {
1243       if (dump_enabled_p ())
1244         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1245                          "not vectorized: number of iterations = 0.");
1246       if (inner_loop_vinfo)  /* Inner stmt vinfos are still owned here: clean them too.  */
1247         destroy_loop_vec_info (inner_loop_vinfo, true);
1248       return NULL;
1249     }
1250
1251   loop_vinfo = new_loop_vec_info (loop);
1252   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1253   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1254
1255   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1256   /* The inner stmts now belong to LOOP_VINFO; free only the inner struct.  */
1257   /* CHECKME: May want to keep it around in the future.  */
1258   if (inner_loop_vinfo)
1259     destroy_loop_vec_info (inner_loop_vinfo, false);
1260
1261   gcc_assert (!loop->aux);
1262   loop->aux = loop_vinfo;
1263   return loop_vinfo;
1264 }
1265
1266
1267 /* Function vect_analyze_loop_operations.
1268    Scan the loop stmts and make sure they are all vectorizable, then run the
1269    iteration-count, cost-model and epilog checks.  Returns true on success.  */
1270 /* If SLP, the vectorization factor is first updated from the SLP unrolling factor.  */
1271 static bool
1272 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1273 {
1274   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1275   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1276   int nbbs = loop->num_nodes;
1277   gimple_stmt_iterator si;
1278   unsigned int vectorization_factor = 0;
1279   int i;
1280   gimple phi;
1281   stmt_vec_info stmt_info;
1282   bool need_to_vectorize = false;
1283   int min_profitable_iters;
1284   int min_scalar_loop_bound;
1285   unsigned int th;
1286   bool only_slp_in_loop = true, ok;
1287   HOST_WIDE_INT max_niter;
1288   HOST_WIDE_INT estimated_niter;
1289   int min_profitable_estimate;
1290
1291   if (dump_enabled_p ())
1292     dump_printf_loc (MSG_NOTE, vect_location,
1293                      "=== vect_analyze_loop_operations ===");
1294
1295   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1296   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1297   if (slp)
1298     {
1299       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1300          vectorization factor of the loop is the unrolling factor required by
1301          the SLP instances.  If that unrolling factor is 1, we say, that we
1302          perform pure SLP on loop - cross iteration parallelism is not
1303          exploited.  */
1304       for (i = 0; i < nbbs; i++)
1305         {
1306           basic_block bb = bbs[i];
1307           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1308             {
1309               gimple stmt = gsi_stmt (si);
1310               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1311               gcc_assert (stmt_info);
1312               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1313                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1314                   && !PURE_SLP_STMT (stmt_info))
1315                 /* STMT needs both SLP and loop-based vectorization.  */
1316                 only_slp_in_loop = false;
1317             }
1318         }
1319
1320       if (only_slp_in_loop)
1321         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1322       else
1323         vectorization_factor = least_common_multiple (vectorization_factor,
1324                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1325
1326       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1327       if (dump_enabled_p ())
1328         dump_printf_loc (MSG_NOTE, vect_location,
1329                          "Updating vectorization factor to %d ",
1330                          vectorization_factor);
1331     }
1332   /* Examine the phis and then the stmts of each block of the loop.  */
1333   for (i = 0; i < nbbs; i++)
1334     {
1335       basic_block bb = bbs[i];
1336
1337       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1338         {
1339           phi = gsi_stmt (si);
1340           ok = true;
1341
1342           stmt_info = vinfo_for_stmt (phi);
1343           if (dump_enabled_p ())
1344             {
1345               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1346               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1347             }
1348
1349           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1350              (i.e., a phi in the tail of the outer-loop).  */
1351           if (! is_loop_header_bb_p (bb))
1352             {
1353               /* FORNOW: we currently don't support the case that these phis
1354                  are not used in the outerloop (unless it is double reduction,
1355                  i.e., this phi is vect_reduction_def), cause this case
1356                  requires to actually do something here.  */
1357               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1358                    || STMT_VINFO_LIVE_P (stmt_info))
1359                   && STMT_VINFO_DEF_TYPE (stmt_info)
1360                      != vect_double_reduction_def)
1361                 {
1362                   if (dump_enabled_p ())
1363                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1364                                      "Unsupported loop-closed phi in "
1365                                      "outer-loop.");
1366                   return false;
1367                 }
1368
1369               /* If PHI is used in the outer loop, we check that its operand
1370                  is defined in the inner loop.  */
1371               if (STMT_VINFO_RELEVANT_P (stmt_info))
1372                 {
1373                   tree phi_op;
1374                   gimple op_def_stmt;
1375
1376                   if (gimple_phi_num_args (phi) != 1)
1377                     return false;
1378
1379                   phi_op = PHI_ARG_DEF (phi, 0);
1380                   if (TREE_CODE (phi_op) != SSA_NAME)
1381                     return false;
1382
1383                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1384                   if (!op_def_stmt
1385                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1386                       || !vinfo_for_stmt (op_def_stmt))
1387                     return false;
1388
1389                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1390                         != vect_used_in_outer
1391                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1392                            != vect_used_in_outer_by_reduction)
1393                     return false;
1394                 }
1395
1396               continue;
1397             }
1398
1399           gcc_assert (stmt_info);
1400
1401           if (STMT_VINFO_LIVE_P (stmt_info))
1402             {
1403               /* FORNOW: not yet supported.  */
1404               if (dump_enabled_p ())
1405                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1406                                  "not vectorized: value used after loop.");
1407               return false;
1408             }
1409
1410           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1411               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1412             {
1413               /* A scalar-dependence cycle that we don't support.  */
1414               if (dump_enabled_p ())
1415                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1416                                  "not vectorized: scalar dependence cycle.");
1417               return false;
1418             }
1419
1420           if (STMT_VINFO_RELEVANT_P (stmt_info))
1421             {
1422               need_to_vectorize = true;
1423               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1424                 ok = vectorizable_induction (phi, NULL, NULL);
1425             }
1426
1427           if (!ok)
1428             {
1429               if (dump_enabled_p ())
1430                 {
1431                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1432                                    "not vectorized: relevant phi not "
1433                                    "supported: ");
1434                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1435                 }
1436               return false;
1437             }
1438         }
1439
1440       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1441         {
1442           gimple stmt = gsi_stmt (si);
1443           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1444             return false;
1445         }
1446     } /* bbs */
1447
1448   /* All operations in the loop are either irrelevant (deal with loop
1449      control, or dead), or only used outside the loop and can be moved
1450      out of the loop (e.g. invariants, inductions).  The loop can be
1451      optimized away by scalar optimizations.  We're better off not
1452      touching this loop.  */
1453   if (!need_to_vectorize)
1454     {
1455       if (dump_enabled_p ())
1456         dump_printf_loc (MSG_NOTE, vect_location,
1457                          "All the computation can be taken out of the loop.");
1458       if (dump_enabled_p ())
1459         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1460                          "not vectorized: redundant loop. no profit to "
1461                          "vectorize.");
1462       return false;
1463     }
1464
1465   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1466     dump_printf_loc (MSG_NOTE, vect_location,
1467                      "vectorization_factor = %d, niters = "
1468                      HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1469                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1470   /* Reject if the known or the maximal iteration count is below the VF.  */
1471   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1472        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1473       || ((max_niter = max_stmt_executions_int (loop)) != -1
1474           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1475     {
1476       if (dump_enabled_p ())
1477         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1478                          "not vectorized: iteration count too small.");
1479       if (dump_enabled_p ())
1480         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1481                          "not vectorized: iteration count smaller than "
1482                          "vectorization factor.");
1483       return false;
1484     }
1485
1486   /* Analyze cost.  Decide if worth while to vectorize.  */
1487
1488   /* Once VF is set, SLP costs should be updated since the number of created
1489      vector stmts depends on VF.  */
1490   vect_update_slp_costs_according_to_vf (loop_vinfo);
1491
1492   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1493                                       &min_profitable_estimate);
1494   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1495
1496   if (min_profitable_iters < 0)
1497     {
1498       if (dump_enabled_p ())
1499         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1500                          "not vectorized: vectorization not profitable.");
1501       if (dump_enabled_p ())
1502         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503                          "not vectorized: vector version will never be "
1504                          "profitable.");
1505       return false;
1506     }
1507
1508   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1509                             * vectorization_factor) - 1);
1510
1511
1512   /* Use the cost model only if it is more conservative than user specified
1513      threshold.  */
1514
1515   th = (unsigned) min_scalar_loop_bound;
1516   if (min_profitable_iters
1517       && (!min_scalar_loop_bound
1518           || min_profitable_iters > min_scalar_loop_bound))
1519     th = (unsigned) min_profitable_iters;
1520   /* A known iteration count must exceed TH for vectorization to go ahead.  */
1521   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1522       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1523     {
1524       if (dump_enabled_p ())
1525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1526                          "not vectorized: vectorization not profitable.");
1527       if (dump_enabled_p ())
1528         dump_printf_loc (MSG_NOTE, vect_location,
1529                          "not vectorized: iteration count smaller than user "
1530                          "specified loop bound parameter or minimum profitable "
1531                          "iterations (whichever is more conservative).");
1532       return false;
1533     }
1534   /* Same test on the estimated count, also against MIN_PROFITABLE_ESTIMATE.  */
1535   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1536       && ((unsigned HOST_WIDE_INT) estimated_niter
1537           <= MAX (th, (unsigned)min_profitable_estimate)))
1538     {
1539       if (dump_enabled_p ())
1540         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1541                          "not vectorized: estimated iteration count too "
1542                          "small.");
1543       if (dump_enabled_p ())
1544         dump_printf_loc (MSG_NOTE, vect_location,
1545                          "not vectorized: estimated iteration count smaller "
1546                          "than specified loop bound parameter or minimum "
1547                          "profitable iterations (whichever is more "
1548                          "conservative).");
1549       return false;
1550     }
1551   /* Epilog needed: unknown count, count not a multiple of VF, or peeling.  */
1552   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1553       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1554       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1555     {
1556       if (dump_enabled_p ())
1557         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1558       if (!vect_can_advance_ivs_p (loop_vinfo))
1559         {
1560           if (dump_enabled_p ())
1561             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1562                              "not vectorized: can't create epilog loop 1.");
1563           return false;
1564         }
1565       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1566         {
1567           if (dump_enabled_p ())
1568             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1569                              "not vectorized: can't create epilog loop 2.");
1570           return false;
1571         }
1572     }
1573
1574   return true;
1575 }
1576
1577
1578 /* Function vect_analyze_loop_2.
1579
1580    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1581    for it.  The different analyses will record information in the
1582    loop_vec_info struct.  */
1583 static bool
1584 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1585 {
1586   bool ok, slp = false;
1587   int max_vf = MAX_VECTORIZATION_FACTOR;
1588   int min_vf = 2;
1589
1590   /* Find all data references in the loop (which correspond to vdefs/vuses)
1591      and analyze their evolution in the loop.  Also adjust the minimal
1592      vectorization factor according to the loads and stores.
1593
1594      FORNOW: Handle only simple, array references, which
1595      alignment can be forced, and aligned pointer-references.  */
1596
1597   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1598   if (!ok)
1599     {
1600       if (dump_enabled_p ())
1601         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1602                          "bad data references.");
1603       return false;
1604     }
1605
1606   /* Classify all cross-iteration scalar data-flow cycles.
1607      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1608
1609   vect_analyze_scalar_cycles (loop_vinfo);
1610
1611   vect_pattern_recog (loop_vinfo, NULL);
1612
1613   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1614
1615   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1616   if (!ok)
1617     {
1618       if (dump_enabled_p ())
1619         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1620                          "unexpected pattern.");
1621       return false;
1622     }
1623
1624   /* Analyze data dependences between the data-refs in the loop
1625      and adjust the maximum vectorization factor according to
1626      the dependences.
1627      FORNOW: fail at the first data dependence that we encounter.  */
1628
1629   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1630   if (!ok
1631       || max_vf < min_vf)
1632     {
1633       if (dump_enabled_p ())
1634             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1635                              "bad data dependence.");
1636       return false;
1637     }
1638
1639   ok = vect_determine_vectorization_factor (loop_vinfo);
1640   if (!ok)
1641     {
1642       if (dump_enabled_p ())
1643         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644                          "can't determine vectorization factor.");
1645       return false;
1646     }
1647   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1648     {
1649       if (dump_enabled_p ())
1650         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1651                          "bad data dependence.");
1652       return false;
1653     }
1654
1655   /* Analyze the alignment of the data-refs in the loop.
1656      Fail if a data reference is found that cannot be vectorized.  */
1657
1658   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1659   if (!ok)
1660     {
1661       if (dump_enabled_p ())
1662         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663                          "bad data alignment.");
1664       return false;
1665     }
1666
1667   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1668      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1669
1670   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1671   if (!ok)
1672     {
1673       if (dump_enabled_p ())
1674         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675                          "bad data access.");
1676       return false;
1677     }
1678
1679   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1680      It is important to call pruning after vect_analyze_data_ref_accesses,
1681      since we use grouping information gathered by interleaving analysis.  */
1682   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1683   if (!ok)
1684     {
1685       if (dump_enabled_p ())
1686         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687                          "too long list of versioning for alias "
1688                          "run-time tests.");
1689       return false;
1690     }
1691
1692   /* This pass will decide on using loop versioning and/or loop peeling in
1693      order to enhance the alignment of data references in the loop.  */
1694
1695   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1696   if (!ok)
1697     {
1698       if (dump_enabled_p ())
1699         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700                          "bad data alignment.");
1701       return false;
1702     }
1703
1704   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1705   ok = vect_analyze_slp (loop_vinfo, NULL);
1706   if (ok)
1707     {
1708       /* Decide which possible SLP instances to SLP.  */
1709       slp = vect_make_slp_decision (loop_vinfo);
1710
1711       /* Find stmts that need to be both vectorized and SLPed.  */
1712       vect_detect_hybrid_slp (loop_vinfo);
1713     }
1714   else
1715     return false;
1716
1717   /* Scan all the operations in the loop and make sure they are
1718      vectorizable.  */
1719
1720   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1721   if (!ok)
1722     {
1723       if (dump_enabled_p ())
1724         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725                          "bad operation or unsupported loop bound.");
1726       return false;
1727     }
1728
1729   return true;
1730 }
1731
1732 /* Function vect_analyze_loop.
1733
1734    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1735    for it.  The different analyses will record information in the
1736    loop_vec_info struct.  */
1737 loop_vec_info
1738 vect_analyze_loop (struct loop *loop)
1739 {
1740   loop_vec_info loop_vinfo;
1741   unsigned int vector_sizes;
1742
1743   /* Autodetect first vector size we try.  */
1744   current_vector_size = 0;
1745   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1746
1747   if (dump_enabled_p ())
1748     dump_printf_loc (MSG_NOTE, vect_location,
1749                      "===== analyze_loop_nest =====");
1750
1751   if (loop_outer (loop)
1752       && loop_vec_info_for_loop (loop_outer (loop))
1753       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1754     {
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_NOTE, vect_location,
1757                          "outer-loop already vectorized.");
1758       return NULL;
1759     }
1760
1761   while (1)
1762     {
1763       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1764       loop_vinfo = vect_analyze_loop_form (loop);
1765       if (!loop_vinfo)
1766         {
1767           if (dump_enabled_p ())
1768             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1769                              "bad loop form.");
1770           return NULL;
1771         }
1772
1773       if (vect_analyze_loop_2 (loop_vinfo))
1774         {
1775           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1776
1777           return loop_vinfo;
1778         }
1779
1780       destroy_loop_vec_info (loop_vinfo, true);
1781
1782       vector_sizes &= ~current_vector_size;
1783       if (vector_sizes == 0
1784           || current_vector_size == 0)
1785         return NULL;
1786
1787       /* Try the next biggest vector size.  */
1788       current_vector_size = 1 << floor_log2 (vector_sizes);
1789       if (dump_enabled_p ())
1790         dump_printf_loc (MSG_NOTE, vect_location,
1791                          "***** Re-trying analysis with "
1792                          "vector size %d\n", current_vector_size);
1793     }
1794 }
1795
1796
1797 /* Function reduction_code_for_scalar_code
1798
1799    Input:
1800    CODE - tree_code of a reduction operations.
1801
1802    Output:
1803    REDUC_CODE - the corresponding tree-code to be used to reduce the
1804       vector of partial results into a single scalar result (which
1805       will also reside in a vector) or ERROR_MARK if the operation is
1806       a supported reduction operation, but does not have such tree-code.
1807
1808    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1809
1810 static bool
1811 reduction_code_for_scalar_code (enum tree_code code,
1812                                 enum tree_code *reduc_code)
1813 {
1814   switch (code)
1815     {
1816       case MAX_EXPR:
1817         *reduc_code = REDUC_MAX_EXPR;
1818         return true;
1819
1820       case MIN_EXPR:
1821         *reduc_code = REDUC_MIN_EXPR;
1822         return true;
1823
1824       case PLUS_EXPR:
1825         *reduc_code = REDUC_PLUS_EXPR;
1826         return true;
1827
1828       case MULT_EXPR:
1829       case MINUS_EXPR:
1830       case BIT_IOR_EXPR:
1831       case BIT_XOR_EXPR:
1832       case BIT_AND_EXPR:
1833         *reduc_code = ERROR_MARK;
1834         return true;
1835
1836       default:
1837        return false;
1838     }
1839 }
1840
1841
1842 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1843    STMT is printed with a message MSG. */
1844
1845 static void
1846 report_vect_op (int msg_type, gimple stmt, const char *msg)
1847 {
1848   dump_printf_loc (msg_type, vect_location, "%s", msg);
1849   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1850 }
1851
1852
1853 /* Detect SLP reduction of the form:
1854
1855    #a1 = phi <a5, a0>
1856    a2 = operation (a1)
1857    a3 = operation (a2)
1858    a4 = operation (a3)
1859    a5 = operation (a4)
1860
1861    #a = phi <a5>
1862
1863    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1864    FIRST_STMT is the first reduction stmt in the chain
1865    (a2 = operation (a1)).
1866
1867    Return TRUE if a reduction chain was detected.  */
1868
1869 static bool
1870 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1871 {
1872   struct loop *loop = (gimple_bb (phi))->loop_father;
1873   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1874   enum tree_code code;
1875   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1876   stmt_vec_info use_stmt_info, current_stmt_info;
1877   tree lhs;
1878   imm_use_iterator imm_iter;
1879   use_operand_p use_p;
1880   int nloop_uses, size = 0, n_out_of_loop_uses;
1881   bool found = false;
1882
1883   if (loop != vect_loop)
1884     return false;
1885
1886   lhs = PHI_RESULT (phi);
1887   code = gimple_assign_rhs_code (first_stmt);
1888   while (1)
1889     {
1890       nloop_uses = 0;
1891       n_out_of_loop_uses = 0;
1892       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1893         {
1894           gimple use_stmt = USE_STMT (use_p);
1895           if (is_gimple_debug (use_stmt))
1896             continue;
1897
1898           use_stmt = USE_STMT (use_p);
1899
1900           /* Check if we got back to the reduction phi.  */
1901           if (use_stmt == phi)
1902             {
1903               loop_use_stmt = use_stmt;
1904               found = true;
1905               break;
1906             }
1907
1908           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1909             {
1910               if (vinfo_for_stmt (use_stmt)
1911                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1912                 {
1913                   loop_use_stmt = use_stmt;
1914                   nloop_uses++;
1915                 }
1916             }
1917            else
1918              n_out_of_loop_uses++;
1919
1920            /* There are can be either a single use in the loop or two uses in
1921               phi nodes.  */
1922            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1923              return false;
1924         }
1925
1926       if (found)
1927         break;
1928
1929       /* We reached a statement with no loop uses.  */
1930       if (nloop_uses == 0)
1931         return false;
1932
1933       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1934       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1935         return false;
1936
1937       if (!is_gimple_assign (loop_use_stmt)
1938           || code != gimple_assign_rhs_code (loop_use_stmt)
1939           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1940         return false;
1941
1942       /* Insert USE_STMT into reduction chain.  */
1943       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1944       if (current_stmt)
1945         {
1946           current_stmt_info = vinfo_for_stmt (current_stmt);
1947           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1948           GROUP_FIRST_ELEMENT (use_stmt_info)
1949             = GROUP_FIRST_ELEMENT (current_stmt_info);
1950         }
1951       else
1952         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1953
1954       lhs = gimple_assign_lhs (loop_use_stmt);
1955       current_stmt = loop_use_stmt;
1956       size++;
1957    }
1958
1959   if (!found || loop_use_stmt != phi || size < 2)
1960     return false;
1961
1962   /* Swap the operands, if needed, to make the reduction operand be the second
1963      operand.  */
1964   lhs = PHI_RESULT (phi);
1965   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1966   while (next_stmt)
1967     {
1968       if (gimple_assign_rhs2 (next_stmt) == lhs)
1969         {
1970           tree op = gimple_assign_rhs1 (next_stmt);
1971           gimple def_stmt = NULL;
1972
1973           if (TREE_CODE (op) == SSA_NAME)
1974             def_stmt = SSA_NAME_DEF_STMT (op);
1975
1976           /* Check that the other def is either defined in the loop
1977              ("vect_internal_def"), or it's an induction (defined by a
1978              loop-header phi-node).  */
1979           if (def_stmt
1980               && gimple_bb (def_stmt)
1981               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1982               && (is_gimple_assign (def_stmt)
1983                   || is_gimple_call (def_stmt)
1984                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1985                            == vect_induction_def
1986                   || (gimple_code (def_stmt) == GIMPLE_PHI
1987                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1988                                   == vect_internal_def
1989                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1990             {
1991               lhs = gimple_assign_lhs (next_stmt);
1992               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1993               continue;
1994             }
1995
1996           return false;
1997         }
1998       else
1999         {
2000           tree op = gimple_assign_rhs2 (next_stmt);
2001           gimple def_stmt = NULL;
2002
2003           if (TREE_CODE (op) == SSA_NAME)
2004             def_stmt = SSA_NAME_DEF_STMT (op);
2005
2006           /* Check that the other def is either defined in the loop
2007             ("vect_internal_def"), or it's an induction (defined by a
2008             loop-header phi-node).  */
2009           if (def_stmt
2010               && gimple_bb (def_stmt)
2011               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2012               && (is_gimple_assign (def_stmt)
2013                   || is_gimple_call (def_stmt)
2014                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2015                               == vect_induction_def
2016                   || (gimple_code (def_stmt) == GIMPLE_PHI
2017                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2018                                   == vect_internal_def
2019                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2020             {
2021               if (dump_enabled_p ())
2022                 {
2023                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2024                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2025                 }
2026
2027               swap_tree_operands (next_stmt,
2028                                   gimple_assign_rhs1_ptr (next_stmt),
2029                                   gimple_assign_rhs2_ptr (next_stmt));
2030               update_stmt (next_stmt);
2031
2032               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2033                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2034             }
2035           else
2036             return false;
2037         }
2038
2039       lhs = gimple_assign_lhs (next_stmt);
2040       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2041     }
2042
2043   /* Save the chain for further analysis in SLP detection.  */
2044   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2045   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2046   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2047
2048   return true;
2049 }
2050
2051
2052 /* Function vect_is_simple_reduction_1
2053
2054    (1) Detect a cross-iteration def-use cycle that represents a simple
2055    reduction computation.  We look for the following pattern:
2056
2057    loop_header:
2058      a1 = phi < a0, a2 >
2059      a3 = ...
2060      a2 = operation (a3, a1)
2061
2062    such that:
2063    1. operation is commutative and associative and it is safe to
2064       change the order of the computation (if CHECK_REDUCTION is true)
2065    2. no uses for a2 in the loop (a2 is used out of the loop)
2066    3. no uses of a1 in the loop besides the reduction operation
2067    4. no uses of a1 outside the loop.
2068
2069    Conditions 1,4 are tested here.
2070    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2071
2072    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2073    nested cycles, if CHECK_REDUCTION is false.
2074
2075    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2076    reductions:
2077
2078      a1 = phi < a0, a2 >
2079      inner loop (def of a3)
2080      a2 = phi < a3 >
2081
2082    If MODIFY is true it tries also to rework the code in-place to enable
2083    detection of more reduction patterns.  For the time being we rewrite
2084    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2085 */
2086
2087 static gimple
2088 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2089                             bool check_reduction, bool *double_reduc,
2090                             bool modify)
2091 {
2092   struct loop *loop = (gimple_bb (phi))->loop_father;
2093   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2094   edge latch_e = loop_latch_edge (loop);
2095   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2096   gimple def_stmt, def1 = NULL, def2 = NULL;
2097   enum tree_code orig_code, code;
2098   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2099   tree type;
2100   int nloop_uses;
2101   tree name;
2102   imm_use_iterator imm_iter;
2103   use_operand_p use_p;
2104   bool phi_def;
2105
2106   *double_reduc = false;
2107
2108   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2109      otherwise, we assume outer loop vectorization.  */
2110   gcc_assert ((check_reduction && loop == vect_loop)
2111               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2112
2113   name = PHI_RESULT (phi);
2114   nloop_uses = 0;
2115   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2116     {
2117       gimple use_stmt = USE_STMT (use_p);
2118       if (is_gimple_debug (use_stmt))
2119         continue;
2120
2121       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2122         {
2123           if (dump_enabled_p ())
2124             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125                              "intermediate value used outside loop.");
2126
2127           return NULL;
2128         }
2129
2130       if (vinfo_for_stmt (use_stmt)
2131           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2132         nloop_uses++;
2133       if (nloop_uses > 1)
2134         {
2135           if (dump_enabled_p ())
2136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137                              "reduction used in loop.");
2138           return NULL;
2139         }
2140     }
2141
2142   if (TREE_CODE (loop_arg) != SSA_NAME)
2143     {
2144       if (dump_enabled_p ())
2145         {
2146           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                            "reduction: not ssa_name: ");
2148           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2149         }
2150       return NULL;
2151     }
2152
2153   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2154   if (!def_stmt)
2155     {
2156       if (dump_enabled_p ())
2157         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2158                          "reduction: no def_stmt.");
2159       return NULL;
2160     }
2161
2162   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2163     {
2164       if (dump_enabled_p ())
2165         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2166       return NULL;
2167     }
2168
2169   if (is_gimple_assign (def_stmt))
2170     {
2171       name = gimple_assign_lhs (def_stmt);
2172       phi_def = false;
2173     }
2174   else
2175     {
2176       name = PHI_RESULT (def_stmt);
2177       phi_def = true;
2178     }
2179
2180   nloop_uses = 0;
2181   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2182     {
2183       gimple use_stmt = USE_STMT (use_p);
2184       if (is_gimple_debug (use_stmt))
2185         continue;
2186       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2187           && vinfo_for_stmt (use_stmt)
2188           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2189         nloop_uses++;
2190       if (nloop_uses > 1)
2191         {
2192           if (dump_enabled_p ())
2193             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2194                              "reduction used in loop.");
2195           return NULL;
2196         }
2197     }
2198
2199   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2200      defined in the inner loop.  */
2201   if (phi_def)
2202     {
2203       op1 = PHI_ARG_DEF (def_stmt, 0);
2204
2205       if (gimple_phi_num_args (def_stmt) != 1
2206           || TREE_CODE (op1) != SSA_NAME)
2207         {
2208           if (dump_enabled_p ())
2209             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2210                              "unsupported phi node definition.");
2211
2212           return NULL;
2213         }
2214
2215       def1 = SSA_NAME_DEF_STMT (op1);
2216       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2217           && loop->inner
2218           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2219           && is_gimple_assign (def1))
2220         {
2221           if (dump_enabled_p ())
2222             report_vect_op (MSG_NOTE, def_stmt,
2223                             "detected double reduction: ");
2224
2225           *double_reduc = true;
2226           return def_stmt;
2227         }
2228
2229       return NULL;
2230     }
2231
2232   code = orig_code = gimple_assign_rhs_code (def_stmt);
2233
2234   /* We can handle "res -= x[i]", which is non-associative by
2235      simply rewriting this into "res += -x[i]".  Avoid changing
2236      gimple instruction for the first simple tests and only do this
2237      if we're allowed to change code at all.  */
2238   if (code == MINUS_EXPR
2239       && modify
2240       && (op1 = gimple_assign_rhs1 (def_stmt))
2241       && TREE_CODE (op1) == SSA_NAME
2242       && SSA_NAME_DEF_STMT (op1) == phi)
2243     code = PLUS_EXPR;
2244
2245   if (check_reduction
2246       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2247     {
2248       if (dump_enabled_p ())
2249         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2250                         "reduction: not commutative/associative: ");
2251       return NULL;
2252     }
2253
2254   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2255     {
2256       if (code != COND_EXPR)
2257         {
2258           if (dump_enabled_p ())
2259             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2260                             "reduction: not binary operation: ");
2261
2262           return NULL;
2263         }
2264
2265       op3 = gimple_assign_rhs1 (def_stmt);
2266       if (COMPARISON_CLASS_P (op3))
2267         {
2268           op4 = TREE_OPERAND (op3, 1);
2269           op3 = TREE_OPERAND (op3, 0);
2270         }
2271
2272       op1 = gimple_assign_rhs2 (def_stmt);
2273       op2 = gimple_assign_rhs3 (def_stmt);
2274
2275       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2276         {
2277           if (dump_enabled_p ())
2278             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2279                             "reduction: uses not ssa_names: ");
2280
2281           return NULL;
2282         }
2283     }
2284   else
2285     {
2286       op1 = gimple_assign_rhs1 (def_stmt);
2287       op2 = gimple_assign_rhs2 (def_stmt);
2288
2289       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2290         {
2291           if (dump_enabled_p ())
2292             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2293                             "reduction: uses not ssa_names: ");
2294
2295           return NULL;
2296         }
2297    }
2298
2299   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2300   if ((TREE_CODE (op1) == SSA_NAME
2301        && !types_compatible_p (type,TREE_TYPE (op1)))
2302       || (TREE_CODE (op2) == SSA_NAME
2303           && !types_compatible_p (type, TREE_TYPE (op2)))
2304       || (op3 && TREE_CODE (op3) == SSA_NAME
2305           && !types_compatible_p (type, TREE_TYPE (op3)))
2306       || (op4 && TREE_CODE (op4) == SSA_NAME
2307           && !types_compatible_p (type, TREE_TYPE (op4))))
2308     {
2309       if (dump_enabled_p ())
2310         {
2311           dump_printf_loc (MSG_NOTE, vect_location,
2312                            "reduction: multiple types: operation type: ");
2313           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2314           dump_printf (MSG_NOTE, ", operands types: ");
2315           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2316                              TREE_TYPE (op1));
2317           dump_printf (MSG_NOTE, ",");
2318           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2319                              TREE_TYPE (op2));
2320           if (op3)
2321             {
2322               dump_printf (MSG_NOTE, ",");
2323               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2324                                  TREE_TYPE (op3));
2325             }
2326
2327           if (op4)
2328             {
2329               dump_printf (MSG_NOTE, ",");
2330               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2331                                  TREE_TYPE (op4));
2332             }
2333         }
2334
2335       return NULL;
2336     }
2337
2338   /* Check that it's ok to change the order of the computation.
2339      Generally, when vectorizing a reduction we change the order of the
2340      computation.  This may change the behavior of the program in some
2341      cases, so we need to check that this is ok.  One exception is when
2342      vectorizing an outer-loop: the inner-loop is executed sequentially,
2343      and therefore vectorizing reductions in the inner-loop during
2344      outer-loop vectorization is safe.  */
2345
2346   /* CHECKME: check for !flag_finite_math_only too?  */
2347   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2348       && check_reduction)
2349     {
2350       /* Changing the order of operations changes the semantics.  */
2351       if (dump_enabled_p ())
2352         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2353                         "reduction: unsafe fp math optimization: ");
2354       return NULL;
2355     }
2356   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2357            && check_reduction)
2358     {
2359       /* Changing the order of operations changes the semantics.  */
2360       if (dump_enabled_p ())
2361         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2362                         "reduction: unsafe int math optimization: ");
2363       return NULL;
2364     }
2365   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2366     {
2367       /* Changing the order of operations changes the semantics.  */
2368       if (dump_enabled_p ())
2369         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370                         "reduction: unsafe fixed-point math optimization: ");
2371       return NULL;
2372     }
2373
2374   /* If we detected "res -= x[i]" earlier, rewrite it into
2375      "res += -x[i]" now.  If this turns out to be useless reassoc
2376      will clean it up again.  */
2377   if (orig_code == MINUS_EXPR)
2378     {
2379       tree rhs = gimple_assign_rhs2 (def_stmt);
2380       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2381       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2382                                                          rhs, NULL);
2383       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2384       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2385                                                           loop_info, NULL));
2386       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2387       gimple_assign_set_rhs2 (def_stmt, negrhs);
2388       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2389       update_stmt (def_stmt);
2390     }
2391
2392   /* Reduction is safe. We're dealing with one of the following:
2393      1) integer arithmetic and no trapv
2394      2) floating point arithmetic, and special flags permit this optimization
2395      3) nested cycle (i.e., outer loop vectorization).  */
2396   if (TREE_CODE (op1) == SSA_NAME)
2397     def1 = SSA_NAME_DEF_STMT (op1);
2398
2399   if (TREE_CODE (op2) == SSA_NAME)
2400     def2 = SSA_NAME_DEF_STMT (op2);
2401
2402   if (code != COND_EXPR
2403       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2404     {
2405       if (dump_enabled_p ())
2406         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2407       return NULL;
2408     }
2409
2410   /* Check that one def is the reduction def, defined by PHI,
2411      the other def is either defined in the loop ("vect_internal_def"),
2412      or it's an induction (defined by a loop-header phi-node).  */
2413
2414   if (def2 && def2 == phi
2415       && (code == COND_EXPR
2416           || !def1 || gimple_nop_p (def1)
2417           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2418               && (is_gimple_assign (def1)
2419                   || is_gimple_call (def1)
2420                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2421                       == vect_induction_def
2422                   || (gimple_code (def1) == GIMPLE_PHI
2423                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2424                           == vect_internal_def
2425                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2426     {
2427       if (dump_enabled_p ())
2428         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2429       return def_stmt;
2430     }
2431
2432   if (def1 && def1 == phi
2433       && (code == COND_EXPR
2434           || !def2 || gimple_nop_p (def2)
2435           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2436               && (is_gimple_assign (def2)
2437                   || is_gimple_call (def2)
2438                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2439                       == vect_induction_def
2440                   || (gimple_code (def2) == GIMPLE_PHI
2441                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2442                           == vect_internal_def
2443                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2444     {
2445       if (check_reduction)
2446         {
2447           /* Swap operands (just for simplicity - so that the rest of the code
2448              can assume that the reduction variable is always the last (second)
2449              argument).  */
2450           if (dump_enabled_p ())
2451             report_vect_op (MSG_NOTE, def_stmt,
2452                             "detected reduction: need to swap operands: ");
2453
2454           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2455                               gimple_assign_rhs2_ptr (def_stmt));
2456
2457           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2458             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2459         }
2460       else
2461         {
2462           if (dump_enabled_p ())
2463             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2464         }
2465
2466       return def_stmt;
2467     }
2468
2469   /* Try to find SLP reduction chain.  */
2470   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2471     {
2472       if (dump_enabled_p ())
2473         report_vect_op (MSG_NOTE, def_stmt,
2474                         "reduction: detected reduction chain: ");
2475
2476       return def_stmt;
2477     }
2478
2479   if (dump_enabled_p ())
2480     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2481                     "reduction: unknown pattern: ");
2482
2483   return NULL;
2484 }
2485
2486 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2487    in-place.  Arguments as there.  */
2488
2489 static gimple
2490 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2491                           bool check_reduction, bool *double_reduc)
2492 {
2493   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2494                                      double_reduc, false);
2495 }
2496
2497 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2498    in-place if it enables detection of more reductions.  Arguments
2499    as there.  */
2500
2501 gimple
2502 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2503                           bool check_reduction, bool *double_reduc)
2504 {
2505   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2506                                      double_reduc, true);
2507 }
2508
2509 /* Calculate the cost of one scalar iteration of the loop.  */
2510 int
2511 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2512 {
2513   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2514   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2515   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2516   int innerloop_iters, i, stmt_cost;
2517
2518   /* Count statements in scalar loop.  Using this as scalar cost for a single
2519      iteration for now.
2520
2521      TODO: Add outer loop support.
2522
2523      TODO: Consider assigning different costs to different scalar
2524      statements.  */
2525
2526   /* FORNOW.  */
2527   innerloop_iters = 1;
2528   if (loop->inner)
2529     innerloop_iters = 50; /* FIXME */
2530
2531   for (i = 0; i < nbbs; i++)
2532     {
2533       gimple_stmt_iterator si;
2534       basic_block bb = bbs[i];
2535
2536       if (bb->loop_father == loop->inner)
2537         factor = innerloop_iters;
2538       else
2539         factor = 1;
2540
2541       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2542         {
2543           gimple stmt = gsi_stmt (si);
2544           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2545
2546           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2547             continue;
2548
2549           /* Skip stmts that are not vectorized inside the loop.  */
2550           if (stmt_info
2551               && !STMT_VINFO_RELEVANT_P (stmt_info)
2552               && (!STMT_VINFO_LIVE_P (stmt_info)
2553                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2554               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2555             continue;
2556
2557           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2558             {
2559               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2560                stmt_cost = vect_get_stmt_cost (scalar_load);
2561              else
2562                stmt_cost = vect_get_stmt_cost (scalar_store);
2563             }
2564           else
2565             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2566
2567           scalar_single_iter_cost += stmt_cost * factor;
2568         }
2569     }
2570   return scalar_single_iter_cost;
2571 }
2572
2573 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2574 int
2575 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2576                              int *peel_iters_epilogue,
2577                              int scalar_single_iter_cost,
2578                              stmt_vector_for_cost *prologue_cost_vec,
2579                              stmt_vector_for_cost *epilogue_cost_vec)
2580 {
2581   int retval = 0;
2582   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2583
2584   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2585     {
2586       *peel_iters_epilogue = vf/2;
2587       if (dump_enabled_p ())
2588         dump_printf_loc (MSG_NOTE, vect_location,
2589                          "cost model: epilogue peel iters set to vf/2 "
2590                          "because loop iterations are unknown .");
2591
2592       /* If peeled iterations are known but number of scalar loop
2593          iterations are unknown, count a taken branch per peeled loop.  */
2594       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2595                                  NULL, 0, vect_prologue);
2596     }
2597   else
2598     {
2599       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2600       peel_iters_prologue = niters < peel_iters_prologue ?
2601                             niters : peel_iters_prologue;
2602       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2603       /* If we need to peel for gaps, but no peeling is required, we have to
2604          peel VF iterations.  */
2605       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2606         *peel_iters_epilogue = vf;
2607     }
2608
2609   if (peel_iters_prologue)
2610     retval += record_stmt_cost (prologue_cost_vec,
2611                                 peel_iters_prologue * scalar_single_iter_cost,
2612                                 scalar_stmt, NULL, 0, vect_prologue);
2613   if (*peel_iters_epilogue)
2614     retval += record_stmt_cost (epilogue_cost_vec,
2615                                 *peel_iters_epilogue * scalar_single_iter_cost,
2616                                 scalar_stmt, NULL, 0, vect_epilogue);
2617   return retval;
2618 }
2619
2620 /* Function vect_estimate_min_profitable_iters
2621
2622    Return the number of iterations required for the vector version of the
2623    loop to be profitable relative to the cost of the scalar version of the
2624    loop.  */
2625
2626 static void
2627 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2628                                     int *ret_min_profitable_niters,
2629                                     int *ret_min_profitable_estimate)
2630 {
2631   int min_profitable_iters;
2632   int min_profitable_estimate;
2633   int peel_iters_prologue;
2634   int peel_iters_epilogue;
2635   unsigned vec_inside_cost = 0;
2636   int vec_outside_cost = 0;
2637   unsigned vec_prologue_cost = 0;
2638   unsigned vec_epilogue_cost = 0;
2639   int scalar_single_iter_cost = 0;
2640   int scalar_outside_cost = 0;
2641   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2642   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2643   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2644
2645   /* Cost model disabled.  */
2646   if (!flag_vect_cost_model)
2647     {
2648       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2649       *ret_min_profitable_niters = 0;
2650       *ret_min_profitable_estimate = 0;
2651       return;
2652     }
2653
2654   /* Requires loop versioning tests to handle misalignment.  */
2655   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2656     {
2657       /*  FIXME: Make cost depend on complexity of individual check.  */
2658       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2659       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2660                             vect_prologue);
2661       dump_printf (MSG_NOTE,
2662                    "cost model: Adding cost of checks for loop "
2663                    "versioning to treat misalignment.\n");
2664     }
2665
2666   /* Requires loop versioning with alias checks.  */
2667   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2668     {
2669       /*  FIXME: Make cost depend on complexity of individual check.  */
2670       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2671       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2672                             vect_prologue);
2673       dump_printf (MSG_NOTE,
2674                    "cost model: Adding cost of checks for loop "
2675                    "versioning aliasing.\n");
2676     }
2677
2678   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2679       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2680     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2681                           vect_prologue);
2682
2683   /* Count statements in scalar loop.  Using this as scalar cost for a single
2684      iteration for now.
2685
2686      TODO: Add outer loop support.
2687
2688      TODO: Consider assigning different costs to different scalar
2689      statements.  */
2690
2691   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2692
2693   /* Add additional cost for the peeled instructions in prologue and epilogue
2694      loop.
2695
2696      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2697      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2698
2699      TODO: Build an expression that represents peel_iters for prologue and
2700      epilogue to be used in a run-time test.  */
2701
2702   if (npeel  < 0)
2703     {
2704       peel_iters_prologue = vf/2;
2705       dump_printf (MSG_NOTE, "cost model: "
2706                    "prologue peel iters set to vf/2.");
2707
2708       /* If peeling for alignment is unknown, loop bound of main loop becomes
2709          unknown.  */
2710       peel_iters_epilogue = vf/2;
2711       dump_printf (MSG_NOTE, "cost model: "
2712                    "epilogue peel iters set to vf/2 because "
2713                    "peeling for alignment is unknown.");
2714
2715       /* If peeled iterations are unknown, count a taken branch and a not taken
2716          branch per peeled loop. Even if scalar loop iterations are known,
2717          vector iterations are not known since peeled prologue iterations are
2718          not known. Hence guards remain the same.  */
2719       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2720                             NULL, 0, vect_prologue);
2721       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2722                             NULL, 0, vect_prologue);
2723       /* FORNOW: Don't attempt to pass individual scalar instructions to
2724          the model; just assume linear cost for scalar iterations.  */
2725       (void) add_stmt_cost (target_cost_data,
2726                             peel_iters_prologue * scalar_single_iter_cost,
2727                             scalar_stmt, NULL, 0, vect_prologue);
2728       (void) add_stmt_cost (target_cost_data,
2729                             peel_iters_epilogue * scalar_single_iter_cost,
2730                             scalar_stmt, NULL, 0, vect_epilogue);
2731     }
2732   else
2733     {
2734       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2735       stmt_info_for_cost *si;
2736       int j;
2737       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2738
2739       prologue_cost_vec.create (2);
2740       epilogue_cost_vec.create (2);
2741       peel_iters_prologue = npeel;
2742
2743       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2744                                           &peel_iters_epilogue,
2745                                           scalar_single_iter_cost,
2746                                           &prologue_cost_vec,
2747                                           &epilogue_cost_vec);
2748
2749       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2750         {
2751           struct _stmt_vec_info *stmt_info
2752             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2753           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2754                                 si->misalign, vect_prologue);
2755         }
2756
2757       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2758         {
2759           struct _stmt_vec_info *stmt_info
2760             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2761           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2762                                 si->misalign, vect_epilogue);
2763         }
2764
2765       prologue_cost_vec.release ();
2766       epilogue_cost_vec.release ();
2767     }
2768
2769   /* FORNOW: The scalar outside cost is incremented in one of the
2770      following ways:
2771
2772      1. The vectorizer checks for alignment and aliasing and generates
2773      a condition that allows dynamic vectorization.  A cost model
2774      check is ANDED with the versioning condition.  Hence scalar code
2775      path now has the added cost of the versioning check.
2776
2777        if (cost > th & versioning_check)
2778          jmp to vector code
2779
2780      Hence run-time scalar is incremented by not-taken branch cost.
2781
2782      2. The vectorizer then checks if a prologue is required.  If the
2783      cost model check was not done before during versioning, it has to
2784      be done before the prologue check.
2785
2786        if (cost <= th)
2787          prologue = scalar_iters
2788        if (prologue == 0)
2789          jmp to vector code
2790        else
2791          execute prologue
2792        if (prologue == num_iters)
2793          go to exit
2794
2795      Hence the run-time scalar cost is incremented by a taken branch,
2796      plus a not-taken branch, plus a taken branch cost.
2797
2798      3. The vectorizer then checks if an epilogue is required.  If the
2799      cost model check was not done before during prologue check, it
2800      has to be done with the epilogue check.
2801
2802        if (prologue == 0)
2803          jmp to vector code
2804        else
2805          execute prologue
2806        if (prologue == num_iters)
2807          go to exit
2808        vector code:
2809          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2810            jmp to epilogue
2811
2812      Hence the run-time scalar cost should be incremented by 2 taken
2813      branches.
2814
2815      TODO: The back end may reorder the BBS's differently and reverse
2816      conditions/branch directions.  Change the estimates below to
2817      something more reasonable.  */
2818
2819   /* If the number of iterations is known and we do not do versioning, we can
2820      decide whether to vectorize at compile time.  Hence the scalar version
2821      do not carry cost model guard costs.  */
2822   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2823       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2824       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2825     {
2826       /* Cost model check occurs at versioning.  */
2827       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2828           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2829         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2830       else
2831         {
2832           /* Cost model check occurs at prologue generation.  */
2833           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2834             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2835               + vect_get_stmt_cost (cond_branch_not_taken);
2836           /* Cost model check occurs at epilogue generation.  */
2837           else
2838             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2839         }
2840     }
2841
2842   /* Complete the target-specific cost calculations.  */
2843   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2844                &vec_inside_cost, &vec_epilogue_cost);
2845
2846   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2847
2848   /* Calculate number of iterations required to make the vector version
2849      profitable, relative to the loop bodies only.  The following condition
2850      must hold true:
2851      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2852      where
2853      SIC = scalar iteration cost, VIC = vector iteration cost,
2854      VOC = vector outside cost, VF = vectorization factor,
2855      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2856      SOC = scalar outside cost for run time cost model check.  */
2857
2858   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2859     {
2860       if (vec_outside_cost <= 0)
2861         min_profitable_iters = 1;
2862       else
2863         {
2864           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2865                                   - vec_inside_cost * peel_iters_prologue
2866                                   - vec_inside_cost * peel_iters_epilogue)
2867                                  / ((scalar_single_iter_cost * vf)
2868                                     - vec_inside_cost);
2869
2870           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2871               <= (((int) vec_inside_cost * min_profitable_iters)
2872                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2873             min_profitable_iters++;
2874         }
2875     }
2876   /* vector version will never be profitable.  */
2877   else
2878     {
2879       if (dump_enabled_p ())
2880         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881                          "cost model: the vector iteration cost = %d "
2882                          "divided by the scalar iteration cost = %d "
2883                          "is greater or equal to the vectorization factor = %d.",
2884                          vec_inside_cost, scalar_single_iter_cost, vf);
2885       *ret_min_profitable_niters = -1;
2886       *ret_min_profitable_estimate = -1;
2887       return;
2888     }
2889
2890   if (dump_enabled_p ())
2891     {
2892       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2893       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2894                    vec_inside_cost);
2895       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2896                    vec_prologue_cost);
2897       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2898                    vec_epilogue_cost);
2899       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2900                    scalar_single_iter_cost);
2901       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2902                    scalar_outside_cost);
2903       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2904                    vec_outside_cost);
2905       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2906                    peel_iters_prologue);
2907       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2908                    peel_iters_epilogue);
2909       dump_printf (MSG_NOTE,
2910                    "  Calculated minimum iters for profitability: %d\n",
2911                    min_profitable_iters);
2912     }
2913
2914   min_profitable_iters =
2915         min_profitable_iters < vf ? vf : min_profitable_iters;
2916
2917   /* Because the condition we create is:
2918      if (niters <= min_profitable_iters)
2919        then skip the vectorized loop.  */
2920   min_profitable_iters--;
2921
2922   if (dump_enabled_p ())
2923     dump_printf_loc (MSG_NOTE, vect_location,
2924                      "  Runtime profitability threshold = %d\n", min_profitable_iters);
2925
2926   *ret_min_profitable_niters = min_profitable_iters;
2927
2928   /* Calculate number of iterations required to make the vector version
2929      profitable, relative to the loop bodies only.
2930
2931      Non-vectorized variant is SIC * niters and it must win over vector
2932      variant on the expected loop trip count.  The following condition must hold true:
2933      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2934
2935   if (vec_outside_cost <= 0)
2936     min_profitable_estimate = 1;
2937   else
2938     {
2939       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2940                                  - vec_inside_cost * peel_iters_prologue
2941                                  - vec_inside_cost * peel_iters_epilogue)
2942                                  / ((scalar_single_iter_cost * vf)
2943                                    - vec_inside_cost);
2944     }
2945   min_profitable_estimate --;
2946   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2947   if (dump_enabled_p ())
2948     dump_printf_loc (MSG_NOTE, vect_location,
2949                      "  Static estimate profitability threshold = %d\n",
2950                       min_profitable_iters);
2951
2952   *ret_min_profitable_estimate = min_profitable_estimate;
2953 }
2954
2955
2956 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2957    functions. Design better to avoid maintenance issues.  */
2958
2959 /* Function vect_model_reduction_cost.
2960
2961    Models cost for a reduction operation, including the vector ops
2962    generated within the strip-mine loop, the initial definition before
2963    the loop, and the epilogue code that must be generated.  */
2964
2965 static bool
2966 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2967                            int ncopies)
2968 {
2969   int prologue_cost = 0, epilogue_cost = 0;
2970   enum tree_code code;
2971   optab optab;
2972   tree vectype;
2973   gimple stmt, orig_stmt;
2974   tree reduction_op;
2975   enum machine_mode mode;
2976   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2977   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2978   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2979
2980   /* Cost of reduction op inside loop.  */
2981   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2982                                         stmt_info, 0, vect_body);
2983   stmt = STMT_VINFO_STMT (stmt_info);
2984
2985   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2986     {
2987     case GIMPLE_SINGLE_RHS:
2988       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2989       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2990       break;
2991     case GIMPLE_UNARY_RHS:
2992       reduction_op = gimple_assign_rhs1 (stmt);
2993       break;
2994     case GIMPLE_BINARY_RHS:
2995       reduction_op = gimple_assign_rhs2 (stmt);
2996       break;
2997     case GIMPLE_TERNARY_RHS:
2998       reduction_op = gimple_assign_rhs3 (stmt);
2999       break;
3000     default:
3001       gcc_unreachable ();
3002     }
3003
3004   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3005   if (!vectype)
3006     {
3007       if (dump_enabled_p ())
3008         {
3009           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010                            "unsupported data-type ");
3011           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3012                              TREE_TYPE (reduction_op));
3013         }
3014       return false;
3015    }
3016
3017   mode = TYPE_MODE (vectype);
3018   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3019
3020   if (!orig_stmt)
3021     orig_stmt = STMT_VINFO_STMT (stmt_info);
3022
3023   code = gimple_assign_rhs_code (orig_stmt);
3024
3025   /* Add in cost for initial definition.  */
3026   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3027                                   stmt_info, 0, vect_prologue);
3028
3029   /* Determine cost of epilogue code.
3030
3031      We have a reduction operator that will reduce the vector in one statement.
3032      Also requires scalar extract.  */
3033
3034   if (!nested_in_vect_loop_p (loop, orig_stmt))
3035     {
3036       if (reduc_code != ERROR_MARK)
3037         {
3038           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3039                                           stmt_info, 0, vect_epilogue);
3040           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3041                                           stmt_info, 0, vect_epilogue);
3042         }
3043       else
3044         {
3045           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3046           tree bitsize =
3047             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3048           int element_bitsize = tree_low_cst (bitsize, 1);
3049           int nelements = vec_size_in_bits / element_bitsize;
3050
3051           optab = optab_for_tree_code (code, vectype, optab_default);
3052
3053           /* We have a whole vector shift available.  */
3054           if (VECTOR_MODE_P (mode)
3055               && optab_handler (optab, mode) != CODE_FOR_nothing
3056               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3057             {
3058               /* Final reduction via vector shifts and the reduction operator.
3059                  Also requires scalar extract.  */
3060               epilogue_cost += add_stmt_cost (target_cost_data,
3061                                               exact_log2 (nelements) * 2,
3062                                               vector_stmt, stmt_info, 0,
3063                                               vect_epilogue);
3064               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3065                                               vec_to_scalar, stmt_info, 0,
3066                                               vect_epilogue);
3067             }
3068           else
3069             /* Use extracts and reduction op for final reduction.  For N
3070                elements, we have N extracts and N-1 reduction ops.  */
3071             epilogue_cost += add_stmt_cost (target_cost_data,
3072                                             nelements + nelements - 1,
3073                                             vector_stmt, stmt_info, 0,
3074                                             vect_epilogue);
3075         }
3076     }
3077
3078   if (dump_enabled_p ())
3079     dump_printf (MSG_NOTE,
3080                  "vect_model_reduction_cost: inside_cost = %d, "
3081                  "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3082                  prologue_cost, epilogue_cost);
3083
3084   return true;
3085 }
3086
3087
3088 /* Function vect_model_induction_cost.
3089
3090    Models cost for induction operations.  */
3091
3092 static void
3093 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3094 {
3095   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3096   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3097   unsigned inside_cost, prologue_cost;
3098
3099   /* loop cost for vec_loop.  */
3100   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3101                                stmt_info, 0, vect_body);
3102
3103   /* prologue cost for vec_init and vec_step.  */
3104   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3105                                  stmt_info, 0, vect_prologue);
3106
3107   if (dump_enabled_p ())
3108     dump_printf_loc (MSG_NOTE, vect_location,
3109                      "vect_model_induction_cost: inside_cost = %d, "
3110                      "prologue_cost = %d .", inside_cost, prologue_cost);
3111 }
3112
3113
3114 /* Function get_initial_def_for_induction
3115
3116    Input:
3117    STMT - a stmt that performs an induction operation in the loop.
3118    IV_PHI - the initial value of the induction variable
3119
3120    Output:
3121    Return a vector variable, initialized with the first VF values of
3122    the induction variable.  E.g., for an iv with IV_PHI='X' and
3123    evolution S, for a vector of 4 units, we want to return:
3124    [X, X + S, X + 2*S, X + 3*S].  */
3125
3126 static tree
3127 get_initial_def_for_induction (gimple iv_phi)
3128 {
3129   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3130   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3131   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3132   tree scalar_type;
3133   tree vectype;
3134   int nunits;
3135   edge pe = loop_preheader_edge (loop);
3136   struct loop *iv_loop;
3137   basic_block new_bb;
3138   tree new_vec, vec_init, vec_step, t;
3139   tree access_fn;
3140   tree new_var;
3141   tree new_name;
3142   gimple init_stmt, induction_phi, new_stmt;
3143   tree induc_def, vec_def, vec_dest;
3144   tree init_expr, step_expr;
3145   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3146   int i;
3147   bool ok;
3148   int ncopies;
3149   tree expr;
3150   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3151   bool nested_in_vect_loop = false;
3152   gimple_seq stmts = NULL;
3153   imm_use_iterator imm_iter;
3154   use_operand_p use_p;
3155   gimple exit_phi;
3156   edge latch_e;
3157   tree loop_arg;
3158   gimple_stmt_iterator si;
3159   basic_block bb = gimple_bb (iv_phi);
3160   tree stepvectype;
3161   tree resvectype;
3162
3163   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3164   if (nested_in_vect_loop_p (loop, iv_phi))
3165     {
3166       nested_in_vect_loop = true;
3167       iv_loop = loop->inner;
3168     }
3169   else
3170     iv_loop = loop;
3171   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3172
3173   latch_e = loop_latch_edge (iv_loop);
3174   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3175
3176   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3177   gcc_assert (access_fn);
3178   STRIP_NOPS (access_fn);
3179   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3180                                     &init_expr, &step_expr);
3181   gcc_assert (ok);
3182   pe = loop_preheader_edge (iv_loop);
3183
3184   scalar_type = TREE_TYPE (init_expr);
3185   vectype = get_vectype_for_scalar_type (scalar_type);
3186   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3187   gcc_assert (vectype);
3188   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3189   ncopies = vf / nunits;
3190
3191   gcc_assert (phi_info);
3192   gcc_assert (ncopies >= 1);
3193
3194   /* Find the first insertion point in the BB.  */
3195   si = gsi_after_labels (bb);
3196
3197   /* Create the vector that holds the initial_value of the induction.  */
3198   if (nested_in_vect_loop)
3199     {
3200       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3201          been created during vectorization of previous stmts.  We obtain it
3202          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3203       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3204                                            loop_preheader_edge (iv_loop));
3205       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3206     }
3207   else
3208     {
3209       vec<constructor_elt, va_gc> *v;
3210
3211       /* iv_loop is the loop to be vectorized. Create:
3212          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3213       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3214       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3215       if (stmts)
3216         {
3217           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3218           gcc_assert (!new_bb);
3219         }
3220
3221       vec_alloc (v, nunits);
3222       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3223       for (i = 1; i < nunits; i++)
3224         {
3225           /* Create: new_name_i = new_name + step_expr  */
3226           enum tree_code code = POINTER_TYPE_P (scalar_type)
3227                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3228           init_stmt = gimple_build_assign_with_ops (code, new_var,
3229                                                     new_name, step_expr);
3230           new_name = make_ssa_name (new_var, init_stmt);
3231           gimple_assign_set_lhs (init_stmt, new_name);
3232
3233           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3234           gcc_assert (!new_bb);
3235
3236           if (dump_enabled_p ())
3237             {
3238               dump_printf_loc (MSG_NOTE, vect_location,
3239                                "created new init_stmt: ");
3240               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3241             }
3242           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3243         }
3244       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3245       new_vec = build_constructor (vectype, v);
3246       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3247     }
3248
3249
3250   /* Create the vector that holds the step of the induction.  */
3251   if (nested_in_vect_loop)
3252     /* iv_loop is nested in the loop to be vectorized. Generate:
3253        vec_step = [S, S, S, S]  */
3254     new_name = step_expr;
3255   else
3256     {
3257       /* iv_loop is the loop to be vectorized. Generate:
3258           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3259       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3260       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3261                               expr, step_expr);
3262     }
3263
3264   t = unshare_expr (new_name);
3265   gcc_assert (CONSTANT_CLASS_P (new_name));
3266   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3267   gcc_assert (stepvectype);
3268   new_vec = build_vector_from_val (stepvectype, t);
3269   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3270
3271
3272   /* Create the following def-use cycle:
3273      loop prolog:
3274          vec_init = ...
3275          vec_step = ...
3276      loop:
3277          vec_iv = PHI <vec_init, vec_loop>
3278          ...
3279          STMT
3280          ...
3281          vec_loop = vec_iv + vec_step;  */
3282
3283   /* Create the induction-phi that defines the induction-operand.  */
3284   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3285   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3286   set_vinfo_for_stmt (induction_phi,
3287                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3288   induc_def = PHI_RESULT (induction_phi);
3289
3290   /* Create the iv update inside the loop  */
3291   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3292                                            induc_def, vec_step);
3293   vec_def = make_ssa_name (vec_dest, new_stmt);
3294   gimple_assign_set_lhs (new_stmt, vec_def);
3295   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3296   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3297                                                    NULL));
3298
3299   /* Set the arguments of the phi node:  */
3300   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3301   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3302                UNKNOWN_LOCATION);
3303
3304
3305   /* In case that vectorization factor (VF) is bigger than the number
3306      of elements that we can fit in a vectype (nunits), we have to generate
3307      more than one vector stmt - i.e - we need to "unroll" the
3308      vector stmt by a factor VF/nunits.  For more details see documentation
3309      in vectorizable_operation.  */
3310
3311   if (ncopies > 1)
3312     {
3313       stmt_vec_info prev_stmt_vinfo;
3314       /* FORNOW. This restriction should be relaxed.  */
3315       gcc_assert (!nested_in_vect_loop);
3316
3317       /* Create the vector that holds the step of the induction.  */
3318       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3319       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3320                               expr, step_expr);
3321       t = unshare_expr (new_name);
3322       gcc_assert (CONSTANT_CLASS_P (new_name));
3323       new_vec = build_vector_from_val (stepvectype, t);
3324       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3325
3326       vec_def = induc_def;
3327       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3328       for (i = 1; i < ncopies; i++)
3329         {
3330           /* vec_i = vec_prev + vec_step  */
3331           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3332                                                    vec_def, vec_step);
3333           vec_def = make_ssa_name (vec_dest, new_stmt);
3334           gimple_assign_set_lhs (new_stmt, vec_def);
3335
3336           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3337           if (!useless_type_conversion_p (resvectype, vectype))
3338             {
3339               new_stmt = gimple_build_assign_with_ops
3340                   (VIEW_CONVERT_EXPR,
3341                    vect_get_new_vect_var (resvectype, vect_simple_var,
3342                                           "vec_iv_"),
3343                    build1 (VIEW_CONVERT_EXPR, resvectype,
3344                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3345               gimple_assign_set_lhs (new_stmt,
3346                                      make_ssa_name
3347                                        (gimple_assign_lhs (new_stmt), new_stmt));
3348               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3349             }
3350           set_vinfo_for_stmt (new_stmt,
3351                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3352           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3353           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3354         }
3355     }
3356
3357   if (nested_in_vect_loop)
3358     {
3359       /* Find the loop-closed exit-phi of the induction, and record
3360          the final vector of induction results:  */
3361       exit_phi = NULL;
3362       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3363         {
3364           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3365             {
3366               exit_phi = USE_STMT (use_p);
3367               break;
3368             }
3369         }
3370       if (exit_phi)
3371         {
3372           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3373           /* FORNOW. Currently not supporting the case that an inner-loop induction
3374              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3375           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3376                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3377
3378           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3379           if (dump_enabled_p ())
3380             {
3381               dump_printf_loc (MSG_NOTE, vect_location,
3382                                "vector of inductions after inner-loop:");
3383               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3384             }
3385         }
3386     }
3387
3388
3389   if (dump_enabled_p ())
3390     {
3391       dump_printf_loc (MSG_NOTE, vect_location,
3392                        "transform induction: created def-use cycle: ");
3393       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3394       dump_printf (MSG_NOTE, "\n");
3395       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3396                         SSA_NAME_DEF_STMT (vec_def), 0);
3397     }
3398
3399   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3400   if (!useless_type_conversion_p (resvectype, vectype))
3401     {
3402       new_stmt = gimple_build_assign_with_ops
3403          (VIEW_CONVERT_EXPR,
3404           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3405           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3406       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3407       gimple_assign_set_lhs (new_stmt, induc_def);
3408       si = gsi_after_labels (bb);
3409       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3410       set_vinfo_for_stmt (new_stmt,
3411                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3412       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3413         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3414     }
3415
3416   return induc_def;
3417 }
3418
3419
3420 /* Function get_initial_def_for_reduction
3421
3422    Input:
3423    STMT - a stmt that performs a reduction operation in the loop.
3424    INIT_VAL - the initial value of the reduction variable.
3425
3426    Output:
3427    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3428         of the reduction (used for adjusting the epilog).  May be NULL.
3429    Return a vector variable, initialized according to the operation that STMT
3430         performs. This vector will be used as the initial value of the
3431         vector of partial results.
3432
3433    Option1 (adjust in epilog): Initialize the vector as follows:
3434      add/bit or/xor:    [0,0,...,0,0]
3435      mult/bit and:      [1,1,...,1,1]  (all-ones, i.e. -1, for bit and)
3436      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3437    and when necessary (e.g. add/mult case) let the caller know
3438    that it needs to adjust the result by init_val.
3439
3440    Option2: Initialize the vector as follows:
3441      add/bit or/xor:    [init_val,0,0,...,0]
3442      mult/bit and:      [init_val,1,1,...,1]  (-1 instead of 1 for bit and)
3443      min/max/cond_expr: [init_val,init_val,...,init_val]
3444    and no adjustments are needed.
3445
3446    For example, for the following code:
3447
3448    s = init_val;
3449    for (i=0;i<n;i++)
3450      s = s + a[i];
3451
3452    STMT is 's = s + a[i]', and the reduction variable is 's'.
3453    For a vector of 4 units, we want to return either [init_val,0,0,0],
3454    or [0,0,0,0] and let the caller know that it needs to adjust
3455    the result at the end by 'init_val'.
3456
3457    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3458    initialization vector is simpler (same element in all entries), if
3459    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3460
3461    A cost model should help decide between these two schemes.  */
3462
3463 tree
3464 get_initial_def_for_reduction (gimple stmt, tree init_val,
3465                                tree *adjustment_def)
3466 {
3467   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3468   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3469   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3470   tree scalar_type = TREE_TYPE (init_val);
3471   tree vectype = get_vectype_for_scalar_type (scalar_type);
3472   int nunits;
3473   enum tree_code code = gimple_assign_rhs_code (stmt);
3474   tree def_for_init;
3475   tree init_def;
3476   tree *elts;
3477   int i;
3478   bool nested_in_vect_loop = false;
3479   tree init_value;
3480   REAL_VALUE_TYPE real_init_val = dconst0;
3481   int int_init_val = 0;
3482   gimple def_stmt = NULL;
3483
3484   gcc_assert (vectype);
3485   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3486
3487   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3488               || SCALAR_FLOAT_TYPE_P (scalar_type));
3489
3490   if (nested_in_vect_loop_p (loop, stmt))
3491     nested_in_vect_loop = true;
3492   else
3493     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3494
3495   /* In case of double reduction we only create a vector variable to be put
3496      in the reduction phi node.  The actual statement creation is done in
3497      vect_create_epilog_for_reduction.  */
3498   if (adjustment_def && nested_in_vect_loop
3499       && TREE_CODE (init_val) == SSA_NAME
3500       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3501       && gimple_code (def_stmt) == GIMPLE_PHI
3502       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3503       && vinfo_for_stmt (def_stmt)
3504       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3505           == vect_double_reduction_def)
3506     {
3507       *adjustment_def = NULL;
3508       return vect_create_destination_var (init_val, vectype);
3509     }
3510
3511   /* SCALAR_TYPE is INIT_VAL's own type, so only real REAL_CST/INTEGER_CST
3512      nodes are rebuilt; other invariants (e.g. an ADDR_EXPR) are used as is
3513      because TREE_REAL_CST/TREE_INT_CST_LOW are not valid on them.  */
3514   if (TREE_CODE (init_val) == REAL_CST)
3515     init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3516   else if (TREE_CODE (init_val) == INTEGER_CST)
3517     init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3518   else
3519     init_value = init_val;
3520
3521   switch (code)
3522     {
3523       case WIDEN_SUM_EXPR:
3524       case DOT_PROD_EXPR:
3525       case PLUS_EXPR:
3526       case MINUS_EXPR:
3527       case BIT_IOR_EXPR:
3528       case BIT_XOR_EXPR:
3529       case MULT_EXPR:
3530       case BIT_AND_EXPR:
3531         /* ADJUSTMENT_DEF is NULL when called from
3532            vect_create_epilog_for_reduction to vectorize double reduction.  */
3533         if (adjustment_def)
3534           {
3535             if (nested_in_vect_loop)
3536               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3537                                                               NULL);
3538             else
3539               *adjustment_def = init_val;
3540           }
3541
3542         if (code == MULT_EXPR)
3543           {
3544             real_init_val = dconst1;
3545             int_init_val = 1;
3546           }
3547
3548         if (code == BIT_AND_EXPR)
3549           int_init_val = -1;
3550
3551         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3552           def_for_init = build_real (scalar_type, real_init_val);
3553         else
3554           def_for_init = build_int_cst (scalar_type, int_init_val);
3555
3556         /* Fill every element but the first with the 0, 1 or -1 chosen above.  */
3557         elts = XALLOCAVEC (tree, nunits);
3558         for (i = 1; i < nunits; ++i)
3559           elts[i] = def_for_init;
3560
3561         /* Option1: the first element is that same value as well.  */
3562         if (adjustment_def)
3563           {
3564             elts[0] = def_for_init;
3565             init_def = build_vector (vectype, elts);
3566             break;
3567           }
3568
3569         /* Option2: the first element is INIT_VAL.  */
3570         elts[0] = init_val;
3571         if (TREE_CONSTANT (init_val))
3572           init_def = build_vector (vectype, elts);
3573         else
3574           {
3575             vec<constructor_elt, va_gc> *v;
3576             vec_alloc (v, nunits);
3577             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3578             for (i = 1; i < nunits; ++i)
3579               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3580             init_def = build_constructor (vectype, v);
3581           }
3582
3583         break;
3584
3585       case MIN_EXPR:
3586       case MAX_EXPR:
3587       case COND_EXPR:
3588         if (adjustment_def)
3589           {
3590             *adjustment_def = NULL_TREE;
3591             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3592             break;
3593           }
3594
3595         init_def = build_vector_from_val (vectype, init_value);
3596         break;
3597
3598       default:
3599         gcc_unreachable ();
3600     }
3601
3602   return init_def;
3603 }
3604
3605
3606 /* Function vect_create_epilog_for_reduction
3607
3608    Create code at the loop-epilog to finalize the result of a reduction
3609    computation.
3610
3611    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
3612      reduction statements.
3613    STMT is the scalar reduction stmt that is being vectorized.
3614    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3615      number of elements that we can fit in a vectype (nunits).  In this case
3616      we have to generate more than one vector stmt - i.e - we need to "unroll"
3617      the vector stmt by a factor VF/nunits.  For more details see documentation
3618      in vectorizable_operation.
3619    REDUC_CODE is the tree-code for the epilog reduction.
3620    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3621      computation.
3622    REDUC_INDEX is the index of the operand in the right hand side of the
3623      statement that is defined by REDUCTION_PHI.
3624    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3625    SLP_NODE is an SLP node containing a group of reduction statements. The
3626      first one in this group is STMT.
3627
3628    This function:
3629    1. Creates the reduction def-use cycles: sets the arguments for
3630       REDUCTION_PHIS:
3631       The loop-entry argument is the vectorized initial-value of the reduction.
3632       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3633       sums.
3634    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3635       by applying the operation specified by REDUC_CODE if available, or by
3636       other means (whole-vector shifts or a scalar loop).
3637       The function also creates a new phi node at the loop exit to preserve
3638       loop-closed form, as illustrated below.
3639
3640      The flow at the entry to this function:
3641
3642         loop:
3643           vec_def = phi <null, null>            # REDUCTION_PHI
3644           VECT_DEF = vector_stmt                # vectorized form of STMT
3645           s_loop = scalar_stmt                  # (scalar) STMT
3646         loop_exit:
3647           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3648           use <s_out0>
3649           use <s_out0>
3650
3651      The above is transformed by this function into:
3652
3653         loop:
3654           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3655           VECT_DEF = vector_stmt                # vectorized form of STMT
3656           s_loop = scalar_stmt                  # (scalar) STMT
3657         loop_exit:
3658           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3659           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3660           v_out2 = reduce <v_out1>
3661           s_out3 = extract_field <v_out2, 0>
3662           s_out4 = adjust_result <s_out3>
3663           use <s_out4>
3664           use <s_out4>
3665 */
3666
3667 static void
3668 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3669                                   int ncopies, enum tree_code reduc_code,
3670                                   vec<gimple> reduction_phis,
3671                                   int reduc_index, bool double_reduc,
3672                                   slp_tree slp_node)
3673 {
3674   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3675   stmt_vec_info prev_phi_info;
3676   tree vectype;
3677   enum machine_mode mode;
3678   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3679   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3680   basic_block exit_bb;
3681   tree scalar_dest;
3682   tree scalar_type;
3683   gimple new_phi = NULL, phi;
3684   gimple_stmt_iterator exit_gsi;
3685   tree vec_dest;
3686   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3687   gimple epilog_stmt = NULL;
3688   enum tree_code code = gimple_assign_rhs_code (stmt);
3689   gimple exit_phi;
3690   tree bitsize, bitpos;
3691   tree adjustment_def = NULL;
3692   tree vec_initial_def = NULL;
3693   tree reduction_op, expr, def;
3694   tree orig_name, scalar_result;
3695   imm_use_iterator imm_iter, phi_imm_iter;
3696   use_operand_p use_p, phi_use_p;
3697   bool extract_scalar_result = false;
3698   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3699   bool nested_in_vect_loop = false;
3700   vec<gimple> new_phis = vNULL;
3701   vec<gimple> inner_phis = vNULL;
3702   enum vect_def_type dt = vect_unknown_def_type;
3703   int j, i;
3704   vec<tree> scalar_results = vNULL;
3705   unsigned int group_size = 1, k, ratio;
3706   vec<tree> vec_initial_defs = vNULL;
3707   vec<gimple> phis;
3708   bool slp_reduc = false;
3709   tree new_phi_result;
3710   gimple inner_phi = NULL;
3711
3712   if (slp_node)
3713     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3714
3715   if (nested_in_vect_loop_p (loop, stmt))
3716     {
3717       outer_loop = loop;
3718       loop = loop->inner;
3719       nested_in_vect_loop = true;
3720       gcc_assert (!slp_node);
3721     }
3722
3723   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3724     {
3725     case GIMPLE_SINGLE_RHS:
3726       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3727                   == ternary_op);
3728       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3729       break;
3730     case GIMPLE_UNARY_RHS:
3731       reduction_op = gimple_assign_rhs1 (stmt);
3732       break;
3733     case GIMPLE_BINARY_RHS:
3734       reduction_op = reduc_index ?
3735                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3736       break;
3737     case GIMPLE_TERNARY_RHS:
3738       reduction_op = gimple_op (stmt, reduc_index + 1);
3739       break;
3740     default:
3741       gcc_unreachable ();
3742     }
3743
3744   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3745   gcc_assert (vectype);
3746   mode = TYPE_MODE (vectype);
3747
3748   /* 1. Create the reduction def-use cycle:
3749      Set the arguments of REDUCTION_PHIS, i.e., transform
3750
3751         loop:
3752           vec_def = phi <null, null>            # REDUCTION_PHI
3753           VECT_DEF = vector_stmt                # vectorized form of STMT
3754           ...
3755
3756      into:
3757
3758         loop:
3759           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3760           VECT_DEF = vector_stmt                # vectorized form of STMT
3761           ...
3762
3763      (in case of SLP, do it for all the phis). */
3764
3765   /* Get the loop-entry arguments.  */
3766   if (slp_node)
3767     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3768                        NULL, slp_node, reduc_index);
3769   else
3770     {
3771       vec_initial_defs.create (1);
3772      /* For the case of reduction, vect_get_vec_def_for_operand returns
3773         the scalar def before the loop, that defines the initial value
3774         of the reduction variable.  */
3775       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3776                                                       &adjustment_def);
3777       vec_initial_defs.quick_push (vec_initial_def);
3778     }
3779
3780   /* Set phi nodes arguments.  */
3781   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3782     {
3783       tree vec_init_def = vec_initial_defs[i];
3784       tree def = vect_defs[i];
3785       for (j = 0; j < ncopies; j++)
3786         {
3787           /* Set the loop-entry arg of the reduction-phi.  */
3788           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3789                        UNKNOWN_LOCATION);
3790
3791           /* Set the loop-latch arg for the reduction-phi.  */
3792           if (j > 0)
3793             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3794
3795           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3796
3797           if (dump_enabled_p ())
3798             {
3799               dump_printf_loc (MSG_NOTE, vect_location,
3800                                "transform reduction: created def-use cycle: ");
3801               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3802               dump_printf (MSG_NOTE, "\n");
3803               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3804             }
3805
3806           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3807         }
3808     }
3809
3810   vec_initial_defs.release ();
3811
3812   /* 2. Create epilog code.
3813         The reduction epilog code operates across the elements of the vector
3814         of partial results computed by the vectorized loop.
3815         The reduction epilog code consists of:
3816
3817         step 1: compute the scalar result in a vector (v_out2)
3818         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3819         step 3: adjust the scalar result (s_out3) if needed.
3820
3821         Step 1 can be accomplished using one of the following three schemes:
3822           (scheme 1) using reduc_code, if available.
3823           (scheme 2) using whole-vector shifts, if available.
3824           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3825                      combined.
3826
3827           The overall epilog code looks like this:
3828
3829           s_out0 = phi <s_loop>         # original EXIT_PHI
3830           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3831           v_out2 = reduce <v_out1>              # step 1
3832           s_out3 = extract_field <v_out2, 0>    # step 2
3833           s_out4 = adjust_result <s_out3>       # step 3
3834
3835           (step 3 is optional, and steps 1 and 2 may be combined).
3836           Lastly, the uses of s_out0 are replaced by s_out4.  */
3837
3838
3839   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3840          v_out1 = phi <VECT_DEF>
3841          Store them in NEW_PHIS.  */
3842
3843   exit_bb = single_exit (loop)->dest;
3844   prev_phi_info = NULL;
3845   new_phis.create (vect_defs.length ());
3846   FOR_EACH_VEC_ELT (vect_defs, i, def)
3847     {
3848       for (j = 0; j < ncopies; j++)
3849         {
3850           tree new_def = copy_ssa_name (def, NULL);
3851           phi = create_phi_node (new_def, exit_bb);
3852           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3853           if (j == 0)
3854             new_phis.quick_push (phi);
3855           else
3856             {
3857               def = vect_get_vec_def_for_stmt_copy (dt, def);
3858               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3859             }
3860
3861           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3862           prev_phi_info = vinfo_for_stmt (phi);
3863         }
3864     }
3865
3866   /* The epilogue is created for the outer-loop, i.e., for the loop being
3867      vectorized.  Create exit phis for the outer loop.  */
3868   if (double_reduc)
3869     {
3870       loop = outer_loop;
3871       exit_bb = single_exit (loop)->dest;
3872       inner_phis.create (vect_defs.length ());
3873       FOR_EACH_VEC_ELT (new_phis, i, phi)
3874         {
3875           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3876           gimple outer_phi = create_phi_node (new_result, exit_bb);
3877           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3878                            PHI_RESULT (phi));
3879           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3880                                                             loop_vinfo, NULL));
3881           inner_phis.quick_push (phi);
3882           new_phis[i] = outer_phi;
3883           prev_phi_info = vinfo_for_stmt (outer_phi);
3884           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3885             {
3886               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3887               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3888               outer_phi = create_phi_node (new_result, exit_bb);
3889               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3890                                PHI_RESULT (phi));
3891               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3892                                                         loop_vinfo, NULL));
3893               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3894               prev_phi_info = vinfo_for_stmt (outer_phi);
3895             }
3896         }
3897     }
3898
3899   exit_gsi = gsi_after_labels (exit_bb);
3900
3901   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3902          (i.e. when reduc_code is not available) and in the final adjustment
3903          code (if needed).  Also get the original scalar reduction variable as
3904          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3905          represents a reduction pattern), the tree-code and scalar-def are
3906          taken from the original stmt that the pattern-stmt (STMT) replaces.
3907          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3908          are taken from STMT.  */
3909
3910   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3911   if (!orig_stmt)
3912     {
3913       /* Regular reduction  */
3914       orig_stmt = stmt;
3915     }
3916   else
3917     {
3918       /* Reduction pattern  */
3919       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3920       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3921       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3922     }
3923
3924   code = gimple_assign_rhs_code (orig_stmt);
3925   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3926      partial results are added and not subtracted.  */
3927   if (code == MINUS_EXPR)
3928     code = PLUS_EXPR;
3929
3930   scalar_dest = gimple_assign_lhs (orig_stmt);
3931   scalar_type = TREE_TYPE (scalar_dest);
3932   scalar_results.create (group_size);
3933   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3934   bitsize = TYPE_SIZE (scalar_type);
3935
3936   /* In case this is a reduction in an inner-loop while vectorizing an outer
3937      loop - we don't need to extract a single scalar result at the end of the
3938      inner-loop (unless it is double reduction, i.e., the use of reduction is
3939      outside the outer-loop).  The final vector of partial results will be used
3940      in the vectorized outer-loop, or reduced to a scalar result at the end of
3941      the outer-loop.  */
3942   if (nested_in_vect_loop && !double_reduc)
3943     goto vect_finalize_reduction;
3944
3945   /* SLP reduction without reduction chain, e.g.,
3946      # a1 = phi <a2, a0>
3947      # b1 = phi <b2, b0>
3948      a2 = operation (a1)
3949      b2 = operation (b1)  */
3950   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3951
3952   /* In case of reduction chain, e.g.,
3953      # a1 = phi <a3, a0>
3954      a2 = operation (a1)
3955      a3 = operation (a2),
3956
3957      we may end up with more than one vector result.  Here we reduce them to
3958      one vector.  */
3959   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3960     {
3961       tree first_vect = PHI_RESULT (new_phis[0]);
3962       tree tmp;
3963       gimple new_vec_stmt = NULL;
3964
3965       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3966       for (k = 1; k < new_phis.length (); k++)
3967         {
3968           gimple next_phi = new_phis[k];
3969           tree second_vect = PHI_RESULT (next_phi);
3970
3971           tmp = build2 (code, vectype,  first_vect, second_vect);
3972           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3973           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3974           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3975           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3976         }
3977
3978       new_phi_result = first_vect;
3979       if (new_vec_stmt)
3980         {
3981           new_phis.truncate (0);
3982           new_phis.safe_push (new_vec_stmt);
3983         }
3984     }
3985   else
3986     new_phi_result = PHI_RESULT (new_phis[0]);
3987
3988   /* 2.3 Create the reduction code, using one of the three schemes described
3989          above. In SLP we simply need to extract all the elements from the
3990          vector (without reducing them), so we use scalar shifts.  */
3991   if (reduc_code != ERROR_MARK && !slp_reduc)
3992     {
3993       tree tmp;
3994
3995       /*** Case 1:  Create:
3996            v_out2 = reduc_expr <v_out1>  */
3997
3998       if (dump_enabled_p ())
3999         dump_printf_loc (MSG_NOTE, vect_location,
4000                          "Reduce using direct vector reduction.");
4001
4002       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4003       tmp = build1 (reduc_code, vectype, new_phi_result);
4004       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4005       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4006       gimple_assign_set_lhs (epilog_stmt, new_temp);
4007       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4008
4009       extract_scalar_result = true;
4010     }
4011   else
4012     {
4013       enum tree_code shift_code = ERROR_MARK;
4014       bool have_whole_vector_shift = true;
4015       int bit_offset;
4016       int element_bitsize = tree_low_cst (bitsize, 1);
4017       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4018       tree vec_temp;
4019
4020       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4021         shift_code = VEC_RSHIFT_EXPR;
4022       else
4023         have_whole_vector_shift = false;
4024
4025       /* Regardless of whether we have a whole vector shift, if we're
4026          emulating the operation via tree-vect-generic, we don't want
4027          to use it.  Only the first round of the reduction is likely
4028          to still be profitable via emulation.  */
4029       /* ??? It might be better to emit a reduction tree code here, so that
4030          tree-vect-generic can expand the first round via bit tricks.  */
4031       if (!VECTOR_MODE_P (mode))
4032         have_whole_vector_shift = false;
4033       else
4034         {
4035           optab optab = optab_for_tree_code (code, vectype, optab_default);
4036           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4037             have_whole_vector_shift = false;
4038         }
4039
4040       if (have_whole_vector_shift && !slp_reduc)
4041         {
4042           /*** Case 2: Create:
4043              for (offset = VS/2; offset >= element_size; offset/=2)
4044                 {
4045                   Create:  va' = vec_shift <va, offset>
4046                   Create:  va = vop <va, va'>
4047                 }  */
4048
4049           if (dump_enabled_p ())
4050             dump_printf_loc (MSG_NOTE, vect_location,
4051                              "Reduce using vector shifts");
4052
4053           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4054           new_temp = new_phi_result;
4055           for (bit_offset = vec_size_in_bits/2;
4056                bit_offset >= element_bitsize;
4057                bit_offset /= 2)
4058             {
4059               tree bitpos = size_int (bit_offset);
4060
4061               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4062                                                vec_dest, new_temp, bitpos);
4063               new_name = make_ssa_name (vec_dest, epilog_stmt);
4064               gimple_assign_set_lhs (epilog_stmt, new_name);
4065               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4066
4067               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4068                                                           new_name, new_temp);
4069               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4070               gimple_assign_set_lhs (epilog_stmt, new_temp);
4071               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4072             }
4073
4074           extract_scalar_result = true;
4075         }
4076       else
4077         {
4078           tree rhs;
4079
4080           /*** Case 3: Create:
4081              s = extract_field <v_out2, 0>
4082              for (offset = element_size;
4083                   offset < vector_size;
4084                   offset += element_size;)
4085                {
4086                  Create:  s' = extract_field <v_out2, offset>
4087                  Create:  s = op <s, s'>  // For non SLP cases
4088                }  */
4089
4090           if (dump_enabled_p ())
4091             dump_printf_loc (MSG_NOTE, vect_location,
4092                              "Reduce using scalar code. ");
4093
4094           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4095           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4096             {
4097               if (gimple_code (new_phi) == GIMPLE_PHI)
4098                 vec_temp = PHI_RESULT (new_phi);
4099               else
4100                 vec_temp = gimple_assign_lhs (new_phi);
4101               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4102                             bitsize_zero_node);
4103               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4104               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4105               gimple_assign_set_lhs (epilog_stmt, new_temp);
4106               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4107
4108               /* In SLP we don't need to apply reduction operation, so we just
4109                  collect s' values in SCALAR_RESULTS.  */
4110               if (slp_reduc)
4111                 scalar_results.safe_push (new_temp);
4112
4113               for (bit_offset = element_bitsize;
4114                    bit_offset < vec_size_in_bits;
4115                    bit_offset += element_bitsize)
4116                 {
4117                   tree bitpos = bitsize_int (bit_offset);
4118                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4119                                      bitsize, bitpos);
4120
4121                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4122                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4123                   gimple_assign_set_lhs (epilog_stmt, new_name);
4124                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4125
4126                   if (slp_reduc)
4127                     {
4128                       /* In SLP we don't need to apply reduction operation, so
4129                          we just collect s' values in SCALAR_RESULTS.  */
4130                       new_temp = new_name;
4131                       scalar_results.safe_push (new_name);
4132                     }
4133                   else
4134                     {
4135                       epilog_stmt = gimple_build_assign_with_ops (code,
4136                                           new_scalar_dest, new_name, new_temp);
4137                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4138                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4139                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4140                     }
4141                 }
4142             }
4143
4144           /* The only case where we need to reduce scalar results in SLP, is
4145              unrolling.  If the size of SCALAR_RESULTS is greater than
4146              GROUP_SIZE, we reduce them combining elements modulo
4147              GROUP_SIZE.  */
4148           if (slp_reduc)
4149             {
4150               tree res, first_res, new_res;
4151               gimple new_stmt;
4152
4153               /* Reduce multiple scalar results in case of SLP unrolling.  */
4154               for (j = group_size; scalar_results.iterate (j, &res);
4155                    j++)
4156                 {
4157                   first_res = scalar_results[j % group_size];
4158                   new_stmt = gimple_build_assign_with_ops (code,
4159                                               new_scalar_dest, first_res, res);
4160                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4161                   gimple_assign_set_lhs (new_stmt, new_res);
4162                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4163                   scalar_results[j % group_size] = new_res;
4164                 }
4165             }
4166           else
4167             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4168             scalar_results.safe_push (new_temp);
4169
4170           extract_scalar_result = false;
4171         }
4172     }
4173
4174   /* 2.4  Extract the final scalar result.  Create:
4175           s_out3 = extract_field <v_out2, bitpos>  */
4176
4177   if (extract_scalar_result)
4178     {
4179       tree rhs;
4180
4181       if (dump_enabled_p ())
4182         dump_printf_loc (MSG_NOTE, vect_location,
4183                          "extract scalar result");
4184
4185       if (BYTES_BIG_ENDIAN)
4186         bitpos = size_binop (MULT_EXPR,
4187                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4188                              TYPE_SIZE (scalar_type));
4189       else
4190         bitpos = bitsize_zero_node;
4191
4192       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4193       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4194       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4195       gimple_assign_set_lhs (epilog_stmt, new_temp);
4196       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4197       scalar_results.safe_push (new_temp);
4198     }
4199
4200 vect_finalize_reduction:
4201
4202   if (double_reduc)
4203     loop = loop->inner;
4204
4205   /* 2.5 Adjust the final result by the initial value of the reduction
4206          variable. (When such adjustment is not needed, then
4207          'adjustment_def' is zero).  For example, if code is PLUS we create:
4208          new_temp = loop_exit_def + adjustment_def  */
4209
4210   if (adjustment_def)
4211     {
4212       gcc_assert (!slp_reduc);
4213       if (nested_in_vect_loop)
4214         {
4215           new_phi = new_phis[0];
4216           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4217           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4218           new_dest = vect_create_destination_var (scalar_dest, vectype);
4219         }
4220       else
4221         {
4222           new_temp = scalar_results[0];
4223           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4224           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4225           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4226         }
4227
4228       epilog_stmt = gimple_build_assign (new_dest, expr);
4229       new_temp = make_ssa_name (new_dest, epilog_stmt);
4230       gimple_assign_set_lhs (epilog_stmt, new_temp);
4231       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4232       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4233       if (nested_in_vect_loop)
4234         {
4235           set_vinfo_for_stmt (epilog_stmt,
4236                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4237                                                  NULL));
4238           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4239                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4240
4241           if (!double_reduc)
4242             scalar_results.quick_push (new_temp);
4243           else
4244             scalar_results[0] = new_temp;
4245         }
4246       else
4247         scalar_results[0] = new_temp;
4248
4249       new_phis[0] = epilog_stmt;
4250     }
4251
4252   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4253           phis with new adjusted scalar results, i.e., replace use <s_out0>
4254           with use <s_out4>.
4255
4256      Transform:
4257         loop_exit:
4258           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4259           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4260           v_out2 = reduce <v_out1>
4261           s_out3 = extract_field <v_out2, 0>
4262           s_out4 = adjust_result <s_out3>
4263           use <s_out0>
4264           use <s_out0>
4265
4266      into:
4267
4268         loop_exit:
4269           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4270           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4271           v_out2 = reduce <v_out1>
4272           s_out3 = extract_field <v_out2, 0>
4273           s_out4 = adjust_result <s_out3>
4274           use <s_out4>
4275           use <s_out4> */
4276
4277
4278   /* In SLP reduction chain we reduce vector results into one vector if
4279      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4280      the last stmt in the reduction chain, since we are looking for the loop
4281      exit phi node.  */
4282   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4283     {
4284       scalar_dest = gimple_assign_lhs (
4285                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4286       group_size = 1;
4287     }
4288
4289   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4290      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4291      need to match SCALAR_RESULTS with corresponding statements.  The first
4292      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4293      the first vector stmt, etc.
4294      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4295   if (group_size > new_phis.length ())
4296     {
4297       ratio = group_size / new_phis.length ();
4298       gcc_assert (!(group_size % new_phis.length ()));
4299     }
4300   else
4301     ratio = 1;
4302
4303   for (k = 0; k < group_size; k++)
4304     {
4305       if (k % ratio == 0)
4306         {
4307           epilog_stmt = new_phis[k / ratio];
4308           reduction_phi = reduction_phis[k / ratio];
4309           if (double_reduc)
4310             inner_phi = inner_phis[k / ratio];
4311         }
4312
4313       if (slp_reduc)
4314         {
4315           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4316
4317           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4318           /* SLP statements can't participate in patterns.  */
4319           gcc_assert (!orig_stmt);
4320           scalar_dest = gimple_assign_lhs (current_stmt);
4321         }
4322
4323       phis.create (3);
4324       /* Find the loop-closed-use at the loop exit of the original scalar
4325          result.  (The reduction result is expected to have two immediate uses -
4326          one at the latch block, and one at the loop exit).  */
4327       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4328         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4329           phis.safe_push (USE_STMT (use_p));
4330
4331       /* We expect to have found an exit_phi because of loop-closed-ssa
4332          form.  */
4333       gcc_assert (!phis.is_empty ());
4334
4335       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4336         {
4337           if (outer_loop)
4338             {
4339               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4340               gimple vect_phi;
4341
4342               /* FORNOW. Currently not supporting the case that an inner-loop
4343                  reduction is not used in the outer-loop (but only outside the
4344                  outer-loop), unless it is double reduction.  */
4345               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4346                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4347                           || double_reduc);
4348
4349               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4350               if (!double_reduc
4351                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4352                       != vect_double_reduction_def)
4353                 continue;
4354
4355               /* Handle double reduction:
4356
4357                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4358                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4359                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4360                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4361
4362                  At that point the regular reduction (stmt2 and stmt3) is
4363                  already vectorized, as well as the exit phi node, stmt4.
4364                  Here we vectorize the phi node of double reduction, stmt1, and
4365                  update all relevant statements.  */
4366
4367               /* Go through all the uses of s2 to find double reduction phi
4368                  node, i.e., stmt1 above.  */
4369               orig_name = PHI_RESULT (exit_phi);
4370               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4371                 {
4372                   stmt_vec_info use_stmt_vinfo;
4373                   stmt_vec_info new_phi_vinfo;
4374                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4375                   basic_block bb = gimple_bb (use_stmt);
4376                   gimple use;
4377
4378                   /* Check that USE_STMT is really double reduction phi
4379                      node.  */
4380                   if (gimple_code (use_stmt) != GIMPLE_PHI
4381                       || gimple_phi_num_args (use_stmt) != 2
4382                       || bb->loop_father != outer_loop)
4383                     continue;
4384                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4385                   if (!use_stmt_vinfo
4386                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4387                           != vect_double_reduction_def)
4388                     continue;
4389
4390                   /* Create vector phi node for double reduction:
4391                      vs1 = phi <vs0, vs2>
4392                      vs1 was created previously in this function by a call to
4393                        vect_get_vec_def_for_operand and is stored in
4394                        vec_initial_def;
4395                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4396                      vs0 is created here.  */
4397
4398                   /* Create vector phi node.  */
4399                   vect_phi = create_phi_node (vec_initial_def, bb);
4400                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4401                                     loop_vec_info_for_loop (outer_loop), NULL);
4402                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4403
4404                   /* Create vs0 - initial def of the double reduction phi.  */
4405                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4406                                              loop_preheader_edge (outer_loop));
4407                   init_def = get_initial_def_for_reduction (stmt,
4408                                                           preheader_arg, NULL);
4409                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4410                                                     vectype, NULL);
4411
4412                   /* Update phi node arguments with vs0 and vs2.  */
4413                   add_phi_arg (vect_phi, vect_phi_init,
4414                                loop_preheader_edge (outer_loop),
4415                                UNKNOWN_LOCATION);
4416                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4417                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4418                   if (dump_enabled_p ())
4419                     {
4420                       dump_printf_loc (MSG_NOTE, vect_location,
4421                                        "created double reduction phi node: ");
4422                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4423                     }
4424
4425                   vect_phi_res = PHI_RESULT (vect_phi);
4426
4427                   /* Replace the use, i.e., set the correct vs1 in the regular
4428                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4429                      loop is redundant.  */
4430                   use = reduction_phi;
4431                   for (j = 0; j < ncopies; j++)
4432                     {
4433                       edge pr_edge = loop_preheader_edge (loop);
4434                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4435                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4436                     }
4437                 }
4438             }
4439         }
4440
4441       phis.release ();
4442       if (nested_in_vect_loop)
4443         {
4444           if (double_reduc)
4445             loop = outer_loop;
4446           else
4447             continue;
4448         }
4449
4450       phis.create (3);
4451       /* Find the loop-closed-use at the loop exit of the original scalar
4452          result.  (The reduction result is expected to have two immediate uses,
4453          one at the latch block, and one at the loop exit).  For double
4454          reductions we are looking for exit phis of the outer loop.  */
4455       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4456         {
4457           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4458             phis.safe_push (USE_STMT (use_p));
4459           else
4460             {
4461               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4462                 {
4463                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4464
4465                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4466                     {
4467                       if (!flow_bb_inside_loop_p (loop,
4468                                              gimple_bb (USE_STMT (phi_use_p))))
4469                         phis.safe_push (USE_STMT (phi_use_p));
4470                     }
4471                 }
4472             }
4473         }
4474
4475       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4476         {
4477           /* Replace the uses:  */
4478           orig_name = PHI_RESULT (exit_phi);
4479           scalar_result = scalar_results[k];
4480           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4481             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4482               SET_USE (use_p, scalar_result);
4483         }
4484
4485       phis.release ();
4486     }
4487
4488   scalar_results.release ();
4489   new_phis.release ();
4490 }
4491
4492
4493 /* Function vectorizable_reduction.
4494
4495    Check if STMT performs a reduction operation that can be vectorized.
4496    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4497    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4498    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4499
4500    This function also handles reduction idioms (patterns) that have been
4501    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4502    of this form:
4503      X = pattern_expr (arg0, arg1, ..., X)
4504    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4505    sequence that had been detected and replaced by the pattern-stmt (STMT).
4506
4507    In some cases of reduction patterns, the type of the reduction variable X is
4508    different than the type of the other arguments of STMT.
4509    In such cases, the vectype that is used when transforming STMT into a vector
4510    stmt is different than the vectype that is used to determine the
4511    vectorization factor, because it consists of a different number of elements
4512    than the actual number of elements that are being operated upon in parallel.
4513
4514    For example, consider an accumulation of shorts into an int accumulator.
4515    On some targets it's possible to vectorize this pattern operating on 8
4516    shorts at a time (hence, the vectype for purposes of determining the
4517    vectorization factor should be V8HI); on the other hand, the vectype that
4518    is used to create the vector form is actually V4SI (the type of the result).
4519
4520    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4521    indicates what is the actual level of parallelism (V8HI in the example), so
4522    that the right vectorization factor would be derived.  This vectype
4523    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4524    be used to create the vectorized stmt.  The right vectype for the vectorized
4525    stmt is obtained from the type of the result X:
4526         get_vectype_for_scalar_type (TREE_TYPE (X))
4527
4528    This means that, contrary to "regular" reductions (or "regular" stmts in
4529    general), the following equation:
4530       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4531    does *NOT* necessarily hold for reduction patterns.  */
4532
4533 bool
4534 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4535                         gimple *vec_stmt, slp_tree slp_node)
4536 {
4537   tree vec_dest;
4538   tree scalar_dest;
4539   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4540   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4541   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4542   tree vectype_in = NULL_TREE;
4543   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4544   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4545   enum tree_code code, orig_code, epilog_reduc_code;
4546   enum machine_mode vec_mode;
4547   int op_type;
4548   optab optab, reduc_optab;
4549   tree new_temp = NULL_TREE;
4550   tree def;
4551   gimple def_stmt;
4552   enum vect_def_type dt;
4553   gimple new_phi = NULL;
4554   tree scalar_type;
4555   bool is_simple_use;
4556   gimple orig_stmt;
4557   stmt_vec_info orig_stmt_info;
4558   tree expr = NULL_TREE;
4559   int i;
4560   int ncopies;
4561   int epilog_copies;
4562   stmt_vec_info prev_stmt_info, prev_phi_info;
4563   bool single_defuse_cycle = false;
4564   tree reduc_def = NULL_TREE;
4565   gimple new_stmt = NULL;
4566   int j;
4567   tree ops[3];
4568   bool nested_cycle = false, found_nested_cycle_def = false;
4569   gimple reduc_def_stmt = NULL;
4570   /* The default is that the reduction variable is the last in statement.  */
4571   int reduc_index = 2;
4572   bool double_reduc = false, dummy;
4573   basic_block def_bb;
4574   struct loop * def_stmt_loop, *outer_loop = NULL;
4575   tree def_arg;
4576   gimple def_arg_stmt;
4577   vec<tree> vec_oprnds0 = vNULL;
4578   vec<tree> vec_oprnds1 = vNULL;
4579   vec<tree> vect_defs = vNULL;
4580   vec<gimple> phis = vNULL;
4581   int vec_num;
4582   tree def0, def1, tem, op0, op1 = NULL_TREE;
4583
4584   /* In case of reduction chain we switch to the first stmt in the chain, but
4585      we don't update STMT_INFO, since only the last stmt is marked as reduction
4586      and has reduction properties.  */
4587   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4588     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4589
4590   if (nested_in_vect_loop_p (loop, stmt))
4591     {
4592       outer_loop = loop;
4593       loop = loop->inner;
4594       nested_cycle = true;
4595     }
4596
4597   /* 1. Is vectorizable reduction?  */
4598   /* Not supportable if the reduction variable is used in the loop, unless
4599      it's a reduction chain.  */
4600   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4601       && !GROUP_FIRST_ELEMENT (stmt_info))
4602     return false;
4603
4604   /* Reductions that are not used even in an enclosing outer-loop,
4605      are expected to be "live" (used out of the loop).  */
4606   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4607       && !STMT_VINFO_LIVE_P (stmt_info))
4608     return false;
4609
4610   /* Make sure it was already recognized as a reduction computation.  */
4611   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4612       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4613     return false;
4614
4615   /* 2. Has this been recognized as a reduction pattern?
4616
4617      Check if STMT represents a pattern that has been recognized
4618      in earlier analysis stages.  For stmts that represent a pattern,
4619      the STMT_VINFO_RELATED_STMT field records the last stmt in
4620      the original sequence that constitutes the pattern.  */
4621
4622   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4623   if (orig_stmt)
4624     {
4625       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4626       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4627       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4628     }
4629
4630   /* 3. Check the operands of the operation.  The first operands are defined
4631         inside the loop body. The last operand is the reduction variable,
4632         which is defined by the loop-header-phi.  */
4633
4634   gcc_assert (is_gimple_assign (stmt));
4635
4636   /* Flatten RHS.  */
4637   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4638     {
4639     case GIMPLE_SINGLE_RHS:
4640       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4641       if (op_type == ternary_op)
4642         {
4643           tree rhs = gimple_assign_rhs1 (stmt);
4644           ops[0] = TREE_OPERAND (rhs, 0);
4645           ops[1] = TREE_OPERAND (rhs, 1);
4646           ops[2] = TREE_OPERAND (rhs, 2);
4647           code = TREE_CODE (rhs);
4648         }
4649       else
4650         return false;
4651       break;
4652
4653     case GIMPLE_BINARY_RHS:
4654       code = gimple_assign_rhs_code (stmt);
4655       op_type = TREE_CODE_LENGTH (code);
4656       gcc_assert (op_type == binary_op);
4657       ops[0] = gimple_assign_rhs1 (stmt);
4658       ops[1] = gimple_assign_rhs2 (stmt);
4659       break;
4660
4661     case GIMPLE_TERNARY_RHS:
4662       code = gimple_assign_rhs_code (stmt);
4663       op_type = TREE_CODE_LENGTH (code);
4664       gcc_assert (op_type == ternary_op);
4665       ops[0] = gimple_assign_rhs1 (stmt);
4666       ops[1] = gimple_assign_rhs2 (stmt);
4667       ops[2] = gimple_assign_rhs3 (stmt);
4668       break;
4669
4670     case GIMPLE_UNARY_RHS:
4671       return false;
4672
4673     default:
4674       gcc_unreachable ();
4675     }
4676
4677   if (code == COND_EXPR && slp_node)
4678     return false;
4679
4680   scalar_dest = gimple_assign_lhs (stmt);
4681   scalar_type = TREE_TYPE (scalar_dest);
4682   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4683       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4684     return false;
4685
4686   /* Do not try to vectorize bit-precision reductions.  */
4687   if ((TYPE_PRECISION (scalar_type)
4688        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4689     return false;
4690
4691   /* All uses but the last are expected to be defined in the loop.
4692      The last use is the reduction variable.  In case of nested cycle this
4693      assumption is not true: we use reduc_index to record the index of the
4694      reduction variable.  */
4695   for (i = 0; i < op_type-1; i++)
4696     {
4697       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4698       if (i == 0 && code == COND_EXPR)
4699         continue;
4700
4701       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4702                                             &def_stmt, &def, &dt, &tem);
4703       if (!vectype_in)
4704         vectype_in = tem;
4705       gcc_assert (is_simple_use);
4706
4707       if (dt != vect_internal_def
4708           && dt != vect_external_def
4709           && dt != vect_constant_def
4710           && dt != vect_induction_def
4711           && !(dt == vect_nested_cycle && nested_cycle))
4712         return false;
4713
4714       if (dt == vect_nested_cycle)
4715         {
4716           found_nested_cycle_def = true;
4717           reduc_def_stmt = def_stmt;
4718           reduc_index = i;
4719         }
4720     }
4721
4722   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4723                                         &def_stmt, &def, &dt, &tem);
4724   if (!vectype_in)
4725     vectype_in = tem;
4726   gcc_assert (is_simple_use);
4727   gcc_assert (dt == vect_reduction_def
4728               || dt == vect_nested_cycle
4729               || ((dt == vect_internal_def || dt == vect_external_def
4730                    || dt == vect_constant_def || dt == vect_induction_def)
4731                    && nested_cycle && found_nested_cycle_def));
4732   if (!found_nested_cycle_def)
4733     reduc_def_stmt = def_stmt;
4734
4735   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4736   if (orig_stmt)
4737     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4738                                                        reduc_def_stmt,
4739                                                        !nested_cycle,
4740                                                        &dummy));
4741   else
4742     {
4743       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4744                                              !nested_cycle, &dummy);
4745       /* We changed STMT to be the first stmt in reduction chain, hence we
4746          check that in this case the first element in the chain is STMT.  */
4747       gcc_assert (stmt == tmp
4748                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4749     }
4750
4751   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4752     return false;
4753
4754   if (slp_node || PURE_SLP_STMT (stmt_info))
4755     ncopies = 1;
4756   else
4757     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4758                / TYPE_VECTOR_SUBPARTS (vectype_in));
4759
4760   gcc_assert (ncopies >= 1);
4761
4762   vec_mode = TYPE_MODE (vectype_in);
4763
4764   if (code == COND_EXPR)
4765     {
4766       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4767         {
4768           if (dump_enabled_p ())
4769             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4770                              "unsupported condition in reduction");
4771
4772             return false;
4773         }
4774     }
4775   else
4776     {
4777       /* 4. Supportable by target?  */
4778
4779       /* 4.1. check support for the operation in the loop  */
4780       optab = optab_for_tree_code (code, vectype_in, optab_default);
4781       if (!optab)
4782         {
4783           if (dump_enabled_p ())
4784             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4785                              "no optab.");
4786
4787           return false;
4788         }
4789
4790       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4791         {
4792           if (dump_enabled_p ())
4793             dump_printf (MSG_NOTE, "op not supported by target.");
4794
4795           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4796               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4797                   < vect_min_worthwhile_factor (code))
4798             return false;
4799
4800           if (dump_enabled_p ())
4801             dump_printf (MSG_NOTE, "proceeding using word mode.");
4802         }
4803
4804       /* Worthwhile without SIMD support?  */
4805       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4806           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4807              < vect_min_worthwhile_factor (code))
4808         {
4809           if (dump_enabled_p ())
4810             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4811                              "not worthwhile without SIMD support.");
4812
4813           return false;
4814         }
4815     }
4816
4817   /* 4.2. Check support for the epilog operation.
4818
4819           If STMT represents a reduction pattern, then the type of the
4820           reduction variable may be different than the type of the rest
4821           of the arguments.  For example, consider the case of accumulation
4822           of shorts into an int accumulator; The original code:
4823                         S1: int_a = (int) short_a;
4824           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4825
4826           was replaced with:
4827                         STMT: int_acc = widen_sum <short_a, int_acc>
4828
4829           This means that:
4830           1. The tree-code that is used to create the vector operation in the
4831              epilog code (that reduces the partial results) is not the
4832              tree-code of STMT, but is rather the tree-code of the original
4833              stmt from the pattern that STMT is replacing.  I.e, in the example
4834              above we want to use 'widen_sum' in the loop, but 'plus' in the
4835              epilog.
4836           2. The type (mode) we use to check available target support
4837              for the vector operation to be created in the *epilog*, is
4838              determined by the type of the reduction variable (in the example
4839              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4840              However the type (mode) we use to check available target support
4841              for the vector operation to be created *inside the loop*, is
4842              determined by the type of the other arguments to STMT (in the
4843              example we'd check this: optab_handler (widen_sum_optab,
4844              vect_short_mode)).
4845
4846           This is contrary to "regular" reductions, in which the types of all
4847           the arguments are the same as the type of the reduction variable.
4848           For "regular" reductions we can therefore use the same vector type
4849           (and also the same tree-code) when generating the epilog code and
4850           when generating the code inside the loop.  */
4851
4852   if (orig_stmt)
4853     {
4854       /* This is a reduction pattern: get the vectype from the type of the
4855          reduction variable, and get the tree-code from orig_stmt.  */
4856       orig_code = gimple_assign_rhs_code (orig_stmt);
4857       gcc_assert (vectype_out);
4858       vec_mode = TYPE_MODE (vectype_out);
4859     }
4860   else
4861     {
4862       /* Regular reduction: use the same vectype and tree-code as used for
4863          the vector code inside the loop can be used for the epilog code. */
4864       orig_code = code;
4865     }
4866
4867   if (nested_cycle)
4868     {
4869       def_bb = gimple_bb (reduc_def_stmt);
4870       def_stmt_loop = def_bb->loop_father;
4871       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4872                                        loop_preheader_edge (def_stmt_loop));
4873       if (TREE_CODE (def_arg) == SSA_NAME
4874           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4875           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4876           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4877           && vinfo_for_stmt (def_arg_stmt)
4878           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4879               == vect_double_reduction_def)
4880         double_reduc = true;
4881     }
4882
4883   epilog_reduc_code = ERROR_MARK;
4884   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4885     {
4886       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4887                                          optab_default);
4888       if (!reduc_optab)
4889         {
4890           if (dump_enabled_p ())
4891             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4892                              "no optab for reduction.");
4893
4894           epilog_reduc_code = ERROR_MARK;
4895         }
4896
4897       if (reduc_optab
4898           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4899         {
4900           if (dump_enabled_p ())
4901             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4902                              "reduc op not supported by target.");
4903
4904           epilog_reduc_code = ERROR_MARK;
4905         }
4906     }
4907   else
4908     {
4909       if (!nested_cycle || double_reduc)
4910         {
4911           if (dump_enabled_p ())
4912             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4913                              "no reduc code for scalar code.");
4914
4915           return false;
4916         }
4917     }
4918
4919   if (double_reduc && ncopies > 1)
4920     {
4921       if (dump_enabled_p ())
4922         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4923                          "multiple types in double reduction");
4924
4925       return false;
4926     }
4927
4928   /* In case of widenning multiplication by a constant, we update the type
4929      of the constant to be the type of the other operand.  We check that the
4930      constant fits the type in the pattern recognition pass.  */
4931   if (code == DOT_PROD_EXPR
4932       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4933     {
4934       if (TREE_CODE (ops[0]) == INTEGER_CST)
4935         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4936       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4937         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4938       else
4939         {
4940           if (dump_enabled_p ())
4941             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4942                              "invalid types in dot-prod");
4943
4944           return false;
4945         }
4946     }
4947
4948   if (!vec_stmt) /* transformation not required.  */
4949     {
4950       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4951         return false;
4952       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4953       return true;
4954     }
4955
4956   /** Transform.  **/
4957
4958   if (dump_enabled_p ())
4959     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4960
4961   /* FORNOW: Multiple types are not supported for condition.  */
4962   if (code == COND_EXPR)
4963     gcc_assert (ncopies == 1);
4964
4965   /* Create the destination vector  */
4966   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4967
4968   /* In case the vectorization factor (VF) is bigger than the number
4969      of elements that we can fit in a vectype (nunits), we have to generate
4970      more than one vector stmt - i.e - we need to "unroll" the
4971      vector stmt by a factor VF/nunits.  For more details see documentation
4972      in vectorizable_operation.  */
4973
4974   /* If the reduction is used in an outer loop we need to generate
4975      VF intermediate results, like so (e.g. for ncopies=2):
4976         r0 = phi (init, r0)
4977         r1 = phi (init, r1)
4978         r0 = x0 + r0;
4979         r1 = x1 + r1;
4980     (i.e. we generate VF results in 2 registers).
4981     In this case we have a separate def-use cycle for each copy, and therefore
4982     for each copy we get the vector def for the reduction variable from the
4983     respective phi node created for this copy.
4984
4985     Otherwise (the reduction is unused in the loop nest), we can combine
4986     together intermediate results, like so (e.g. for ncopies=2):
4987         r = phi (init, r)
4988         r = x0 + r;
4989         r = x1 + r;
4990    (i.e. we generate VF/2 results in a single register).
4991    In this case for each copy we get the vector def for the reduction variable
4992    from the vectorized reduction operation generated in the previous iteration.
4993   */
4994
4995   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4996     {
4997       single_defuse_cycle = true;
4998       epilog_copies = 1;
4999     }
5000   else
5001     epilog_copies = ncopies;
5002
5003   prev_stmt_info = NULL;
5004   prev_phi_info = NULL;
5005   if (slp_node)
5006     {
5007       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5008       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5009                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5010     }
5011   else
5012     {
5013       vec_num = 1;
5014       vec_oprnds0.create (1);
5015       if (op_type == ternary_op)
5016         vec_oprnds1.create (1);
5017     }
5018
5019   phis.create (vec_num);
5020   vect_defs.create (vec_num);
5021   if (!slp_node)
5022     vect_defs.quick_push (NULL_TREE);
5023
5024   for (j = 0; j < ncopies; j++)
5025     {
5026       if (j == 0 || !single_defuse_cycle)
5027         {
5028           for (i = 0; i < vec_num; i++)
5029             {
5030               /* Create the reduction-phi that defines the reduction
5031                  operand.  */
5032               new_phi = create_phi_node (vec_dest, loop->header);
5033               set_vinfo_for_stmt (new_phi,
5034                                   new_stmt_vec_info (new_phi, loop_vinfo,
5035                                                      NULL));
5036                if (j == 0 || slp_node)
5037                  phis.quick_push (new_phi);
5038             }
5039         }
5040
5041       if (code == COND_EXPR)
5042         {
5043           gcc_assert (!slp_node);
5044           vectorizable_condition (stmt, gsi, vec_stmt,
5045                                   PHI_RESULT (phis[0]),
5046                                   reduc_index, NULL);
5047           /* Multiple types are not supported for condition.  */
5048           break;
5049         }
5050
5051       /* Handle uses.  */
5052       if (j == 0)
5053         {
5054           op0 = ops[!reduc_index];
5055           if (op_type == ternary_op)
5056             {
5057               if (reduc_index == 0)
5058                 op1 = ops[2];
5059               else
5060                 op1 = ops[1];
5061             }
5062
5063           if (slp_node)
5064             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5065                                slp_node, -1);
5066           else
5067             {
5068               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5069                                                             stmt, NULL);
5070               vec_oprnds0.quick_push (loop_vec_def0);
5071               if (op_type == ternary_op)
5072                {
5073                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5074                                                                NULL);
5075                  vec_oprnds1.quick_push (loop_vec_def1);
5076                }
5077             }
5078         }
5079       else
5080         {
5081           if (!slp_node)
5082             {
5083               enum vect_def_type dt;
5084               gimple dummy_stmt;
5085               tree dummy;
5086
5087               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5088                                   &dummy_stmt, &dummy, &dt);
5089               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5090                                                               loop_vec_def0);
5091               vec_oprnds0[0] = loop_vec_def0;
5092               if (op_type == ternary_op)
5093                 {
5094                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5095                                       &dummy, &dt);
5096                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5097                                                                 loop_vec_def1);
5098                   vec_oprnds1[0] = loop_vec_def1;
5099                 }
5100             }
5101
5102           if (single_defuse_cycle)
5103             reduc_def = gimple_assign_lhs (new_stmt);
5104
5105           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5106         }
5107
5108       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5109         {
5110           if (slp_node)
5111             reduc_def = PHI_RESULT (phis[i]);
5112           else
5113             {
5114               if (!single_defuse_cycle || j == 0)
5115                 reduc_def = PHI_RESULT (new_phi);
5116             }
5117
5118           def1 = ((op_type == ternary_op)
5119                   ? vec_oprnds1[i] : NULL);
5120           if (op_type == binary_op)
5121             {
5122               if (reduc_index == 0)
5123                 expr = build2 (code, vectype_out, reduc_def, def0);
5124               else
5125                 expr = build2 (code, vectype_out, def0, reduc_def);
5126             }
5127           else
5128             {
5129               if (reduc_index == 0)
5130                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5131               else
5132                 {
5133                   if (reduc_index == 1)
5134                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5135                   else
5136                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5137                 }
5138             }
5139
5140           new_stmt = gimple_build_assign (vec_dest, expr);
5141           new_temp = make_ssa_name (vec_dest, new_stmt);
5142           gimple_assign_set_lhs (new_stmt, new_temp);
5143           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5144
5145           if (slp_node)
5146             {
5147               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5148               vect_defs.quick_push (new_temp);
5149             }
5150           else
5151             vect_defs[0] = new_temp;
5152         }
5153
5154       if (slp_node)
5155         continue;
5156
5157       if (j == 0)
5158         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5159       else
5160         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5161
5162       prev_stmt_info = vinfo_for_stmt (new_stmt);
5163       prev_phi_info = vinfo_for_stmt (new_phi);
5164     }
5165
5166   /* Finalize the reduction-phi (set its arguments) and create the
5167      epilog reduction code.  */
5168   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5169     {
5170       new_temp = gimple_assign_lhs (*vec_stmt);
5171       vect_defs[0] = new_temp;
5172     }
5173
5174   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5175                                     epilog_reduc_code, phis, reduc_index,
5176                                     double_reduc, slp_node);
5177
5178   phis.release ();
5179   vec_oprnds0.release ();
5180   vec_oprnds1.release ();
5181
5182   return true;
5183 }
5184
5185 /* Function vect_min_worthwhile_factor.
5186
5187    For a loop where we could vectorize the operation indicated by CODE,
5188    return the minimum vectorization factor that makes it worthwhile
5189    to use generic vectors.  INT_MAX means CODE is never worthwhile.  */
5190 int
5191 vect_min_worthwhile_factor (enum tree_code code)
5192 {
5193   switch (code)
5194     {
5195     case PLUS_EXPR:
5196     case MINUS_EXPR:
5197     case NEGATE_EXPR:
5198       return 4;  /* Add, subtract and negate need a factor of 4.  */
5199
5200     case BIT_AND_EXPR:
5201     case BIT_IOR_EXPR:
5202     case BIT_XOR_EXPR:
5203     case BIT_NOT_EXPR:
5204       return 2;  /* Bitwise operations need only a factor of 2.  */
5205
5206     default:
5207       return INT_MAX;  /* Callers compare the VF against this.  */
5208     }
5209 }
5210
5211
5212 /* Function vectorizable_induction
5213    Check if PHI performs an induction computation that can be vectorized.
5214    If VEC_STMT is NULL, only analyze PHI: record its type and model its cost.
5215    Otherwise vectorize the induction PHI: create a vectorized phi to replace
5216    it, put it in VEC_STMT, and add it to the same basic block.  GSI is unused.
5217    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5218
5219 bool
5220 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5221                         gimple *vec_stmt)
5222 {
5223   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5224   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5225   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5226   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5227   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5228   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5229   tree vec_def;
5230   /* NCOPIES is the vectorization factor divided by NUNITS.  */
5231   gcc_assert (ncopies >= 1);
5232   /* FORNOW.  Restrictions for PHI nested in LOOP; should be relaxed.  */
5233   if (nested_in_vect_loop_p (loop, phi))
5234     {
5235       imm_use_iterator imm_iter;
5236       use_operand_p use_p;
5237       gimple exit_phi;
5238       edge latch_e;
5239       tree loop_arg;
5240
5241       if (ncopies > 1)
5242         {
5243           if (dump_enabled_p ())
5244             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5245                              "multiple types in nested loop.");
5246           return false;
5247         }
5248       /* Find a use of PHI's latch value outside the inner loop.  */
5249       exit_phi = NULL;
5250       latch_e = loop_latch_edge (loop->inner);
5251       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5252       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5253         {
5254           if (!flow_bb_inside_loop_p (loop->inner,
5255                                       gimple_bb (USE_STMT (use_p))))
5256             {
5257               exit_phi = USE_STMT (use_p);
5258               break;
5259             }
5260         }
5261       if (exit_phi)  /* Such a use must be relevant and not live.  */
5262         {
5263           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5264           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5265                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5266             {
5267               if (dump_enabled_p ())
5268                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5269                                  "inner-loop induction only used outside "
5270                                  "of the outer vectorized loop.");
5271               return false;
5272             }
5273         }
5274     }
5275
5276   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5277     return false;
5278
5279   /* FORNOW: SLP not supported.  */
5280   if (STMT_SLP_TYPE (stmt_info))
5281     return false;
5282
5283   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5284
5285   if (gimple_code (phi) != GIMPLE_PHI)
5286     return false;
5287
5288   if (!vec_stmt) /* Analysis only; transformation not required.  */
5289     {
5290       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5291       if (dump_enabled_p ())
5292         dump_printf_loc (MSG_NOTE, vect_location,
5293                          "=== vectorizable_induction ===");
5294       vect_model_induction_cost (stmt_info, ncopies);
5295       return true;
5296     }
5297
5298   /** Transform.  **/
5299
5300   if (dump_enabled_p ())
5301     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5302   /* Return in *VEC_STMT the stmt that defines the vector def.  */
5303   vec_def = get_initial_def_for_induction (phi);
5304   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5305   return true;
5306 }
5307
5308 /* Function vectorizable_live_operation.
5309    STMT computes a value that is used outside the loop.  Return TRUE if it
5310    can be supported, FALSE otherwise; FORNOW that requires every operand of
5311    STMT to be an external or constant def.  GSI and VEC_STMT are unused.  */
5312
5313 bool
5314 vectorizable_live_operation (gimple stmt,
5315                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5316                              gimple *vec_stmt ATTRIBUTE_UNUSED)
5317 {
5318   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5319   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5320   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5321   int i;
5322   int op_type;
5323   tree op;
5324   tree def;
5325   gimple def_stmt;
5326   enum vect_def_type dt;
5327   enum tree_code code;
5328   enum gimple_rhs_class rhs_class;
5329
5330   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5331   /* Stmts whose def type is a reduction are rejected here.  */
5332   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5333     return false;
5334   /* Only assignments whose LHS is an SSA name are supported.  */
5335   if (!is_gimple_assign (stmt))
5336     return false;
5337
5338   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5339     return false;
5340
5341   /* FORNOW. CHECKME. */
5342   if (nested_in_vect_loop_p (loop, stmt))
5343     return false;
5344
5345   code = gimple_assign_rhs_code (stmt);
5346   op_type = TREE_CODE_LENGTH (code);
5347   rhs_class = get_gimple_rhs_class (code);
5348   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5349   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5350
5351   /* FORNOW: support only if all uses are invariant.  This means
5352      that the scalar operations can remain in place, unvectorized.
5353      The original last scalar value that they compute will be used.  */
5354
5355   for (i = 0; i < op_type; i++)
5356     {
5357       if (rhs_class == GIMPLE_SINGLE_RHS)
5358         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5359       else
5360         op = gimple_op (stmt, i + 1);
5361       if (!op)  /* Absent operand: nothing to classify, DT is not set.  */
5362         continue;
5363       if (!vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5364                                &dt))
5365         {
5366           if (dump_enabled_p ())
5367             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5368                              "use not simple.");
5369           return false;
5370         }
5371       if (dt != vect_external_def && dt != vect_constant_def)
5372         return false;
5373     }
5374
5375   /* No transformation is required for the cases we currently support.  */
5376   return true;
5377 }
5378
5379 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5380
5381 static void
5382 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5383 {
5384   ssa_op_iter op_iter;
5385   imm_use_iterator imm_iter;
5386   def_operand_p def_p;
5387   gimple ustmt;
5388   /* Walk every def of STMT and each stmt that uses it.  */
5389   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5390     {
5391       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5392         {
5393           basic_block bb;
5394
5395           if (!is_gimple_debug (ustmt))
5396             continue;
5397
5398           bb = gimple_bb (ustmt);
5399           /* Only debug uses located outside LOOP are reset.  */
5400           if (!flow_bb_inside_loop_p (loop, bb))
5401             {
5402               if (gimple_debug_bind_p (ustmt))
5403                 {
5404                   if (dump_enabled_p ())
5405                     dump_printf_loc (MSG_NOTE, vect_location,
5406                                      "killing debug use");
5407                   /* Reset the bound value, then update the stmt.  */
5408                   gimple_debug_bind_reset_value (ustmt);
5409                   update_stmt (ustmt);
5410                 }
5411               else
5412                 gcc_unreachable ();  /* Only debug binds expected.  */
5413             }
5414         }
5415     }
5416 }
5417
5418 /* Function vect_transform_loop.
5419
5420    The analysis phase has determined that the loop is vectorizable.
5421    Vectorize the loop - created vectorized stmts to replace the scalar
5422    stmts in the loop, and update the loop exit condition.  */
5423
5424 void
5425 vect_transform_loop (loop_vec_info loop_vinfo)
5426 {
5427   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5428   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5429   int nbbs = loop->num_nodes;
5430   gimple_stmt_iterator si;
5431   int i;
5432   tree ratio = NULL;
5433   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5434   bool grouped_store;
5435   bool slp_scheduled = false;
5436   unsigned int nunits;
5437   gimple stmt, pattern_stmt;
5438   gimple_seq pattern_def_seq = NULL;
5439   gimple_stmt_iterator pattern_def_si = gsi_none ();
5440   bool transform_pattern_stmt = false;
5441   bool check_profitability = false;
5442   int th;
5443   /* Record number of iterations before we started tampering with the profile. */
5444   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5445
5446   if (dump_enabled_p ())
5447     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5448
5449   /* If profile is inprecise, we have chance to fix it up.  */
5450   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5451     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5452
5453   /* Use the more conservative vectorization threshold.  If the number
5454      of iterations is constant assume the cost check has been performed
5455      by our caller.  If the threshold makes all loops profitable that
5456      run at least the vectorization factor number of times checking
5457      is pointless, too.  */
5458   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5459          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5460   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5461   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5462       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5463     {
5464       if (dump_enabled_p ())
5465         dump_printf_loc (MSG_NOTE, vect_location,
5466                          "Profitability threshold is %d loop iterations.", th);
5467       check_profitability = true;
5468     }
5469
5470   /* Peel the loop if there are data refs with unknown alignment.
5471      Only one data ref with unknown store is allowed.  */
5472
5473   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5474     {
5475       vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5476       check_profitability = false;
5477     }
5478
5479   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5480       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5481     {
5482       vect_loop_versioning (loop_vinfo, th, check_profitability);
5483       check_profitability = false;
5484     }
5485
5486   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5487      compile time constant), or it is a constant that doesn't divide by the
5488      vectorization factor, then an epilog loop needs to be created.
5489      We therefore duplicate the loop: the original loop will be vectorized,
5490      and will compute the first (n/VF) iterations.  The second copy of the loop
5491      will remain scalar and will compute the remaining (n%VF) iterations.
5492      (VF is the vectorization factor).  */
5493
5494   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5495        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5496            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5497        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5498     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5499                                     th, check_profitability);
5500   else
5501     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5502                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5503
5504   /* 1) Make sure the loop header has exactly two entries
5505      2) Make sure we have a preheader basic block.  */
5506
5507   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5508
5509   split_edge (loop_preheader_edge (loop));
5510
5511   /* FORNOW: the vectorizer supports only loops whose body consists
5512      of one basic block (header + empty latch).  When the vectorizer
5513      supports more involved loop forms, the order in which the BBs are
5514      traversed needs to be reconsidered.  */
5515
5516   for (i = 0; i < nbbs; i++)
5517     {
5518       basic_block bb = bbs[i];
5519       stmt_vec_info stmt_info;
5520       gimple phi;
5521
5522       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5523         {
5524           phi = gsi_stmt (si);
5525           if (dump_enabled_p ())
5526             {
5527               dump_printf_loc (MSG_NOTE, vect_location,
5528                                "------>vectorizing phi: ");
5529               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5530             }
5531           stmt_info = vinfo_for_stmt (phi);
5532           if (!stmt_info)
5533             continue;
5534
5535           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5536             vect_loop_kill_debug_uses (loop, phi);
5537
5538           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5539               && !STMT_VINFO_LIVE_P (stmt_info))
5540             continue;
5541
5542           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5543                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5544               && dump_enabled_p ())
5545             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5546
5547           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5548             {
5549               if (dump_enabled_p ())
5550                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5551               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5552             }
5553         }
5554
5555       pattern_stmt = NULL;
5556       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5557         {
5558           bool is_store;
5559
5560           if (transform_pattern_stmt)
5561             stmt = pattern_stmt;
5562           else
5563             stmt = gsi_stmt (si);
5564
5565           if (dump_enabled_p ())
5566             {
5567               dump_printf_loc (MSG_NOTE, vect_location,
5568                                "------>vectorizing statement: ");
5569               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5570             }
5571
5572           stmt_info = vinfo_for_stmt (stmt);
5573
5574           /* vector stmts created in the outer-loop during vectorization of
5575              stmts in an inner-loop may not have a stmt_info, and do not
5576              need to be vectorized.  */
5577           if (!stmt_info)
5578             {
5579               gsi_next (&si);
5580               continue;
5581             }
5582
5583           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5584             vect_loop_kill_debug_uses (loop, stmt);
5585
5586           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5587               && !STMT_VINFO_LIVE_P (stmt_info))
5588             {
5589               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5590                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5591                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5592                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5593                 {
5594                   stmt = pattern_stmt;
5595                   stmt_info = vinfo_for_stmt (stmt);
5596                 }
5597               else
5598                 {
5599                   gsi_next (&si);
5600                   continue;
5601                 }
5602             }
5603           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5604                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5605                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5606                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5607             transform_pattern_stmt = true;
5608
5609           /* If pattern statement has def stmts, vectorize them too.  */
5610           if (is_pattern_stmt_p (stmt_info))
5611             {
5612               if (pattern_def_seq == NULL)
5613                 {
5614                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5615                   pattern_def_si = gsi_start (pattern_def_seq);
5616                 }
5617               else if (!gsi_end_p (pattern_def_si))
5618                 gsi_next (&pattern_def_si);
5619               if (pattern_def_seq != NULL)
5620                 {
5621                   gimple pattern_def_stmt = NULL;
5622                   stmt_vec_info pattern_def_stmt_info = NULL;
5623
5624                   while (!gsi_end_p (pattern_def_si))
5625                     {
5626                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5627                       pattern_def_stmt_info
5628                         = vinfo_for_stmt (pattern_def_stmt);
5629                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5630                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5631                         break;
5632                       gsi_next (&pattern_def_si);
5633                     }
5634
5635                   if (!gsi_end_p (pattern_def_si))
5636                     {
5637                       if (dump_enabled_p ())
5638                         {
5639                           dump_printf_loc (MSG_NOTE, vect_location,
5640                                            "==> vectorizing pattern def "
5641                                            "stmt: ");
5642                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5643                                             pattern_def_stmt, 0);
5644                         }
5645
5646                       stmt = pattern_def_stmt;
5647                       stmt_info = pattern_def_stmt_info;
5648                     }
5649                   else
5650                     {
5651                       pattern_def_si = gsi_none ();
5652                       transform_pattern_stmt = false;
5653                     }
5654                 }
5655               else
5656                 transform_pattern_stmt = false;
5657             }
5658
5659           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5660           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5661                                                STMT_VINFO_VECTYPE (stmt_info));
5662           if (!STMT_SLP_TYPE (stmt_info)
5663               && nunits != (unsigned int) vectorization_factor
5664               && dump_enabled_p ())
5665             /* For SLP VF is set according to unrolling factor, and not to
5666                vector size, hence for SLP this print is not valid.  */
5667             dump_printf_loc (MSG_NOTE, vect_location,
5668                              "multiple-types.");
5669
5670           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5671              reached.  */
5672           if (STMT_SLP_TYPE (stmt_info))
5673             {
5674               if (!slp_scheduled)
5675                 {
5676                   slp_scheduled = true;
5677
5678                   if (dump_enabled_p ())
5679                     dump_printf_loc (MSG_NOTE, vect_location,
5680                                      "=== scheduling SLP instances ===");
5681
5682                   vect_schedule_slp (loop_vinfo, NULL);
5683                 }
5684
5685               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5686               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5687                 {
5688                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5689                     {
5690                       pattern_def_seq = NULL;
5691                       gsi_next (&si);
5692                     }
5693                   continue;
5694                 }
5695             }
5696
5697           /* -------- vectorize statement ------------ */
5698           if (dump_enabled_p ())
5699             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5700
5701           grouped_store = false;
5702           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5703           if (is_store)
5704             {
5705               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5706                 {
5707                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5708                      interleaving chain was completed - free all the stores in
5709                      the chain.  */
5710                   gsi_next (&si);
5711                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5712                   continue;
5713                 }
5714               else
5715                 {
5716                   /* Free the attached stmt_vec_info and remove the stmt.  */
5717                   gimple store = gsi_stmt (si);
5718                   free_stmt_vec_info (store);
5719                   unlink_stmt_vdef (store);
5720                   gsi_remove (&si, true);
5721                   release_defs (store);
5722                   continue;
5723                 }
5724             }
5725
5726           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5727             {
5728               pattern_def_seq = NULL;
5729               gsi_next (&si);
5730             }
5731         }                       /* stmts in BB */
5732     }                           /* BBs in loop */
5733
5734   slpeel_make_loop_iterate_ntimes (loop, ratio);
5735
5736   /* Reduce loop iterations by the vectorization factor.  */
5737   scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
5738                       expected_iterations / vectorization_factor);
5739   loop->nb_iterations_upper_bound
5740     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5741                                             FLOOR_DIV_EXPR);
5742   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5743       && loop->nb_iterations_upper_bound != double_int_zero)
5744     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5745   if (loop->any_estimate)
5746     {
5747       loop->nb_iterations_estimate
5748         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5749                                              FLOOR_DIV_EXPR);
5750        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5751            && loop->nb_iterations_estimate != double_int_zero)
5752          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5753     }
5754
5755   /* The memory tags and pointers in vectorized statements need to
5756      have their SSA forms updated.  FIXME, why can't this be delayed
5757      until all the loops have been transformed?  */
5758   update_ssa (TODO_update_ssa);
5759
5760   if (dump_enabled_p ())
5761     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5762   if (loop->inner && dump_enabled_p ())
5763     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5764                      "OUTER LOOP VECTORIZED.");
5765 }