tree-vect-loop.c (get_initial_def_for_induction): Properly generate vector constants.
1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
43
44 /* Loop Vectorization Pass.
45
46 This pass tries to vectorize loops.
47
48 For example, the vectorizer transforms the following simple loop:
49
50 short a[N]; short b[N]; short c[N]; int i;
51
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
54 }
55
 56 as if it had been manually vectorized by rewriting the source code into:
57
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
62
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
68 }
69
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
 78 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
81
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
87
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
92
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
103
104 For example, say stmt S1 was vectorized into stmt VS1:
105
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
109
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
114
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
119
 120 Operands that are not SSA_NAMEs are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
122
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 127 Targets that can support different sizes of vectors will, for now, need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
130
 131 Since we only vectorize operations whose vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
137
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
140 */
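/* For illustration only: a minimal sketch of the optab query described
   in the "Target modeling" section above (this is not code taken from
   the pass itself, just the general shape of such a check):

     enum insn_code icode = optab_handler (add_optab, V8HImode);
     if (icode == CODE_FOR_nothing)
       {
         /* No target support - the addition cannot be vectorized
            with 8 x 16-bit lanes.  */
       }
*/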
141
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
 148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
 149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
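/* Worked example (hypothetical 16-byte vector target; the numbers are
   illustrative, not a statement about any particular port):

     int a[N]; short b[N];
     ...
     a[i] = a[i] + b[i];

   The smallest scalar type in the stmt is 'short' (2 bytes), giving a
   V8HI vectype with 8 units, while 'int' gives V4SI with 4 units.  VF
   is taken as the maximum number of units seen in the loop, here 8, so
   one vector iteration covers 8 scalar iterations (the 4-byte parts
   are handled with two V4SI operations per vector iteration).  */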
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
188
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
192
193 for (i = 0; i < nbbs; i++)
194 {
195 basic_block bb = bbs[i];
196
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
198 {
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
202 {
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
205 }
206
207 gcc_assert (stmt_info);
208
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
210 {
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
213
214 if (dump_enabled_p ())
215 {
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
219 }
220
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
223 {
224 if (dump_enabled_p ())
225 {
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
231 }
232 return false;
233 }
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
235
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
240 }
241
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
245
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
249 }
250 }
251
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
253 {
254 tree vf_vectype;
255
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
260
261 stmt_info = vinfo_for_stmt (stmt);
262
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
268 }
269
270 gcc_assert (stmt_info);
271
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
275 {
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
277 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
280 {
281 stmt = pattern_stmt;
282 stmt_info = vinfo_for_stmt (pattern_stmt);
283 if (dump_enabled_p ())
284 {
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 }
289 }
290 else
291 {
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
294 gsi_next (&si);
295 continue;
296 }
297 }
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 analyze_pattern_stmt = true;
303
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info))
306 {
307 if (pattern_def_seq == NULL)
308 {
309 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
310 pattern_def_si = gsi_start (pattern_def_seq);
311 }
312 else if (!gsi_end_p (pattern_def_si))
313 gsi_next (&pattern_def_si);
314 if (pattern_def_seq != NULL)
315 {
316 gimple pattern_def_stmt = NULL;
317 stmt_vec_info pattern_def_stmt_info = NULL;
318
319 while (!gsi_end_p (pattern_def_si))
320 {
321 pattern_def_stmt = gsi_stmt (pattern_def_si);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
326 break;
327 gsi_next (&pattern_def_si);
328 }
329
330 if (!gsi_end_p (pattern_def_si))
331 {
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_NOTE, vect_location,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
337 pattern_def_stmt, 0);
338 }
339
340 stmt = pattern_def_stmt;
341 stmt_info = pattern_def_stmt_info;
342 }
343 else
344 {
345 pattern_def_si = gsi_none ();
346 analyze_pattern_stmt = false;
347 }
348 }
349 else
350 analyze_pattern_stmt = false;
351 }
352
353 if (gimple_get_lhs (stmt) == NULL_TREE)
354 {
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
360 0);
361 }
362 return false;
363 }
364
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
366 {
367 if (dump_enabled_p ())
368 {
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
372 }
373 return false;
374 }
375
376 if (STMT_VINFO_VECTYPE (stmt_info))
377 {
 378 /* The only case when a vectype has already been set is for stmts
379 that contain a dataref, or for "pattern-stmts" (stmts
380 generated by the vectorizer to represent/replace a certain
381 idiom). */
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
383 || is_pattern_stmt_p (stmt_info)
384 || !gsi_end_p (pattern_def_si));
385 vectype = STMT_VINFO_VECTYPE (stmt_info);
386 }
387 else
388 {
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
390 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
391 if (dump_enabled_p ())
392 {
393 dump_printf_loc (MSG_NOTE, vect_location,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
396 }
397 vectype = get_vectype_for_scalar_type (scalar_type);
398 if (!vectype)
399 {
400 if (dump_enabled_p ())
401 {
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "not vectorized: unsupported "
404 "data-type ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 scalar_type);
407 }
408 return false;
409 }
410
411 STMT_VINFO_VECTYPE (stmt_info) = vectype;
412
413 if (dump_enabled_p ())
414 {
415 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
416 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
417 }
418 }
419
 420 /* The vectorization factor is determined by the smallest
421 scalar type (or the largest vector size, but we only
422 support one vector size per loop). */
423 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
424 &dummy);
425 if (dump_enabled_p ())
426 {
427 dump_printf_loc (MSG_NOTE, vect_location,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
430 }
431 vf_vectype = get_vectype_for_scalar_type (scalar_type);
432 if (!vf_vectype)
433 {
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
439 scalar_type);
440 }
441 return false;
442 }
443
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
446 {
447 if (dump_enabled_p ())
448 {
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
453 vectype);
454 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
456 vf_vectype);
457 }
458 return false;
459 }
460
461 if (dump_enabled_p ())
462 {
463 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
465 }
466
467 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
470 if (!vectorization_factor
471 || (nunits > vectorization_factor))
472 vectorization_factor = nunits;
473
474 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
475 {
476 pattern_def_seq = NULL;
477 gsi_next (&si);
478 }
479 }
480 }
481
482 /* TODO: Analyze cost. Decide if worth while to vectorize. */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
485 vectorization_factor);
486 if (vectorization_factor <= 1)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "not vectorized: unsupported data-type");
491 return false;
492 }
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
494
495 return true;
496 }
497
498
499 /* Function vect_is_simple_iv_evolution.
500
 501 FORNOW: A simple evolution of an induction variable in the loop is
502 considered a polynomial evolution with constant step. */
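/* For example (illustrative), for the loop

     for (i = k; i < n; i += 3)

   the evolution of 'i' is the polynomial chrec {k, +, 3}_1: the initial
   value is 'k' and the step is the constant 3, so the evolution is
   "simple".  If the step were not a compile-time constant (e.g.
   'i += s'), or the evolution were a chrec of degree >= 2, the function
   below would reject it.  */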
503
504 static bool
505 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
506 tree * step)
507 {
508 tree init_expr;
509 tree step_expr;
510 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
511
512 /* When there is no evolution in this loop, the evolution function
513 is not "simple". */
514 if (evolution_part == NULL_TREE)
515 return false;
516
517 /* When the evolution is a polynomial of degree >= 2
518 the evolution function is not "simple". */
519 if (tree_is_chrec (evolution_part))
520 return false;
521
522 step_expr = evolution_part;
523 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
524
525 if (dump_enabled_p ())
526 {
527 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
528 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
529 dump_printf (MSG_NOTE, ", init: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
531 }
532
533 *init = init_expr;
534 *step = step_expr;
535
536 if (TREE_CODE (step_expr) != INTEGER_CST)
537 {
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "step unknown.");
541 return false;
542 }
543
544 return true;
545 }
546
547 /* Function vect_analyze_scalar_cycles_1.
548
549 Examine the cross iteration def-use cycles of scalar variables
550 in LOOP. LOOP_VINFO represents the loop that is now being
551 considered for vectorization (can be LOOP, or an outer-loop
552 enclosing LOOP). */
553
554 static void
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
556 {
557 basic_block bb = loop->header;
558 tree dumy;
559 vec<gimple> worklist;
560 worklist.create (64);
561 gimple_stmt_iterator gsi;
562 bool double_reduc;
563
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "=== vect_analyze_scalar_cycles ===");
567
568 /* First - identify all inductions. Reduction detection assumes that all the
569 inductions have been identified, therefore, this order must not be
570 changed. */
571 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
572 {
573 gimple phi = gsi_stmt (gsi);
574 tree access_fn = NULL;
575 tree def = PHI_RESULT (phi);
576 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
577
578 if (dump_enabled_p ())
579 {
580 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
582 }
583
 584 /* Skip virtual phis. The data dependences that are associated with
585 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
586 if (virtual_operand_p (def))
587 continue;
588
589 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
590
591 /* Analyze the evolution function. */
592 access_fn = analyze_scalar_evolution (loop, def);
593 if (access_fn)
594 {
595 STRIP_NOPS (access_fn);
596 if (dump_enabled_p ())
597 {
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
601 }
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
603 = evolution_part_in_loop_num (access_fn, loop->num);
604 }
605
606 if (!access_fn
607 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
608 {
609 worklist.safe_push (phi);
610 continue;
611 }
612
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
614
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
618 }
619
620
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist.length () > 0)
623 {
624 gimple phi = worklist.pop ();
625 tree def = PHI_RESULT (phi);
626 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
627 gimple reduc_stmt;
628 bool nested_cycle;
629
630 if (dump_enabled_p ())
631 {
632 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
634 }
635
636 gcc_assert (!virtual_operand_p (def)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
638
639 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
640 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
641 &double_reduc);
642 if (reduc_stmt)
643 {
644 if (double_reduc)
645 {
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "Detected double reduction.");
649
650 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
652 vect_double_reduction_def;
653 }
654 else
655 {
656 if (nested_cycle)
657 {
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected vectorizable nested cycle.");
661
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_nested_cycle;
665 }
666 else
667 {
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE, vect_location,
670 "Detected reduction.");
671
672 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
674 vect_reduction_def;
675 /* Store the reduction cycles for possible vectorization in
676 loop-aware SLP. */
677 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
678 }
679 }
680 }
681 else
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
684 "Unknown def-use cycle pattern.");
685 }
686
687 worklist.release ();
688 }
689
690
691 /* Function vect_analyze_scalar_cycles.
692
693 Examine the cross iteration def-use cycles of scalar variables, by
694 analyzing the loop-header PHIs of scalar variables. Classify each
695 cycle as one of the following: invariant, induction, reduction, unknown.
 696 We do that for the loop represented by LOOP_VINFO, and also for its
 697 inner-loop, if it exists.
698 Examples for scalar cycles:
699
700 Example1: reduction:
701
702 loop1:
703 for (i=0; i<N; i++)
704 sum += a[i];
705
706 Example2: induction:
707
708 loop2:
709 for (i=0; i<N; i++)
710 a[i] = i; */
711
712 static void
713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
714 {
715 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
716
717 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
718
719 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
720 Reductions in such inner-loop therefore have different properties than
721 the reductions in the nest that gets vectorized:
722 1. When vectorized, they are executed in the same order as in the original
723 scalar loop, so we can't change the order of computation when
724 vectorizing them.
725 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
726 current checks are too strict. */
727
728 if (loop->inner)
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
730 }
731
732 /* Function vect_get_loop_niters.
733
 734 Determine how many iterations the loop executes.
735 If an expression that represents the number of iterations
736 can be constructed, place it in NUMBER_OF_ITERATIONS.
737 Return the loop exit condition. */
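/* For illustration: for a simple counted loop

     for (i = 0; i < n; i++)
       a[i] = b[i];

   the expression stored in NUMBER_OF_ITERATIONS is (equivalent to) 'n',
   and the statement returned is the GIMPLE_COND that controls the loop
   exit.  (Illustrative only; the exact expression depends on how the
   loop bound is written.)  */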
738
739 static gimple
740 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
741 {
742 tree niters;
743
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "=== get_loop_niters ===");
747 niters = number_of_exit_cond_executions (loop);
748
749 if (niters != NULL_TREE
750 && niters != chrec_dont_know)
751 {
752 *number_of_iterations = niters;
753
754 if (dump_enabled_p ())
755 {
756 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
758 }
759 }
760
761 return get_loop_exit_condition (loop);
762 }
763
764
765 /* Function bb_in_loop_p
766
767 Used as predicate for dfs order traversal of the loop bbs. */
768
769 static bool
770 bb_in_loop_p (const_basic_block bb, const void *data)
771 {
772 const struct loop *const loop = (const struct loop *)data;
773 if (flow_bb_inside_loop_p (loop, bb))
774 return true;
775 return false;
776 }
777
778
779 /* Function new_loop_vec_info.
780
781 Create and initialize a new loop_vec_info struct for LOOP, as well as
782 stmt_vec_info structs for all the stmts in LOOP. */
783
784 static loop_vec_info
785 new_loop_vec_info (struct loop *loop)
786 {
787 loop_vec_info res;
788 basic_block *bbs;
789 gimple_stmt_iterator si;
790 unsigned int i, nbbs;
791
792 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
793 LOOP_VINFO_LOOP (res) = loop;
794
795 bbs = get_loop_body (loop);
796
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i = 0; i < loop->num_nodes; i++)
799 {
800 basic_block bb = bbs[i];
801
802 /* BBs in a nested inner-loop will have been already processed (because
803 we will have called vect_analyze_loop_form for any nested inner-loop).
804 Therefore, for stmts in an inner-loop we just want to update the
805 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
806 loop_info of the outer-loop we are currently considering to vectorize
807 (instead of the loop_info of the inner-loop).
808 For stmts in other BBs we need to create a stmt_info from scratch. */
809 if (bb->loop_father != loop)
810 {
811 /* Inner-loop bb. */
812 gcc_assert (loop->inner && bb->loop_father == loop->inner);
813 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
814 {
815 gimple phi = gsi_stmt (si);
816 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
817 loop_vec_info inner_loop_vinfo =
818 STMT_VINFO_LOOP_VINFO (stmt_info);
819 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
820 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
821 }
822 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
823 {
824 gimple stmt = gsi_stmt (si);
825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
826 loop_vec_info inner_loop_vinfo =
827 STMT_VINFO_LOOP_VINFO (stmt_info);
828 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
829 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
830 }
831 }
832 else
833 {
834 /* bb in current nest. */
835 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
836 {
837 gimple phi = gsi_stmt (si);
838 gimple_set_uid (phi, 0);
839 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
840 }
841
842 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
843 {
844 gimple stmt = gsi_stmt (si);
845 gimple_set_uid (stmt, 0);
846 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
847 }
848 }
849 }
850
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
 853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
855
856 free (bbs);
857 bbs = XCNEWVEC (basic_block, loop->num_nodes);
858 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
859 bbs, loop->num_nodes, loop);
860 gcc_assert (nbbs == loop->num_nodes);
861
862 LOOP_VINFO_BBS (res) = bbs;
863 LOOP_VINFO_NITERS (res) = NULL;
864 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
868 LOOP_VINFO_VECT_FACTOR (res) = 0;
869 LOOP_VINFO_LOOP_NEST (res).create (3);
870 LOOP_VINFO_DATAREFS (res).create (10);
871 LOOP_VINFO_DDRS (res).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
877 LOOP_VINFO_GROUPED_STORES (res).create (10);
878 LOOP_VINFO_REDUCTIONS (res).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
882 LOOP_VINFO_PEELING_HTAB (res) = NULL;
883 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
884 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
885 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
886
887 return res;
888 }
889
890
891 /* Function destroy_loop_vec_info.
892
893 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
894 stmts in the loop. */
895
896 void
897 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
898 {
899 struct loop *loop;
900 basic_block *bbs;
901 int nbbs;
902 gimple_stmt_iterator si;
903 int j;
904 vec<slp_instance> slp_instances;
905 slp_instance instance;
906 bool swapped;
907
908 if (!loop_vinfo)
909 return;
910
911 loop = LOOP_VINFO_LOOP (loop_vinfo);
912
913 bbs = LOOP_VINFO_BBS (loop_vinfo);
914 nbbs = clean_stmts ? loop->num_nodes : 0;
915 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
916
917 for (j = 0; j < nbbs; j++)
918 {
919 basic_block bb = bbs[j];
920 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
921 free_stmt_vec_info (gsi_stmt (si));
922
923 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
924 {
925 gimple stmt = gsi_stmt (si);
926
927 /* We may have broken canonical form by moving a constant
928 into RHS1 of a commutative op. Fix such occurrences. */
929 if (swapped && is_gimple_assign (stmt))
930 {
931 enum tree_code code = gimple_assign_rhs_code (stmt);
932
933 if ((code == PLUS_EXPR
934 || code == POINTER_PLUS_EXPR
935 || code == MULT_EXPR)
936 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
937 swap_tree_operands (stmt,
938 gimple_assign_rhs1_ptr (stmt),
939 gimple_assign_rhs2_ptr (stmt));
940 }
941
942 /* Free stmt_vec_info. */
943 free_stmt_vec_info (stmt);
944 gsi_next (&si);
945 }
946 }
947
948 free (LOOP_VINFO_BBS (loop_vinfo));
949 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
950 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
951 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
952 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
953 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
954 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
955 FOR_EACH_VEC_ELT (slp_instances, j, instance)
956 vect_free_slp_instance (instance);
957
958 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
959 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
960 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
961 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
962
963 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
964 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
965
966 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
967
968 free (loop_vinfo);
969 loop->aux = NULL;
970 }
971
972
973 /* Function vect_analyze_loop_1.
974
975 Apply a set of analyses on LOOP, and create a loop_vec_info struct
976 for it. The different analyses will record information in the
977 loop_vec_info struct. This is a subset of the analyses applied in
978 vect_analyze_loop, to be applied on an inner-loop nested in the loop
979 that is now considered for (outer-loop) vectorization. */
980
981 static loop_vec_info
982 vect_analyze_loop_1 (struct loop *loop)
983 {
984 loop_vec_info loop_vinfo;
985
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_NOTE, vect_location,
988 "===== analyze_loop_nest_1 =====");
989
 990 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
991
992 loop_vinfo = vect_analyze_loop_form (loop);
993 if (!loop_vinfo)
994 {
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
997 "bad inner-loop form.");
998 return NULL;
999 }
1000
1001 return loop_vinfo;
1002 }
1003
1004
1005 /* Function vect_analyze_loop_form.
1006
1007 Verify that certain CFG restrictions hold, including:
1008 - the loop has a pre-header
1009 - the loop has a single entry and exit
1010 - the loop exit condition is simple enough, and the number of iterations
1011 can be analyzed (a countable loop). */
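/* For illustration, a loop such as

     for (i = 0; i < n; i++)
       {
         if (a[i] == 0)
           break;      /* second exit */
         a[i] = b[i];
       }

   is rejected below because it has multiple exits, whereas the same
   loop without the 'break' satisfies the single-entry/single-exit
   requirement.  (Hypothetical example, not from a testcase.)  */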
1012
1013 loop_vec_info
1014 vect_analyze_loop_form (struct loop *loop)
1015 {
1016 loop_vec_info loop_vinfo;
1017 gimple loop_cond;
1018 tree number_of_iterations = NULL;
1019 loop_vec_info inner_loop_vinfo = NULL;
1020
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_NOTE, vect_location,
1023 "=== vect_analyze_loop_form ===");
1024
1025 /* Different restrictions apply when we are considering an inner-most loop,
1026 vs. an outer (nested) loop.
1027 (FORNOW. May want to relax some of these restrictions in the future). */
1028
1029 if (!loop->inner)
1030 {
1031 /* Inner-most loop. We currently require that the number of BBs is
1032 exactly 2 (the header and latch). Vectorizable inner-most loops
1033 look like this:
1034
1035 (pre-header)
1036 |
1037 header <--------+
1038 | | |
1039 | +--> latch --+
1040 |
1041 (exit-bb) */
1042
1043 if (loop->num_nodes != 2)
1044 {
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1047 "not vectorized: control flow in loop.");
1048 return NULL;
1049 }
1050
1051 if (empty_block_p (loop->header))
1052 {
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "not vectorized: empty loop.");
1056 return NULL;
1057 }
1058 }
1059 else
1060 {
1061 struct loop *innerloop = loop->inner;
1062 edge entryedge;
1063
1064 /* Nested loop. We currently require that the loop is doubly-nested,
1065 contains a single inner loop, and the number of BBs is exactly 5.
1066 Vectorizable outer-loops look like this:
1067
1068 (pre-header)
1069 |
1070 header <---+
1071 | |
1072 inner-loop |
1073 | |
1074 tail ------+
1075 |
1076 (exit-bb)
1077
1078 The inner-loop has the properties expected of inner-most loops
1079 as described above. */
1080
1081 if ((loop->inner)->inner || (loop->inner)->next)
1082 {
1083 if (dump_enabled_p ())
1084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1085 "not vectorized: multiple nested loops.");
1086 return NULL;
1087 }
1088
1089 /* Analyze the inner-loop. */
1090 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1091 if (!inner_loop_vinfo)
1092 {
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095 "not vectorized: Bad inner loop.");
1096 return NULL;
1097 }
1098
1099 if (!expr_invariant_in_loop_p (loop,
1100 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1101 {
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1104 "not vectorized: inner-loop count not invariant.");
1105 destroy_loop_vec_info (inner_loop_vinfo, true);
1106 return NULL;
1107 }
1108
1109 if (loop->num_nodes != 5)
1110 {
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: control flow in loop.");
1114 destroy_loop_vec_info (inner_loop_vinfo, true);
1115 return NULL;
1116 }
1117
1118 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1119 entryedge = EDGE_PRED (innerloop->header, 0);
1120 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1121 entryedge = EDGE_PRED (innerloop->header, 1);
1122
1123 if (entryedge->src != loop->header
1124 || !single_exit (innerloop)
1125 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1126 {
1127 if (dump_enabled_p ())
1128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129 "not vectorized: unsupported outerloop form.");
1130 destroy_loop_vec_info (inner_loop_vinfo, true);
1131 return NULL;
1132 }
1133
1134 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_NOTE, vect_location,
1136 "Considering outer-loop vectorization.");
1137 }
1138
1139 if (!single_exit (loop)
1140 || EDGE_COUNT (loop->header->preds) != 2)
1141 {
1142 if (dump_enabled_p ())
1143 {
1144 if (!single_exit (loop))
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "not vectorized: multiple exits.");
1147 else if (EDGE_COUNT (loop->header->preds) != 2)
1148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1149 "not vectorized: too many incoming edges.");
1150 }
1151 if (inner_loop_vinfo)
1152 destroy_loop_vec_info (inner_loop_vinfo, true);
1153 return NULL;
1154 }
1155
 1156 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1157 that the loop is represented as a do-while (with a proper if-guard
1158 before the loop if needed), where the loop header contains all the
1159 executable statements, and the latch is empty. */
1160 if (!empty_block_p (loop->latch)
1161 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1162 {
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "not vectorized: latch block not empty.");
1166 if (inner_loop_vinfo)
1167 destroy_loop_vec_info (inner_loop_vinfo, true);
1168 return NULL;
1169 }
1170
1171 /* Make sure there exists a single-predecessor exit bb: */
1172 if (!single_pred_p (single_exit (loop)->dest))
1173 {
1174 edge e = single_exit (loop);
1175 if (!(e->flags & EDGE_ABNORMAL))
1176 {
1177 split_loop_exit_edge (e);
1178 if (dump_enabled_p ())
1179 dump_printf (MSG_NOTE, "split exit edge.");
1180 }
1181 else
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "not vectorized: abnormal loop exit edge.");
1186 if (inner_loop_vinfo)
1187 destroy_loop_vec_info (inner_loop_vinfo, true);
1188 return NULL;
1189 }
1190 }
1191
1192 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1193 if (!loop_cond)
1194 {
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "not vectorized: complicated exit condition.");
1198 if (inner_loop_vinfo)
1199 destroy_loop_vec_info (inner_loop_vinfo, true);
1200 return NULL;
1201 }
1202
1203 if (!number_of_iterations)
1204 {
1205 if (dump_enabled_p ())
1206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207 "not vectorized: number of iterations cannot be "
1208 "computed.");
1209 if (inner_loop_vinfo)
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1212 }
1213
1214 if (chrec_contains_undetermined (number_of_iterations))
1215 {
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "Infinite number of iterations.");
1219 if (inner_loop_vinfo)
1220 destroy_loop_vec_info (inner_loop_vinfo, true);
1221 return NULL;
1222 }
1223
1224 if (!NITERS_KNOWN_P (number_of_iterations))
1225 {
1226 if (dump_enabled_p ())
1227 {
1228 dump_printf_loc (MSG_NOTE, vect_location,
1229 "Symbolic number of iterations is ");
1230 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1231 }
1232 }
1233 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1234 {
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: number of iterations = 0.");
1238 if (inner_loop_vinfo)
1239 destroy_loop_vec_info (inner_loop_vinfo, true);
1240 return NULL;
1241 }
1242
1243 loop_vinfo = new_loop_vec_info (loop);
1244 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1245 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1246
1247 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1248
 1249 /* CHECKME: May want to keep it around in the future. */
1250 if (inner_loop_vinfo)
1251 destroy_loop_vec_info (inner_loop_vinfo, false);
1252
1253 gcc_assert (!loop->aux);
1254 loop->aux = loop_vinfo;
1255 return loop_vinfo;
1256 }
1257
1258
1259 /* Function vect_analyze_loop_operations.
1260
1261 Scan the loop stmts and make sure they are all vectorizable. */
1262
1263 static bool
1264 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1265 {
1266 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1267 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1268 int nbbs = loop->num_nodes;
1269 gimple_stmt_iterator si;
1270 unsigned int vectorization_factor = 0;
1271 int i;
1272 gimple phi;
1273 stmt_vec_info stmt_info;
1274 bool need_to_vectorize = false;
1275 int min_profitable_iters;
1276 int min_scalar_loop_bound;
1277 unsigned int th;
1278 bool only_slp_in_loop = true, ok;
1279 HOST_WIDE_INT max_niter;
1280 HOST_WIDE_INT estimated_niter;
1281 int min_profitable_estimate;
1282
1283 if (dump_enabled_p ())
1284 dump_printf_loc (MSG_NOTE, vect_location,
1285 "=== vect_analyze_loop_operations ===");
1286
1287 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1288 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1289 if (slp)
1290 {
1291 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1292 vectorization factor of the loop is the unrolling factor required by
 1293 the SLP instances. If that unrolling factor is 1, we say that we
 1294 perform pure SLP on the loop - cross iteration parallelism is not
1295 exploited. */
1296 for (i = 0; i < nbbs; i++)
1297 {
1298 basic_block bb = bbs[i];
1299 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1300 {
1301 gimple stmt = gsi_stmt (si);
1302 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1303 gcc_assert (stmt_info);
1304 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1305 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1306 && !PURE_SLP_STMT (stmt_info))
1307 /* STMT needs both SLP and loop-based vectorization. */
1308 only_slp_in_loop = false;
1309 }
1310 }
1311
1312 if (only_slp_in_loop)
1313 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1314 else
1315 vectorization_factor = least_common_multiple (vectorization_factor,
1316 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1317
1318 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_NOTE, vect_location,
1321 "Updating vectorization factor to %d ",
1322 vectorization_factor);
1323 }
1324
1325 for (i = 0; i < nbbs; i++)
1326 {
1327 basic_block bb = bbs[i];
1328
1329 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1330 {
1331 phi = gsi_stmt (si);
1332 ok = true;
1333
1334 stmt_info = vinfo_for_stmt (phi);
1335 if (dump_enabled_p ())
1336 {
1337 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1338 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1339 }
1340
1341 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1342 (i.e., a phi in the tail of the outer-loop). */
1343 if (! is_loop_header_bb_p (bb))
1344 {
1345 /* FORNOW: we currently don't support the case that these phis
1346 are not used in the outerloop (unless it is double reduction,
 1347 i.e., this phi is vect_reduction_def), because this case
 1348 requires us to actually do something here. */
1349 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1350 || STMT_VINFO_LIVE_P (stmt_info))
1351 && STMT_VINFO_DEF_TYPE (stmt_info)
1352 != vect_double_reduction_def)
1353 {
1354 if (dump_enabled_p ())
1355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1356 "Unsupported loop-closed phi in "
1357 "outer-loop.");
1358 return false;
1359 }
1360
1361 /* If PHI is used in the outer loop, we check that its operand
1362 is defined in the inner loop. */
1363 if (STMT_VINFO_RELEVANT_P (stmt_info))
1364 {
1365 tree phi_op;
1366 gimple op_def_stmt;
1367
1368 if (gimple_phi_num_args (phi) != 1)
1369 return false;
1370
1371 phi_op = PHI_ARG_DEF (phi, 0);
1372 if (TREE_CODE (phi_op) != SSA_NAME)
1373 return false;
1374
1375 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1376 if (!op_def_stmt
1377 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1378 || !vinfo_for_stmt (op_def_stmt))
1379 return false;
1380
1381 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1382 != vect_used_in_outer
1383 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1384 != vect_used_in_outer_by_reduction)
1385 return false;
1386 }
1387
1388 continue;
1389 }
1390
1391 gcc_assert (stmt_info);
1392
1393 if (STMT_VINFO_LIVE_P (stmt_info))
1394 {
1395 /* FORNOW: not yet supported. */
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "not vectorized: value used after loop.");
1399 return false;
1400 }
1401
1402 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1403 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1404 {
1405 /* A scalar-dependence cycle that we don't support. */
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1408 "not vectorized: scalar dependence cycle.");
1409 return false;
1410 }
1411
1412 if (STMT_VINFO_RELEVANT_P (stmt_info))
1413 {
1414 need_to_vectorize = true;
1415 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1416 ok = vectorizable_induction (phi, NULL, NULL);
1417 }
1418
1419 if (!ok)
1420 {
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1424 "not vectorized: relevant phi not "
1425 "supported: ");
1426 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1427 }
1428 return false;
1429 }
1430 }
1431
1432 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1433 {
1434 gimple stmt = gsi_stmt (si);
1435 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1436 return false;
1437 }
1438 } /* bbs */
1439
 1440 /* All operations in the loop are either irrelevant (they deal with loop
 1441 control, or are dead), or only used outside the loop and can be moved
1442 out of the loop (e.g. invariants, inductions). The loop can be
1443 optimized away by scalar optimizations. We're better off not
1444 touching this loop. */
1445 if (!need_to_vectorize)
1446 {
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "All the computation can be taken out of the loop.");
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452 "not vectorized: redundant loop. no profit to "
1453 "vectorize.");
1454 return false;
1455 }
1456
1457 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "vectorization_factor = %d, niters = "
1460 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1461 LOOP_VINFO_INT_NITERS (loop_vinfo));
1462
1463 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1464 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1465 || ((max_niter = max_stmt_executions_int (loop)) != -1
1466 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1467 {
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "not vectorized: iteration count too small.");
1471 if (dump_enabled_p ())
1472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1473 "not vectorized: iteration count smaller than "
1474 "vectorization factor.");
1475 return false;
1476 }
1477
1478 /* Analyze cost. Decide if worth while to vectorize. */
1479
1480 /* Once VF is set, SLP costs should be updated since the number of created
1481 vector stmts depends on VF. */
1482 vect_update_slp_costs_according_to_vf (loop_vinfo);
1483
1484 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1485 &min_profitable_estimate);
1486 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1487
1488 if (min_profitable_iters < 0)
1489 {
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492 "not vectorized: vectorization not profitable.");
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495 "not vectorized: vector version will never be "
1496 "profitable.");
1497 return false;
1498 }
1499
1500 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1501 * vectorization_factor) - 1);
1502
1503
 1504 /* Use the cost model only if it is more conservative than the user-specified
1505 threshold. */
1506
1507 th = (unsigned) min_scalar_loop_bound;
1508 if (min_profitable_iters
1509 && (!min_scalar_loop_bound
1510 || min_profitable_iters > min_scalar_loop_bound))
1511 th = (unsigned) min_profitable_iters;
1512
1513 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1514 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1515 {
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518 "not vectorized: vectorization not profitable.");
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE, vect_location,
1521 "not vectorized: iteration count smaller than user "
1522 "specified loop bound parameter or minimum profitable "
1523 "iterations (whichever is more conservative).");
1524 return false;
1525 }
1526
1527 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1528 && ((unsigned HOST_WIDE_INT) estimated_niter
1529 <= MAX (th, (unsigned)min_profitable_estimate)))
1530 {
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: estimated iteration count too "
1534 "small.");
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_NOTE, vect_location,
1537 "not vectorized: estimated iteration count smaller "
1538 "than specified loop bound parameter or minimum "
1539 "profitable iterations (whichever is more "
1540 "conservative).");
1541 return false;
1542 }
1543
1544 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1546 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1547 {
1548 if (dump_enabled_p ())
1549 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1550 if (!vect_can_advance_ivs_p (loop_vinfo))
1551 {
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1554 "not vectorized: can't create epilog loop 1.");
1555 return false;
1556 }
1557 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1558 {
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561 "not vectorized: can't create epilog loop 2.");
1562 return false;
1563 }
1564 }
1565
1566 return true;
1567 }
1568
1569
1570 /* Function vect_analyze_loop_2.
1571
1572 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1573 for it. The different analyses will record information in the
1574 loop_vec_info struct. */
1575 static bool
1576 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1577 {
1578 bool ok, slp = false;
1579 int max_vf = MAX_VECTORIZATION_FACTOR;
1580 int min_vf = 2;
1581
1582 /* Find all data references in the loop (which correspond to vdefs/vuses)
1583 and analyze their evolution in the loop. Also adjust the minimal
1584 vectorization factor according to the loads and stores.
1585
 1586 FORNOW: Handle only simple array references whose
 1587 alignment can be forced, and aligned pointer-references. */
1588
1589 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1590 if (!ok)
1591 {
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1594 "bad data references.");
1595 return false;
1596 }
1597
1598 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1599 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1600
1601 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1602 if (!ok)
1603 {
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "bad data access.");
1607 return false;
1608 }
1609
1610 /* Classify all cross-iteration scalar data-flow cycles.
1611 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1612
1613 vect_analyze_scalar_cycles (loop_vinfo);
1614
1615 vect_pattern_recog (loop_vinfo, NULL);
1616
1617 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1618
1619 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1620 if (!ok)
1621 {
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "unexpected pattern.");
1625 return false;
1626 }
1627
1628 /* Analyze data dependences between the data-refs in the loop
1629 and adjust the maximum vectorization factor according to
1630 the dependences.
1631 FORNOW: fail at the first data dependence that we encounter. */
1632
1633 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1634 if (!ok
1635 || max_vf < min_vf)
1636 {
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "bad data dependence.");
1640 return false;
1641 }
1642
1643 ok = vect_determine_vectorization_factor (loop_vinfo);
1644 if (!ok)
1645 {
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648 "can't determine vectorization factor.");
1649 return false;
1650 }
1651 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1652 {
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "bad data dependence.");
1656 return false;
1657 }
1658
1659 /* Analyze the alignment of the data-refs in the loop.
1660 Fail if a data reference is found that cannot be vectorized. */
1661
1662 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1663 if (!ok)
1664 {
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "bad data alignment.");
1668 return false;
1669 }
1670
1671 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1672 It is important to call pruning after vect_analyze_data_ref_accesses,
1673 since we use grouping information gathered by interleaving analysis. */
1674 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1675 if (!ok)
1676 {
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "too long list of versioning for alias "
1680 "run-time tests.");
1681 return false;
1682 }
1683
1684 /* This pass will decide on using loop versioning and/or loop peeling in
1685 order to enhance the alignment of data references in the loop. */
1686
1687 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1688 if (!ok)
1689 {
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692 "bad data alignment.");
1693 return false;
1694 }
1695
1696 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1697 ok = vect_analyze_slp (loop_vinfo, NULL);
1698 if (ok)
1699 {
1700 /* Decide which possible SLP instances to SLP. */
1701 slp = vect_make_slp_decision (loop_vinfo);
1702
1703 /* Find stmts that need to be both vectorized and SLPed. */
1704 vect_detect_hybrid_slp (loop_vinfo);
1705 }
1706 else
1707 return false;
1708
1709 /* Scan all the operations in the loop and make sure they are
1710 vectorizable. */
1711
1712 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1713 if (!ok)
1714 {
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "bad operation or unsupported loop bound.");
1718 return false;
1719 }
1720
1721 return true;
1722 }
1723
1724 /* Function vect_analyze_loop.
1725
1726 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1727 for it. The different analyses will record information in the
1728 loop_vec_info struct. */
1729 loop_vec_info
1730 vect_analyze_loop (struct loop *loop)
1731 {
1732 loop_vec_info loop_vinfo;
1733 unsigned int vector_sizes;
1734
1735 /* Autodetect first vector size we try. */
1736 current_vector_size = 0;
1737 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1738
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "===== analyze_loop_nest =====");
1742
1743 if (loop_outer (loop)
1744 && loop_vec_info_for_loop (loop_outer (loop))
1745 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1746 {
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE, vect_location,
1749 "outer-loop already vectorized.");
1750 return NULL;
1751 }
1752
1753 while (1)
1754 {
1755 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1756 loop_vinfo = vect_analyze_loop_form (loop);
1757 if (!loop_vinfo)
1758 {
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "bad loop form.");
1762 return NULL;
1763 }
1764
1765 if (vect_analyze_loop_2 (loop_vinfo))
1766 {
1767 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1768
1769 return loop_vinfo;
1770 }
1771
1772 destroy_loop_vec_info (loop_vinfo, true);
1773
1774 vector_sizes &= ~current_vector_size;
1775 if (vector_sizes == 0
1776 || current_vector_size == 0)
1777 return NULL;
1778
1779 /* Try the next biggest vector size. */
1780 current_vector_size = 1 << floor_log2 (vector_sizes);
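      /* For example (illustrative numbers): if the target advertises
         32-byte and 16-byte vectors, vector_sizes starts as 0x30; after a
         failed 32-byte attempt the mask becomes 0x10 and we retry with
         current_vector_size == 16.  */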
1781 if (dump_enabled_p ())
1782 dump_printf_loc (MSG_NOTE, vect_location,
1783 "***** Re-trying analysis with "
1784 "vector size %d\n", current_vector_size);
1785 }
1786 }
1787
1788
1789 /* Function reduction_code_for_scalar_code
1790
1791 Input:
 1792 CODE - tree_code of a reduction operation.
1793
1794 Output:
1795 REDUC_CODE - the corresponding tree-code to be used to reduce the
1796 vector of partial results into a single scalar result (which
1797 will also reside in a vector) or ERROR_MARK if the operation is
 1798 a supported reduction operation, but does not have such a tree-code.
1799
 1800 Return FALSE if CODE currently cannot be vectorized as a reduction. */
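/* For example (illustrative): a sum reduction vectorized with V4SI
   produces a vector of partial sums, say {3, 1, 4, 2}; REDUC_PLUS_EXPR
   is then used in the epilogue to fold them into the scalar result 10.
   For codes such as MINUS_EXPR there is no single reduction tree-code,
   so ERROR_MARK is returned and the epilogue instead reduces the vector
   with a sequence of element extractions/shifts.  */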
1801
1802 static bool
1803 reduction_code_for_scalar_code (enum tree_code code,
1804 enum tree_code *reduc_code)
1805 {
1806 switch (code)
1807 {
1808 case MAX_EXPR:
1809 *reduc_code = REDUC_MAX_EXPR;
1810 return true;
1811
1812 case MIN_EXPR:
1813 *reduc_code = REDUC_MIN_EXPR;
1814 return true;
1815
1816 case PLUS_EXPR:
1817 *reduc_code = REDUC_PLUS_EXPR;
1818 return true;
1819
1820 case MULT_EXPR:
1821 case MINUS_EXPR:
1822 case BIT_IOR_EXPR:
1823 case BIT_XOR_EXPR:
1824 case BIT_AND_EXPR:
1825 *reduc_code = ERROR_MARK;
1826 return true;
1827
1828 default:
1829 return false;
1830 }
1831 }
1832
1833
1834 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1835 STMT is printed with a message MSG. */
1836
1837 static void
1838 report_vect_op (int msg_type, gimple stmt, const char *msg)
1839 {
1840 dump_printf_loc (msg_type, vect_location, "%s", msg);
1841 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1842 }
1843
1844
1845 /* Detect SLP reduction of the form:
1846
1847 #a1 = phi <a5, a0>
1848 a2 = operation (a1)
1849 a3 = operation (a2)
1850 a4 = operation (a3)
1851 a5 = operation (a4)
1852
1853 #a = phi <a5>
1854
1855 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1856 FIRST_STMT is the first reduction stmt in the chain
1857 (a2 = operation (a1)).
1858
1859 Return TRUE if a reduction chain was detected. */
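
   An example of a scalar source loop that gives rise to such a chain
   (illustrative only):

     int sum = 0;
     for (i = 0; i < n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   After gimplification the four additions become the statements a2..a5
   above, each one feeding the next, and only a5 reaches the reduction
   and loop-exit phis.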
1860
1861 static bool
1862 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1863 {
1864 struct loop *loop = (gimple_bb (phi))->loop_father;
1865 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1866 enum tree_code code;
1867 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1868 stmt_vec_info use_stmt_info, current_stmt_info;
1869 tree lhs;
1870 imm_use_iterator imm_iter;
1871 use_operand_p use_p;
1872 int nloop_uses, size = 0, n_out_of_loop_uses;
1873 bool found = false;
1874
1875 if (loop != vect_loop)
1876 return false;
1877
1878 lhs = PHI_RESULT (phi);
1879 code = gimple_assign_rhs_code (first_stmt);
1880 while (1)
1881 {
1882 nloop_uses = 0;
1883 n_out_of_loop_uses = 0;
1884 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1885 {
1886 gimple use_stmt = USE_STMT (use_p);
1887 if (is_gimple_debug (use_stmt))
1888 continue;
1889
1892 /* Check if we got back to the reduction phi. */
1893 if (use_stmt == phi)
1894 {
1895 loop_use_stmt = use_stmt;
1896 found = true;
1897 break;
1898 }
1899
1900 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1901 {
1902 if (vinfo_for_stmt (use_stmt)
1903 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1904 {
1905 loop_use_stmt = use_stmt;
1906 nloop_uses++;
1907 }
1908 }
1909 else
1910 n_out_of_loop_uses++;
1911
1912 /* There can be either a single use in the loop or two uses in
1913 phi nodes. */
1914 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1915 return false;
1916 }
1917
1918 if (found)
1919 break;
1920
1921 /* We reached a statement with no loop uses. */
1922 if (nloop_uses == 0)
1923 return false;
1924
1925 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1926 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1927 return false;
1928
1929 if (!is_gimple_assign (loop_use_stmt)
1930 || code != gimple_assign_rhs_code (loop_use_stmt)
1931 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1932 return false;
1933
1934 /* Insert USE_STMT into reduction chain. */
1935 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1936 if (current_stmt)
1937 {
1938 current_stmt_info = vinfo_for_stmt (current_stmt);
1939 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1940 GROUP_FIRST_ELEMENT (use_stmt_info)
1941 = GROUP_FIRST_ELEMENT (current_stmt_info);
1942 }
1943 else
1944 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1945
1946 lhs = gimple_assign_lhs (loop_use_stmt);
1947 current_stmt = loop_use_stmt;
1948 size++;
1949 }
1950
1951 if (!found || loop_use_stmt != phi || size < 2)
1952 return false;
1953
1954 /* Swap the operands, if needed, to make the reduction operand be the second
1955 operand. */
1956 lhs = PHI_RESULT (phi);
1957 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1958 while (next_stmt)
1959 {
1960 if (gimple_assign_rhs2 (next_stmt) == lhs)
1961 {
1962 tree op = gimple_assign_rhs1 (next_stmt);
1963 gimple def_stmt = NULL;
1964
1965 if (TREE_CODE (op) == SSA_NAME)
1966 def_stmt = SSA_NAME_DEF_STMT (op);
1967
1968 /* Check that the other def is either defined in the loop
1969 ("vect_internal_def"), or it's an induction (defined by a
1970 loop-header phi-node). */
1971 if (def_stmt
1972 && gimple_bb (def_stmt)
1973 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1974 && (is_gimple_assign (def_stmt)
1975 || is_gimple_call (def_stmt)
1976 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1977 == vect_induction_def
1978 || (gimple_code (def_stmt) == GIMPLE_PHI
1979 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1980 == vect_internal_def
1981 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1982 {
1983 lhs = gimple_assign_lhs (next_stmt);
1984 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1985 continue;
1986 }
1987
1988 return false;
1989 }
1990 else
1991 {
1992 tree op = gimple_assign_rhs2 (next_stmt);
1993 gimple def_stmt = NULL;
1994
1995 if (TREE_CODE (op) == SSA_NAME)
1996 def_stmt = SSA_NAME_DEF_STMT (op);
1997
1998 /* Check that the other def is either defined in the loop
1999 ("vect_internal_def"), or it's an induction (defined by a
2000 loop-header phi-node). */
2001 if (def_stmt
2002 && gimple_bb (def_stmt)
2003 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2004 && (is_gimple_assign (def_stmt)
2005 || is_gimple_call (def_stmt)
2006 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2007 == vect_induction_def
2008 || (gimple_code (def_stmt) == GIMPLE_PHI
2009 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2010 == vect_internal_def
2011 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2012 {
2013 if (dump_enabled_p ())
2014 {
2015 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2016 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2017 }
2018
2019 swap_tree_operands (next_stmt,
2020 gimple_assign_rhs1_ptr (next_stmt),
2021 gimple_assign_rhs2_ptr (next_stmt));
2022 update_stmt (next_stmt);
2023
2024 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2025 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2026 }
2027 else
2028 return false;
2029 }
2030
2031 lhs = gimple_assign_lhs (next_stmt);
2032 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2033 }
2034
2035 /* Save the chain for further analysis in SLP detection. */
2036 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2037 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2038 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2039
2040 return true;
2041 }
2042
2043
2044 /* Function vect_is_simple_reduction_1
2045
2046 (1) Detect a cross-iteration def-use cycle that represents a simple
2047 reduction computation. We look for the following pattern:
2048
2049 loop_header:
2050 a1 = phi < a0, a2 >
2051 a3 = ...
2052 a2 = operation (a3, a1)
2053
2054 such that:
2055 1. operation is commutative and associative and it is safe to
2056 change the order of the computation (if CHECK_REDUCTION is true)
2057 2. no uses for a2 in the loop (a2 is used out of the loop)
2058 3. no uses of a1 in the loop besides the reduction operation
2059 4. no uses of a1 outside the loop.
2060
2061 Conditions 1,4 are tested here.
2062 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2063
2064 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2065 nested cycles, if CHECK_REDUCTION is false.
2066
2067 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2068 reductions:
2069
2070 a1 = phi < a0, a2 >
2071 inner loop (def of a3)
2072 a2 = phi < a3 >
2073
2074 If MODIFY is true it also tries to rework the code in-place to enable
2075 detection of more reduction patterns. For the time being we rewrite
2076 "res -= RHS" into "res += -RHS" when it seems worthwhile.
2077 */
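
/* As an illustration of the MODIFY rewrite: for a loop doing "res -= a[i]"
   the statement

     res_2 = res_1 - a_elt;

   is reworked further below into

     neg_a = -a_elt;
     res_2 = res_1 + neg_a;

   so that the usual PLUS_EXPR reduction handling applies (the names here
   are illustrative only).  */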
2078
2079 static gimple
2080 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2081 bool check_reduction, bool *double_reduc,
2082 bool modify)
2083 {
2084 struct loop *loop = (gimple_bb (phi))->loop_father;
2085 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2086 edge latch_e = loop_latch_edge (loop);
2087 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2088 gimple def_stmt, def1 = NULL, def2 = NULL;
2089 enum tree_code orig_code, code;
2090 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2091 tree type;
2092 int nloop_uses;
2093 tree name;
2094 imm_use_iterator imm_iter;
2095 use_operand_p use_p;
2096 bool phi_def;
2097
2098 *double_reduc = false;
2099
2100 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2101 otherwise, we assume outer loop vectorization. */
2102 gcc_assert ((check_reduction && loop == vect_loop)
2103 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2104
2105 name = PHI_RESULT (phi);
2106 nloop_uses = 0;
2107 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2108 {
2109 gimple use_stmt = USE_STMT (use_p);
2110 if (is_gimple_debug (use_stmt))
2111 continue;
2112
2113 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2114 {
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "intermediate value used outside loop.");
2118
2119 return NULL;
2120 }
2121
2122 if (vinfo_for_stmt (use_stmt)
2123 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2124 nloop_uses++;
2125 if (nloop_uses > 1)
2126 {
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "reduction used in loop.");
2130 return NULL;
2131 }
2132 }
2133
2134 if (TREE_CODE (loop_arg) != SSA_NAME)
2135 {
2136 if (dump_enabled_p ())
2137 {
2138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139 "reduction: not ssa_name: ");
2140 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2141 }
2142 return NULL;
2143 }
2144
2145 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2146 if (!def_stmt)
2147 {
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "reduction: no def_stmt.");
2151 return NULL;
2152 }
2153
2154 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2155 {
2156 if (dump_enabled_p ())
2157 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2158 return NULL;
2159 }
2160
2161 if (is_gimple_assign (def_stmt))
2162 {
2163 name = gimple_assign_lhs (def_stmt);
2164 phi_def = false;
2165 }
2166 else
2167 {
2168 name = PHI_RESULT (def_stmt);
2169 phi_def = true;
2170 }
2171
2172 nloop_uses = 0;
2173 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2174 {
2175 gimple use_stmt = USE_STMT (use_p);
2176 if (is_gimple_debug (use_stmt))
2177 continue;
2178 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2179 && vinfo_for_stmt (use_stmt)
2180 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2181 nloop_uses++;
2182 if (nloop_uses > 1)
2183 {
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186 "reduction used in loop.");
2187 return NULL;
2188 }
2189 }
2190
2191 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2192 defined in the inner loop. */
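  /* An illustrative double-reduction source:

       sum = 0;
       for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
           sum += a[i][j];

     Here DEF_STMT is the loop-closed phi of the inner loop's partial sum,
     and the cycle is recorded as a double reduction when the outer loop
     is being vectorized.  */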
2193 if (phi_def)
2194 {
2195 op1 = PHI_ARG_DEF (def_stmt, 0);
2196
2197 if (gimple_phi_num_args (def_stmt) != 1
2198 || TREE_CODE (op1) != SSA_NAME)
2199 {
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202 "unsupported phi node definition.");
2203
2204 return NULL;
2205 }
2206
2207 def1 = SSA_NAME_DEF_STMT (op1);
2208 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2209 && loop->inner
2210 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2211 && is_gimple_assign (def1))
2212 {
2213 if (dump_enabled_p ())
2214 report_vect_op (MSG_NOTE, def_stmt,
2215 "detected double reduction: ");
2216
2217 *double_reduc = true;
2218 return def_stmt;
2219 }
2220
2221 return NULL;
2222 }
2223
2224 code = orig_code = gimple_assign_rhs_code (def_stmt);
2225
2226 /* We can handle "res -= x[i]", which is non-associative, by
2227 simply rewriting it into "res += -x[i]". Avoid changing the
2228 gimple instruction for the first simple tests and only do this
2229 if we're allowed to change code at all. */
2230 if (code == MINUS_EXPR
2231 && modify
2232 && (op1 = gimple_assign_rhs1 (def_stmt))
2233 && TREE_CODE (op1) == SSA_NAME
2234 && SSA_NAME_DEF_STMT (op1) == phi)
2235 code = PLUS_EXPR;
2236
2237 if (check_reduction
2238 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2239 {
2240 if (dump_enabled_p ())
2241 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2242 "reduction: not commutative/associative: ");
2243 return NULL;
2244 }
2245
2246 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2247 {
2248 if (code != COND_EXPR)
2249 {
2250 if (dump_enabled_p ())
2251 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2252 "reduction: not binary operation: ");
2253
2254 return NULL;
2255 }
2256
2257 op3 = gimple_assign_rhs1 (def_stmt);
2258 if (COMPARISON_CLASS_P (op3))
2259 {
2260 op4 = TREE_OPERAND (op3, 1);
2261 op3 = TREE_OPERAND (op3, 0);
2262 }
2263
2264 op1 = gimple_assign_rhs2 (def_stmt);
2265 op2 = gimple_assign_rhs3 (def_stmt);
2266
2267 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2268 {
2269 if (dump_enabled_p ())
2270 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2271 "reduction: uses not ssa_names: ");
2272
2273 return NULL;
2274 }
2275 }
2276 else
2277 {
2278 op1 = gimple_assign_rhs1 (def_stmt);
2279 op2 = gimple_assign_rhs2 (def_stmt);
2280
2281 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2282 {
2283 if (dump_enabled_p ())
2284 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2285 "reduction: uses not ssa_names: ");
2286
2287 return NULL;
2288 }
2289 }
2290
2291 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2292 if ((TREE_CODE (op1) == SSA_NAME
2293 && !types_compatible_p (type, TREE_TYPE (op1)))
2294 || (TREE_CODE (op2) == SSA_NAME
2295 && !types_compatible_p (type, TREE_TYPE (op2)))
2296 || (op3 && TREE_CODE (op3) == SSA_NAME
2297 && !types_compatible_p (type, TREE_TYPE (op3)))
2298 || (op4 && TREE_CODE (op4) == SSA_NAME
2299 && !types_compatible_p (type, TREE_TYPE (op4))))
2300 {
2301 if (dump_enabled_p ())
2302 {
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "reduction: multiple types: operation type: ");
2305 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2306 dump_printf (MSG_NOTE, ", operands types: ");
2307 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2308 TREE_TYPE (op1));
2309 dump_printf (MSG_NOTE, ",");
2310 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2311 TREE_TYPE (op2));
2312 if (op3)
2313 {
2314 dump_printf (MSG_NOTE, ",");
2315 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2316 TREE_TYPE (op3));
2317 }
2318
2319 if (op4)
2320 {
2321 dump_printf (MSG_NOTE, ",");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323 TREE_TYPE (op4));
2324 }
2325 }
2326
2327 return NULL;
2328 }
2329
2330 /* Check that it's ok to change the order of the computation.
2331 Generally, when vectorizing a reduction we change the order of the
2332 computation. This may change the behavior of the program in some
2333 cases, so we need to check that this is ok. One exception is when
2334 vectorizing an outer-loop: the inner-loop is executed sequentially,
2335 and therefore vectorizing reductions in the inner-loop during
2336 outer-loop vectorization is safe. */
2337
2338 /* CHECKME: check for !flag_finite_math_only too? */
2339 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2340 && check_reduction)
2341 {
2342 /* Changing the order of operations changes the semantics. */
2343 if (dump_enabled_p ())
2344 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2345 "reduction: unsafe fp math optimization: ");
2346 return NULL;
2347 }
2348 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2349 && check_reduction)
2350 {
2351 /* Changing the order of operations changes the semantics. */
2352 if (dump_enabled_p ())
2353 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2354 "reduction: unsafe int math optimization: ");
2355 return NULL;
2356 }
2357 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2358 {
2359 /* Changing the order of operations changes the semantics. */
2360 if (dump_enabled_p ())
2361 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2362 "reduction: unsafe fixed-point math optimization: ");
2363 return NULL;
2364 }
2365
2366 /* If we detected "res -= x[i]" earlier, rewrite it into
2367 "res += -x[i]" now. If this turns out to be useless reassoc
2368 will clean it up again. */
2369 if (orig_code == MINUS_EXPR)
2370 {
2371 tree rhs = gimple_assign_rhs2 (def_stmt);
2372 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2373 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2374 rhs, NULL);
2375 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2376 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2377 loop_info, NULL));
2378 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2379 gimple_assign_set_rhs2 (def_stmt, negrhs);
2380 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2381 update_stmt (def_stmt);
2382 }
2383
2384 /* Reduction is safe. We're dealing with one of the following:
2385 1) integer arithmetic and no trapv
2386 2) floating point arithmetic, and special flags permit this optimization
2387 3) nested cycle (i.e., outer loop vectorization). */
2388 if (TREE_CODE (op1) == SSA_NAME)
2389 def1 = SSA_NAME_DEF_STMT (op1);
2390
2391 if (TREE_CODE (op2) == SSA_NAME)
2392 def2 = SSA_NAME_DEF_STMT (op2);
2393
2394 if (code != COND_EXPR
2395 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2396 {
2397 if (dump_enabled_p ())
2398 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2399 return NULL;
2400 }
2401
2402 /* Check that one def is the reduction def, defined by PHI,
2403 the other def is either defined in the loop ("vect_internal_def"),
2404 or it's an induction (defined by a loop-header phi-node). */
2405
2406 if (def2 && def2 == phi
2407 && (code == COND_EXPR
2408 || !def1 || gimple_nop_p (def1)
2409 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2410 && (is_gimple_assign (def1)
2411 || is_gimple_call (def1)
2412 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2413 == vect_induction_def
2414 || (gimple_code (def1) == GIMPLE_PHI
2415 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2416 == vect_internal_def
2417 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2418 {
2419 if (dump_enabled_p ())
2420 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2421 return def_stmt;
2422 }
2423
2424 if (def1 && def1 == phi
2425 && (code == COND_EXPR
2426 || !def2 || gimple_nop_p (def2)
2427 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2428 && (is_gimple_assign (def2)
2429 || is_gimple_call (def2)
2430 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2431 == vect_induction_def
2432 || (gimple_code (def2) == GIMPLE_PHI
2433 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2434 == vect_internal_def
2435 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2436 {
2437 if (check_reduction)
2438 {
2439 /* Swap operands (just for simplicity - so that the rest of the code
2440 can assume that the reduction variable is always the last (second)
2441 argument). */
2442 if (dump_enabled_p ())
2443 report_vect_op (MSG_NOTE, def_stmt,
2444 "detected reduction: need to swap operands: ");
2445
2446 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2447 gimple_assign_rhs2_ptr (def_stmt));
2448
2449 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2450 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2451 }
2452 else
2453 {
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2456 }
2457
2458 return def_stmt;
2459 }
2460
2461 /* Try to find SLP reduction chain. */
2462 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2463 {
2464 if (dump_enabled_p ())
2465 report_vect_op (MSG_NOTE, def_stmt,
2466 "reduction: detected reduction chain: ");
2467
2468 return def_stmt;
2469 }
2470
2471 if (dump_enabled_p ())
2472 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2473 "reduction: unknown pattern: ");
2474
2475 return NULL;
2476 }
2477
2478 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2479 in-place. Arguments as there. */
2480
2481 static gimple
2482 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2483 bool check_reduction, bool *double_reduc)
2484 {
2485 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2486 double_reduc, false);
2487 }
2488
2489 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2490 in-place if it enables detection of more reductions. Arguments
2491 as there. */
2492
2493 gimple
2494 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2495 bool check_reduction, bool *double_reduc)
2496 {
2497 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498 double_reduc, true);
2499 }
2500
2501 /* Calculate the cost of one scalar iteration of the loop. */
2502 int
2503 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2504 {
2505 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2506 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2507 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2508 int innerloop_iters, i, stmt_cost;
2509
2510 /* Count statements in scalar loop. Using this as scalar cost for a single
2511 iteration for now.
2512
2513 TODO: Add outer loop support.
2514
2515 TODO: Consider assigning different costs to different scalar
2516 statements. */
2517
2518 /* FORNOW. */
2519 innerloop_iters = 1;
2520 if (loop->inner)
2521 innerloop_iters = 50; /* FIXME */
2522
2523 for (i = 0; i < nbbs; i++)
2524 {
2525 gimple_stmt_iterator si;
2526 basic_block bb = bbs[i];
2527
2528 if (bb->loop_father == loop->inner)
2529 factor = innerloop_iters;
2530 else
2531 factor = 1;
2532
2533 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2534 {
2535 gimple stmt = gsi_stmt (si);
2536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2537
2538 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2539 continue;
2540
2541 /* Skip stmts that are not vectorized inside the loop. */
2542 if (stmt_info
2543 && !STMT_VINFO_RELEVANT_P (stmt_info)
2544 && (!STMT_VINFO_LIVE_P (stmt_info)
2545 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2546 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2547 continue;
2548
2549 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2550 {
2551 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2552 stmt_cost = vect_get_stmt_cost (scalar_load);
2553 else
2554 stmt_cost = vect_get_stmt_cost (scalar_store);
2555 }
2556 else
2557 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2558
2559 scalar_single_iter_cost += stmt_cost * factor;
2560 }
2561 }
2562 return scalar_single_iter_cost;
2563 }
2564
2565 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2566 int
2567 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2568 int *peel_iters_epilogue,
2569 int scalar_single_iter_cost,
2570 stmt_vector_for_cost *prologue_cost_vec,
2571 stmt_vector_for_cost *epilogue_cost_vec)
2572 {
2573 int retval = 0;
2574 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2575
2576 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2577 {
2578 *peel_iters_epilogue = vf/2;
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "cost model: epilogue peel iters set to vf/2 "
2582 "because loop iterations are unknown .");
2583
2584 /* If peeled iterations are known but the number of scalar loop
2585 iterations is unknown, count a taken branch per peeled loop. */
2586 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2587 NULL, 0, vect_prologue);
2588 }
2589 else
2590 {
2591 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2592 peel_iters_prologue = niters < peel_iters_prologue ?
2593 niters : peel_iters_prologue;
2594 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2595 /* If we need to peel for gaps, but no peeling is required, we have to
2596 peel VF iterations. */
2597 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2598 *peel_iters_epilogue = vf;
2599 }
2600
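  /* Worked example with illustrative numbers: for niters == 100,
     peel_iters_prologue == 3 and vf == 4, the epilogue gets
     (100 - 3) % 4 == 1 peeled iteration; had the remainder been 0 while
     peeling for gaps is required, a full vf == 4 iterations would be
     peeled instead.  */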
2601 if (peel_iters_prologue)
2602 retval += record_stmt_cost (prologue_cost_vec,
2603 peel_iters_prologue * scalar_single_iter_cost,
2604 scalar_stmt, NULL, 0, vect_prologue);
2605 if (*peel_iters_epilogue)
2606 retval += record_stmt_cost (epilogue_cost_vec,
2607 *peel_iters_epilogue * scalar_single_iter_cost,
2608 scalar_stmt, NULL, 0, vect_epilogue);
2609 return retval;
2610 }
2611
2612 /* Function vect_estimate_min_profitable_iters
2613
2614 Return the number of iterations required for the vector version of the
2615 loop to be profitable relative to the cost of the scalar version of the
2616 loop. */
2617
2618 static void
2619 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2620 int *ret_min_profitable_niters,
2621 int *ret_min_profitable_estimate)
2622 {
2623 int min_profitable_iters;
2624 int min_profitable_estimate;
2625 int peel_iters_prologue;
2626 int peel_iters_epilogue;
2627 unsigned vec_inside_cost = 0;
2628 int vec_outside_cost = 0;
2629 unsigned vec_prologue_cost = 0;
2630 unsigned vec_epilogue_cost = 0;
2631 int scalar_single_iter_cost = 0;
2632 int scalar_outside_cost = 0;
2633 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2634 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2635 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2636
2637 /* Cost model disabled. */
2638 if (!flag_vect_cost_model)
2639 {
2640 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2641 *ret_min_profitable_niters = 0;
2642 *ret_min_profitable_estimate = 0;
2643 return;
2644 }
2645
2646 /* Requires loop versioning tests to handle misalignment. */
2647 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2648 {
2649 /* FIXME: Make cost depend on complexity of individual check. */
2650 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2651 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2652 vect_prologue);
2653 dump_printf (MSG_NOTE,
2654 "cost model: Adding cost of checks for loop "
2655 "versioning to treat misalignment.\n");
2656 }
2657
2658 /* Requires loop versioning with alias checks. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2660 {
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2663 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2664 vect_prologue);
2665 dump_printf (MSG_NOTE,
2666 "cost model: Adding cost of checks for loop "
2667 "versioning aliasing.\n");
2668 }
2669
2670 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2671 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2672 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2673 vect_prologue);
2674
2675 /* Count statements in scalar loop. Using this as scalar cost for a single
2676 iteration for now.
2677
2678 TODO: Add outer loop support.
2679
2680 TODO: Consider assigning different costs to different scalar
2681 statements. */
2682
2683 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2684
2685 /* Add additional cost for the peeled instructions in prologue and epilogue
2686 loop.
2687
2688 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2689 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2690
2691 TODO: Build an expression that represents peel_iters for prologue and
2692 epilogue to be used in a run-time test. */
2693
2694 if (npeel < 0)
2695 {
2696 peel_iters_prologue = vf/2;
2697 dump_printf (MSG_NOTE, "cost model: "
2698 "prologue peel iters set to vf/2.");
2699
2700 /* If peeling for alignment is unknown, the loop bound of the main loop
2701 becomes unknown. */
2702 peel_iters_epilogue = vf/2;
2703 dump_printf (MSG_NOTE, "cost model: "
2704 "epilogue peel iters set to vf/2 because "
2705 "peeling for alignment is unknown.");
2706
2707 /* If peeled iterations are unknown, count a taken branch and a not taken
2708 branch per peeled loop. Even if scalar loop iterations are known,
2709 vector iterations are not known since peeled prologue iterations are
2710 not known. Hence guards remain the same. */
2711 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2712 NULL, 0, vect_prologue);
2713 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2714 NULL, 0, vect_prologue);
2715 /* FORNOW: Don't attempt to pass individual scalar instructions to
2716 the model; just assume linear cost for scalar iterations. */
2717 (void) add_stmt_cost (target_cost_data,
2718 peel_iters_prologue * scalar_single_iter_cost,
2719 scalar_stmt, NULL, 0, vect_prologue);
2720 (void) add_stmt_cost (target_cost_data,
2721 peel_iters_epilogue * scalar_single_iter_cost,
2722 scalar_stmt, NULL, 0, vect_epilogue);
2723 }
2724 else
2725 {
2726 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2727 stmt_info_for_cost *si;
2728 int j;
2729 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2730
2731 prologue_cost_vec.create (2);
2732 epilogue_cost_vec.create (2);
2733 peel_iters_prologue = npeel;
2734
2735 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2736 &peel_iters_epilogue,
2737 scalar_single_iter_cost,
2738 &prologue_cost_vec,
2739 &epilogue_cost_vec);
2740
2741 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2742 {
2743 struct _stmt_vec_info *stmt_info
2744 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2745 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2746 si->misalign, vect_prologue);
2747 }
2748
2749 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2750 {
2751 struct _stmt_vec_info *stmt_info
2752 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2753 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2754 si->misalign, vect_epilogue);
2755 }
2756
2757 prologue_cost_vec.release ();
2758 epilogue_cost_vec.release ();
2759 }
2760
2761 /* FORNOW: The scalar outside cost is incremented in one of the
2762 following ways:
2763
2764 1. The vectorizer checks for alignment and aliasing and generates
2765 a condition that allows dynamic vectorization. A cost model
2766 check is ANDED with the versioning condition. Hence scalar code
2767 path now has the added cost of the versioning check.
2768
2769 if (cost > th & versioning_check)
2770 jmp to vector code
2771
2772 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2773
2774 2. The vectorizer then checks if a prologue is required. If the
2775 cost model check was not done before during versioning, it has to
2776 be done before the prologue check.
2777
2778 if (cost <= th)
2779 prologue = scalar_iters
2780 if (prologue == 0)
2781 jmp to vector code
2782 else
2783 execute prologue
2784 if (prologue == num_iters)
2785 go to exit
2786
2787 Hence the run-time scalar cost is incremented by a taken branch,
2788 plus a not-taken branch, plus a taken branch cost.
2789
2790 3. The vectorizer then checks if an epilogue is required. If the
2791 cost model check was not done before during prologue check, it
2792 has to be done with the epilogue check.
2793
2794 if (prologue == 0)
2795 jmp to vector code
2796 else
2797 execute prologue
2798 if (prologue == num_iters)
2799 go to exit
2800 vector code:
2801 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2802 jmp to epilogue
2803
2804 Hence the run-time scalar cost should be incremented by 2 taken
2805 branches.
2806
2807 TODO: The back end may reorder the BBS's differently and reverse
2808 conditions/branch directions. Change the estimates below to
2809 something more reasonable. */
2810
2811 /* If the number of iterations is known and we do not do versioning, we can
2812 decide whether to vectorize at compile time. Hence the scalar version
2813 does not carry cost model guard costs. */
2814 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2815 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2816 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2817 {
2818 /* Cost model check occurs at versioning. */
2819 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2820 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2821 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2822 else
2823 {
2824 /* Cost model check occurs at prologue generation. */
2825 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2826 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2827 + vect_get_stmt_cost (cond_branch_not_taken);
2828 /* Cost model check occurs at epilogue generation. */
2829 else
2830 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2831 }
2832 }
2833
2834 /* Complete the target-specific cost calculations. */
2835 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2836 &vec_inside_cost, &vec_epilogue_cost);
2837
2838 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2839
2840 /* Calculate number of iterations required to make the vector version
2841 profitable, relative to the loop bodies only. The following condition
2842 must hold true:
2843 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2844 where
2845 SIC = scalar iteration cost, VIC = vector iteration cost,
2846 VOC = vector outside cost, VF = vectorization factor,
2847 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2848 SOC = scalar outside cost for run time cost model check. */
2849
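  /* Worked example, with illustrative costs only: for SIC = 4, VIC = 6,
     VOC = 24, SOC = 6, VF = 4 and PL_ITERS = EP_ITERS = 2 the division
     below gives ((24 - 6) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6) = 48 / 10 = 4,
     and the follow-up check bumps this to 5 because
     4 * 4 * 4 <= 6 * 4 + (24 - 6) * 4 still holds.  */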
2850 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2851 {
2852 if (vec_outside_cost <= 0)
2853 min_profitable_iters = 1;
2854 else
2855 {
2856 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2857 - vec_inside_cost * peel_iters_prologue
2858 - vec_inside_cost * peel_iters_epilogue)
2859 / ((scalar_single_iter_cost * vf)
2860 - vec_inside_cost);
2861
2862 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2863 <= (((int) vec_inside_cost * min_profitable_iters)
2864 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2865 min_profitable_iters++;
2866 }
2867 }
2868 /* vector version will never be profitable. */
2869 else
2870 {
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2873 "cost model: the vector iteration cost = %d "
2874 "divided by the scalar iteration cost = %d "
2875 "is greater or equal to the vectorization factor = %d.",
2876 vec_inside_cost, scalar_single_iter_cost, vf);
2877 *ret_min_profitable_niters = -1;
2878 *ret_min_profitable_estimate = -1;
2879 return;
2880 }
2881
2882 if (dump_enabled_p ())
2883 {
2884 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2885 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2886 vec_inside_cost);
2887 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2888 vec_prologue_cost);
2889 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2890 vec_epilogue_cost);
2891 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2892 scalar_single_iter_cost);
2893 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2894 scalar_outside_cost);
2895 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2896 vec_outside_cost);
2897 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2898 peel_iters_prologue);
2899 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2900 peel_iters_epilogue);
2901 dump_printf (MSG_NOTE,
2902 " Calculated minimum iters for profitability: %d\n",
2903 min_profitable_iters);
2904 }
2905
2906 min_profitable_iters =
2907 min_profitable_iters < vf ? vf : min_profitable_iters;
2908
2909 /* Because the condition we create is:
2910 if (niters <= min_profitable_iters)
2911 then skip the vectorized loop. */
2912 min_profitable_iters--;
2913
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_NOTE, vect_location,
2916 " Runtime profitability threshold = %d\n", min_profitable_iters);
2917
2918 *ret_min_profitable_niters = min_profitable_iters;
2919
2920 /* Calculate number of iterations required to make the vector version
2921 profitable, relative to the loop bodies only.
2922
2923 Non-vectorized variant is SIC * niters and it must win over vector
2924 variant on the expected loop trip count. The following condition must hold true:
2925 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2926
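  /* With the same illustrative costs as in the example above, this gives
     ((24 + 6) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6) = 96 / 10 = 9; the
     decrement and the MAX against the runtime threshold below leave an
     estimate of 8.  */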
2927 if (vec_outside_cost <= 0)
2928 min_profitable_estimate = 1;
2929 else
2930 {
2931 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2932 - vec_inside_cost * peel_iters_prologue
2933 - vec_inside_cost * peel_iters_epilogue)
2934 / ((scalar_single_iter_cost * vf)
2935 - vec_inside_cost);
2936 }
2937 min_profitable_estimate--;
2938 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_NOTE, vect_location,
2941 " Static estimate profitability threshold = %d\n",
2942 min_profitable_estimate);
2943
2944 *ret_min_profitable_estimate = min_profitable_estimate;
2945 }
2946
2947
2948 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2949 functions. Design better to avoid maintenance issues. */
2950
2951 /* Function vect_model_reduction_cost.
2952
2953 Models cost for a reduction operation, including the vector ops
2954 generated within the strip-mine loop, the initial definition before
2955 the loop, and the epilogue code that must be generated. */
2956
2957 static bool
2958 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2959 int ncopies)
2960 {
2961 int prologue_cost = 0, epilogue_cost = 0;
2962 enum tree_code code;
2963 optab optab;
2964 tree vectype;
2965 gimple stmt, orig_stmt;
2966 tree reduction_op;
2967 enum machine_mode mode;
2968 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2969 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2970 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2971
2972 /* Cost of reduction op inside loop. */
2973 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2974 stmt_info, 0, vect_body);
2975 stmt = STMT_VINFO_STMT (stmt_info);
2976
2977 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2978 {
2979 case GIMPLE_SINGLE_RHS:
2980 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2981 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2982 break;
2983 case GIMPLE_UNARY_RHS:
2984 reduction_op = gimple_assign_rhs1 (stmt);
2985 break;
2986 case GIMPLE_BINARY_RHS:
2987 reduction_op = gimple_assign_rhs2 (stmt);
2988 break;
2989 case GIMPLE_TERNARY_RHS:
2990 reduction_op = gimple_assign_rhs3 (stmt);
2991 break;
2992 default:
2993 gcc_unreachable ();
2994 }
2995
2996 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2997 if (!vectype)
2998 {
2999 if (dump_enabled_p ())
3000 {
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "unsupported data-type ");
3003 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3004 TREE_TYPE (reduction_op));
3005 }
3006 return false;
3007 }
3008
3009 mode = TYPE_MODE (vectype);
3010 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3011
3012 if (!orig_stmt)
3013 orig_stmt = STMT_VINFO_STMT (stmt_info);
3014
3015 code = gimple_assign_rhs_code (orig_stmt);
3016
3017 /* Add in cost for initial definition. */
3018 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3019 stmt_info, 0, vect_prologue);
3020
3021 /* Determine cost of epilogue code.
3022
3023 We have a reduction operator that will reduce the vector in one statement.
3024 Also requires scalar extract. */
3025
3026 if (!nested_in_vect_loop_p (loop, orig_stmt))
3027 {
3028 if (reduc_code != ERROR_MARK)
3029 {
3030 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3031 stmt_info, 0, vect_epilogue);
3032 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3033 stmt_info, 0, vect_epilogue);
3034 }
3035 else
3036 {
3037 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3038 tree bitsize =
3039 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3040 int element_bitsize = tree_low_cst (bitsize, 1);
3041 int nelements = vec_size_in_bits / element_bitsize;
3042
3043 optab = optab_for_tree_code (code, vectype, optab_default);
3044
3045 /* We have a whole vector shift available. */
3046 if (VECTOR_MODE_P (mode)
3047 && optab_handler (optab, mode) != CODE_FOR_nothing
3048 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3049 {
3050 /* Final reduction via vector shifts and the reduction operator.
3051 Also requires scalar extract. */
3052 epilogue_cost += add_stmt_cost (target_cost_data,
3053 exact_log2 (nelements) * 2,
3054 vector_stmt, stmt_info, 0,
3055 vect_epilogue);
3056 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3057 vec_to_scalar, stmt_info, 0,
3058 vect_epilogue);
3059 }
3060 else
3061 /* Use extracts and reduction op for final reduction. For N
3062 elements, we have N extracts and N-1 reduction ops. */
3063 epilogue_cost += add_stmt_cost (target_cost_data,
3064 nelements + nelements - 1,
3065 vector_stmt, stmt_info, 0,
3066 vect_epilogue);
3067 }
3068 }
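
  /* For example (illustrative), with four elements per vector and no
     REDUC_* pattern available, the shift-based variant above costs
     exact_log2 (4) * 2 == 4 vector statements plus one vec_to_scalar,
     whereas the extract-based fallback costs 4 + 3 == 7 vector
     statements.  */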
3069
3070 if (dump_enabled_p ())
3071 dump_printf (MSG_NOTE,
3072 "vect_model_reduction_cost: inside_cost = %d, "
3073 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3074 prologue_cost, epilogue_cost);
3075
3076 return true;
3077 }
3078
3079
3080 /* Function vect_model_induction_cost.
3081
3082 Models cost for induction operations. */
3083
3084 static void
3085 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3086 {
3087 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3088 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3089 unsigned inside_cost, prologue_cost;
3090
3091 /* loop cost for vec_loop. */
3092 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3093 stmt_info, 0, vect_body);
3094
3095 /* prologue cost for vec_init and vec_step. */
3096 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3097 stmt_info, 0, vect_prologue);
3098
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "vect_model_induction_cost: inside_cost = %d, "
3102 "prologue_cost = %d .", inside_cost, prologue_cost);
3103 }
3104
3105
3106 /* Function get_initial_def_for_induction
3107
3108 Input:
3109 STMT - a stmt that performs an induction operation in the loop.
3110 IV_PHI - the initial value of the induction variable
3111
3112 Output:
3113 Return a vector variable, initialized with the first VF values of
3114 the induction variable. E.g., for an iv with IV_PHI='X' and
3115 evolution S, for a vector of 4 units, we want to return:
3116 [X, X + S, X + 2*S, X + 3*S]. */
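
   Concretely, with illustrative values X == 5, S == 3 and a 4-unit vector,
   the returned vector is {5, 8, 11, 14}; the loop body then advances it by
   the step vector {12, 12, 12, 12} (VF * S, built further below), giving
   {17, 20, 23, 26} for the next vector iteration.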
3117
3118 static tree
3119 get_initial_def_for_induction (gimple iv_phi)
3120 {
3121 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3122 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3124 tree scalar_type;
3125 tree vectype;
3126 int nunits;
3127 edge pe = loop_preheader_edge (loop);
3128 struct loop *iv_loop;
3129 basic_block new_bb;
3130 tree new_vec, vec_init, vec_step, t;
3131 tree access_fn;
3132 tree new_var;
3133 tree new_name;
3134 gimple init_stmt, induction_phi, new_stmt;
3135 tree induc_def, vec_def, vec_dest;
3136 tree init_expr, step_expr;
3137 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3138 int i;
3139 bool ok;
3140 int ncopies;
3141 tree expr;
3142 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3143 bool nested_in_vect_loop = false;
3144 gimple_seq stmts = NULL;
3145 imm_use_iterator imm_iter;
3146 use_operand_p use_p;
3147 gimple exit_phi;
3148 edge latch_e;
3149 tree loop_arg;
3150 gimple_stmt_iterator si;
3151 basic_block bb = gimple_bb (iv_phi);
3152 tree stepvectype;
3153 tree resvectype;
3154
3155 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3156 if (nested_in_vect_loop_p (loop, iv_phi))
3157 {
3158 nested_in_vect_loop = true;
3159 iv_loop = loop->inner;
3160 }
3161 else
3162 iv_loop = loop;
3163 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3164
3165 latch_e = loop_latch_edge (iv_loop);
3166 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3167
3168 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3169 gcc_assert (access_fn);
3170 STRIP_NOPS (access_fn);
3171 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3172 &init_expr, &step_expr);
3173 gcc_assert (ok);
3174 pe = loop_preheader_edge (iv_loop);
3175
3176 scalar_type = TREE_TYPE (init_expr);
3177 vectype = get_vectype_for_scalar_type (scalar_type);
3178 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3179 gcc_assert (vectype);
3180 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3181 ncopies = vf / nunits;
3182
3183 gcc_assert (phi_info);
3184 gcc_assert (ncopies >= 1);
3185
3186 /* Find the first insertion point in the BB. */
3187 si = gsi_after_labels (bb);
3188
3189 /* Create the vector that holds the initial_value of the induction. */
3190 if (nested_in_vect_loop)
3191 {
3192 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3193 been created during vectorization of previous stmts. We obtain it
3194 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3195 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3196 loop_preheader_edge (iv_loop));
3197 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3198 /* If the initial value is not of proper type, convert it. */
3199 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3200 {
3201 new_stmt = gimple_build_assign_with_ops
3202 (VIEW_CONVERT_EXPR,
3203 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3204 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3205 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3206 gimple_assign_set_lhs (new_stmt, vec_init);
3207 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3208 new_stmt);
3209 gcc_assert (!new_bb);
3210 set_vinfo_for_stmt (new_stmt,
3211 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3212 }
3213 }
3214 else
3215 {
3216 vec<constructor_elt, va_gc> *v;
3217
3218 /* iv_loop is the loop to be vectorized. Create:
3219 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3220 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3221 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3222 if (stmts)
3223 {
3224 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3225 gcc_assert (!new_bb);
3226 }
3227
3228 vec_alloc (v, nunits);
3229 bool constant_p = is_gimple_min_invariant (new_name);
3230 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3231 for (i = 1; i < nunits; i++)
3232 {
3233 /* Create: new_name_i = new_name + step_expr */
3234 enum tree_code code = POINTER_TYPE_P (scalar_type)
3235 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3236 new_name = fold_build2 (code, scalar_type, new_name, step_expr);
3237 if (!is_gimple_min_invariant (new_name))
3238 {
3239 init_stmt = gimple_build_assign (new_var, new_name);
3240 new_name = make_ssa_name (new_var, init_stmt);
3241 gimple_assign_set_lhs (init_stmt, new_name);
3242 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3243 gcc_assert (!new_bb);
3244 if (dump_enabled_p ())
3245 {
3246 dump_printf_loc (MSG_NOTE, vect_location,
3247 "created new init_stmt: ");
3248 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3249 }
3250 constant_p = false;
3251 }
3252 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3253 }
3254 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3255 if (constant_p)
3256 new_vec = build_vector_from_ctor (vectype, v);
3257 else
3258 new_vec = build_constructor (vectype, v);
3259 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3260 }
3261
3262
3263 /* Create the vector that holds the step of the induction. */
3264 if (nested_in_vect_loop)
3265 /* iv_loop is nested in the loop to be vectorized. Generate:
3266 vec_step = [S, S, S, S] */
3267 new_name = step_expr;
3268 else
3269 {
3270 /* iv_loop is the loop to be vectorized. Generate:
3271 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3272 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3273 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3274 expr, step_expr);
3275 }
3276
3277 t = unshare_expr (new_name);
3278 gcc_assert (CONSTANT_CLASS_P (new_name));
3279 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3280 gcc_assert (stepvectype);
3281 new_vec = build_vector_from_val (stepvectype, t);
3282 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3283
3284
3285 /* Create the following def-use cycle:
3286 loop prolog:
3287 vec_init = ...
3288 vec_step = ...
3289 loop:
3290 vec_iv = PHI <vec_init, vec_loop>
3291 ...
3292 STMT
3293 ...
3294 vec_loop = vec_iv + vec_step; */
3295
3296 /* Create the induction-phi that defines the induction-operand. */
3297 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3298 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3299 set_vinfo_for_stmt (induction_phi,
3300 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3301 induc_def = PHI_RESULT (induction_phi);
3302
3303 /* Create the iv update inside the loop */
3304 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3305 induc_def, vec_step);
3306 vec_def = make_ssa_name (vec_dest, new_stmt);
3307 gimple_assign_set_lhs (new_stmt, vec_def);
3308 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3309 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3310 NULL));
3311
3312 /* Set the arguments of the phi node: */
3313 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3314 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3315 UNKNOWN_LOCATION);
3316
3317
3318 /* In case the vectorization factor (VF) is bigger than the number
3319 of elements that we can fit in a vectype (nunits), we have to generate
3320 more than one vector stmt - i.e. - we need to "unroll" the
3321 vector stmt by a factor of VF/nunits. For more details see the
3322 documentation in vectorizable_operation. */
3323
3324 if (ncopies > 1)
3325 {
3326 stmt_vec_info prev_stmt_vinfo;
3327 /* FORNOW. This restriction should be relaxed. */
3328 gcc_assert (!nested_in_vect_loop);
3329
3330 /* Create the vector that holds the step of the induction. */
3331 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3332 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3333 expr, step_expr);
3334 t = unshare_expr (new_name);
3335 gcc_assert (CONSTANT_CLASS_P (new_name));
3336 new_vec = build_vector_from_val (stepvectype, t);
3337 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3338
3339 vec_def = induc_def;
3340 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3341 for (i = 1; i < ncopies; i++)
3342 {
3343 /* vec_i = vec_prev + vec_step */
3344 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3345 vec_def, vec_step);
3346 vec_def = make_ssa_name (vec_dest, new_stmt);
3347 gimple_assign_set_lhs (new_stmt, vec_def);
3348
3349 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3350 if (!useless_type_conversion_p (resvectype, vectype))
3351 {
3352 new_stmt = gimple_build_assign_with_ops
3353 (VIEW_CONVERT_EXPR,
3354 vect_get_new_vect_var (resvectype, vect_simple_var,
3355 "vec_iv_"),
3356 build1 (VIEW_CONVERT_EXPR, resvectype,
3357 gimple_assign_lhs (new_stmt)), NULL_TREE);
3358 gimple_assign_set_lhs (new_stmt,
3359 make_ssa_name
3360 (gimple_assign_lhs (new_stmt), new_stmt));
3361 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3362 }
3363 set_vinfo_for_stmt (new_stmt,
3364 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3365 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3366 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3367 }
3368 }
3369
3370 if (nested_in_vect_loop)
3371 {
3372 /* Find the loop-closed exit-phi of the induction, and record
3373 the final vector of induction results: */
3374 exit_phi = NULL;
3375 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3376 {
3377 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3378 {
3379 exit_phi = USE_STMT (use_p);
3380 break;
3381 }
3382 }
3383 if (exit_phi)
3384 {
3385 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3386 /* FORNOW. Currently not supporting the case that an inner-loop induction
3387 is not used in the outer-loop (i.e. only outside the outer-loop). */
3388 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3389 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3390
3391 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3392 if (dump_enabled_p ())
3393 {
3394 dump_printf_loc (MSG_NOTE, vect_location,
3395 "vector of inductions after inner-loop:");
3396 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3397 }
3398 }
3399 }
3400
3401
3402 if (dump_enabled_p ())
3403 {
3404 dump_printf_loc (MSG_NOTE, vect_location,
3405 "transform induction: created def-use cycle: ");
3406 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3407 dump_printf (MSG_NOTE, "\n");
3408 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3409 SSA_NAME_DEF_STMT (vec_def), 0);
3410 }
3411
3412 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3413 if (!useless_type_conversion_p (resvectype, vectype))
3414 {
3415 new_stmt = gimple_build_assign_with_ops
3416 (VIEW_CONVERT_EXPR,
3417 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3418 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3419 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3420 gimple_assign_set_lhs (new_stmt, induc_def);
3421 si = gsi_after_labels (bb);
3422 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3423 set_vinfo_for_stmt (new_stmt,
3424 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3425 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3426 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3427 }
3428
3429 return induc_def;
3430 }
3431
3432
3433 /* Function get_initial_def_for_reduction
3434
3435 Input:
3436 STMT - a stmt that performs a reduction operation in the loop.
3437 INIT_VAL - the initial value of the reduction variable
3438
3439 Output:
3440 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3441 of the reduction (used for adjusting the epilog - see below).
3442 Return a vector variable, initialized according to the operation that STMT
3443 performs. This vector will be used as the initial value of the
3444 vector of partial results.
3445
3446 Option1 (adjust in epilog): Initialize the vector as follows:
3447 add/bit or/xor: [0,0,...,0,0]
3448 mult/bit and: [1,1,...,1,1]
3449 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3450 and when necessary (e.g. add/mult case) let the caller know
3451 that it needs to adjust the result by init_val.
3452
3453 Option2: Initialize the vector as follows:
3454 add/bit or/xor: [init_val,0,0,...,0]
3455 mult/bit and: [init_val,1,1,...,1]
3456 min/max/cond_expr: [init_val,init_val,...,init_val]
3457 and no adjustments are needed.
3458
3459 For example, for the following code:
3460
3461 s = init_val;
3462 for (i=0;i<n;i++)
3463 s = s + a[i];
3464
3465 STMT is 's = s + a[i]', and the reduction variable is 's'.
3466 For a vector of 4 units, we want to return either [0,0,0,init_val],
3467 or [0,0,0,0] and let the caller know that it needs to adjust
3468 the result at the end by 'init_val'.
3469
3470 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3471 is not NULL, because its initialization vector is simpler (same element
3472 in all entries), and Option2 otherwise.
3473
3474 A cost model should help decide between these two schemes. */
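
/* As a further illustration, for a product reduction

     s = init_val;
     for (i = 0; i < n; i++)
       s = s * a[i];

   and a vector of 4 units, Option1 returns [1,1,1,1] and passes init_val
   back through ADJUSTMENT_DEF so the caller folds it into the final
   result, while Option2 returns [init_val,1,1,1] and needs no
   adjustment.  */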
3475
3476 tree
3477 get_initial_def_for_reduction (gimple stmt, tree init_val,
3478 tree *adjustment_def)
3479 {
3480 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3481 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3482 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3483 tree scalar_type = TREE_TYPE (init_val);
3484 tree vectype = get_vectype_for_scalar_type (scalar_type);
3485 int nunits;
3486 enum tree_code code = gimple_assign_rhs_code (stmt);
3487 tree def_for_init;
3488 tree init_def;
3489 tree *elts;
3490 int i;
3491 bool nested_in_vect_loop = false;
3492 tree init_value;
3493 REAL_VALUE_TYPE real_init_val = dconst0;
3494 int int_init_val = 0;
3495 gimple def_stmt = NULL;
3496
3497 gcc_assert (vectype);
3498 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3499
3500 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3501 || SCALAR_FLOAT_TYPE_P (scalar_type));
3502
3503 if (nested_in_vect_loop_p (loop, stmt))
3504 nested_in_vect_loop = true;
3505 else
3506 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3507
3508 /* In case of double reduction we only create a vector variable to be put
3509 in the reduction phi node. The actual statement creation is done in
3510 vect_create_epilog_for_reduction. */
3511 if (adjustment_def && nested_in_vect_loop
3512 && TREE_CODE (init_val) == SSA_NAME
3513 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3514 && gimple_code (def_stmt) == GIMPLE_PHI
3515 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3516 && vinfo_for_stmt (def_stmt)
3517 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3518 == vect_double_reduction_def)
3519 {
3520 *adjustment_def = NULL;
3521 return vect_create_destination_var (init_val, vectype);
3522 }
3523
3524 if (TREE_CONSTANT (init_val))
3525 {
3526 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3527 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3528 else
3529 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3530 }
3531 else
3532 init_value = init_val;
3533
3534 switch (code)
3535 {
3536 case WIDEN_SUM_EXPR:
3537 case DOT_PROD_EXPR:
3538 case PLUS_EXPR:
3539 case MINUS_EXPR:
3540 case BIT_IOR_EXPR:
3541 case BIT_XOR_EXPR:
3542 case MULT_EXPR:
3543 case BIT_AND_EXPR:
3544 /* ADJUSTMENT_DEF is NULL when called from
3545 vect_create_epilog_for_reduction to vectorize double reduction. */
3546 if (adjustment_def)
3547 {
3548 if (nested_in_vect_loop)
3549 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3550 NULL);
3551 else
3552 *adjustment_def = init_val;
3553 }
3554
3555 if (code == MULT_EXPR)
3556 {
3557 real_init_val = dconst1;
3558 int_init_val = 1;
3559 }
3560
3561 if (code == BIT_AND_EXPR)
3562 int_init_val = -1;
3563
3564 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3565 def_for_init = build_real (scalar_type, real_init_val);
3566 else
3567 def_for_init = build_int_cst (scalar_type, int_init_val);
3568
3569 /* Create a vector of '0' or '1' in all elements but the first. */
3570 elts = XALLOCAVEC (tree, nunits);
3571 for (i = nunits - 2; i >= 0; --i)
3572 elts[i + 1] = def_for_init;
3573
3574 /* Option1: the first element is '0' or '1' as well. */
3575 if (adjustment_def)
3576 {
3577 elts[0] = def_for_init;
3578 init_def = build_vector (vectype, elts);
3579 break;
3580 }
3581
3582 /* Option2: the first element is INIT_VAL. */
3583 elts[0] = init_val;
3584 if (TREE_CONSTANT (init_val))
3585 init_def = build_vector (vectype, elts);
3586 else
3587 {
3588 vec<constructor_elt, va_gc> *v;
3589 vec_alloc (v, nunits);
3590 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3591 for (i = 1; i < nunits; ++i)
3592 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3593 init_def = build_constructor (vectype, v);
3594 }
3595
3596 break;
3597
3598 case MIN_EXPR:
3599 case MAX_EXPR:
3600 case COND_EXPR:
3601 if (adjustment_def)
3602 {
3603 *adjustment_def = NULL_TREE;
3604 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3605 break;
3606 }
3607
3608 init_def = build_vector_from_val (vectype, init_value);
3609 break;
3610
3611 default:
3612 gcc_unreachable ();
3613 }
3614
3615 return init_def;
3616 }
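/* For illustration only (hypothetical code, not part of the vectorizer):
   the scalar analogue of the "adjust in epilog" scheme above, for a
   PLUS_EXPR reduction with a 4-element vector, assuming n is a multiple
   of 4:

     int
     sum_reduction (const int *a, int n, int init_val)
     {
       int v[4] = { 0, 0, 0, 0 };          // Option1: neutral element.
       for (int i = 0; i < n; i += 4)      // vectorized loop body
         for (int k = 0; k < 4; k++)
           v[k] += a[i + k];               // vector of partial sums
       int s = v[0] + v[1] + v[2] + v[3];  // epilog: reduce the vector
       return s + init_val;                // adjust by ADJUSTMENT_DEF
     }

   With Option2 the initial vector would instead be { init_val, 0, 0, 0 }
   and no final adjustment would be needed.  */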
3617
3618
3619 /* Function vect_create_epilog_for_reduction
3620
3621 Create code at the loop-epilog to finalize the result of a reduction
3622 computation.
3623
3624 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3625 reduction statements.
3626 STMT is the scalar reduction stmt that is being vectorized.
3627 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3628 number of elements that we can fit in a vectype (nunits). In this case
3629 we have to generate more than one vector stmt - i.e - we need to "unroll"
3630 the vector stmt by a factor VF/nunits. For more details see documentation
3631 in vectorizable_operation.
3632 REDUC_CODE is the tree-code for the epilog reduction.
3633 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3634 computation.
3635 REDUC_INDEX is the index of the operand in the right hand side of the
3636 statement that is defined by REDUCTION_PHI.
3637 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3638 SLP_NODE is an SLP node containing a group of reduction statements. The
3639 first one in this group is STMT.
3640
3641 This function:
3642 1. Creates the reduction def-use cycles: sets the arguments for
3643 REDUCTION_PHIS:
3644 The loop-entry argument is the vectorized initial-value of the reduction.
3645 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3646 sums.
3647 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3648 by applying the operation specified by REDUC_CODE if available, or by
3649 other means (whole-vector shifts or a scalar loop).
3650 The function also creates a new phi node at the loop exit to preserve
3651 loop-closed form, as illustrated below.
3652
3653 The flow at the entry to this function:
3654
3655 loop:
3656 vec_def = phi <null, null> # REDUCTION_PHI
3657 VECT_DEF = vector_stmt # vectorized form of STMT
3658 s_loop = scalar_stmt # (scalar) STMT
3659 loop_exit:
3660 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3661 use <s_out0>
3662 use <s_out0>
3663
3664 The above is transformed by this function into:
3665
3666 loop:
3667 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3668 VECT_DEF = vector_stmt # vectorized form of STMT
3669 s_loop = scalar_stmt # (scalar) STMT
3670 loop_exit:
3671 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3672 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3673 v_out2 = reduce <v_out1>
3674 s_out3 = extract_field <v_out2, 0>
3675 s_out4 = adjust_result <s_out3>
3676 use <s_out4>
3677 use <s_out4>
3678 */
3679
3680 static void
3681 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3682 int ncopies, enum tree_code reduc_code,
3683 vec<gimple> reduction_phis,
3684 int reduc_index, bool double_reduc,
3685 slp_tree slp_node)
3686 {
3687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3688 stmt_vec_info prev_phi_info;
3689 tree vectype;
3690 enum machine_mode mode;
3691 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3692 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3693 basic_block exit_bb;
3694 tree scalar_dest;
3695 tree scalar_type;
3696 gimple new_phi = NULL, phi;
3697 gimple_stmt_iterator exit_gsi;
3698 tree vec_dest;
3699 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3700 gimple epilog_stmt = NULL;
3701 enum tree_code code = gimple_assign_rhs_code (stmt);
3702 gimple exit_phi;
3703 tree bitsize, bitpos;
3704 tree adjustment_def = NULL;
3705 tree vec_initial_def = NULL;
3706 tree reduction_op, expr, def;
3707 tree orig_name, scalar_result;
3708 imm_use_iterator imm_iter, phi_imm_iter;
3709 use_operand_p use_p, phi_use_p;
3710 bool extract_scalar_result = false;
3711 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3712 bool nested_in_vect_loop = false;
3713 vec<gimple> new_phis = vNULL;
3714 vec<gimple> inner_phis = vNULL;
3715 enum vect_def_type dt = vect_unknown_def_type;
3716 int j, i;
3717 vec<tree> scalar_results = vNULL;
3718 unsigned int group_size = 1, k, ratio;
3719 vec<tree> vec_initial_defs = vNULL;
3720 vec<gimple> phis;
3721 bool slp_reduc = false;
3722 tree new_phi_result;
3723 gimple inner_phi = NULL;
3724
3725 if (slp_node)
3726 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3727
3728 if (nested_in_vect_loop_p (loop, stmt))
3729 {
3730 outer_loop = loop;
3731 loop = loop->inner;
3732 nested_in_vect_loop = true;
3733 gcc_assert (!slp_node);
3734 }
3735
3736 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3737 {
3738 case GIMPLE_SINGLE_RHS:
3739 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3740 == ternary_op);
3741 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3742 break;
3743 case GIMPLE_UNARY_RHS:
3744 reduction_op = gimple_assign_rhs1 (stmt);
3745 break;
3746 case GIMPLE_BINARY_RHS:
3747 reduction_op = reduc_index ?
3748 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3749 break;
3750 case GIMPLE_TERNARY_RHS:
3751 reduction_op = gimple_op (stmt, reduc_index + 1);
3752 break;
3753 default:
3754 gcc_unreachable ();
3755 }
3756
3757 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3758 gcc_assert (vectype);
3759 mode = TYPE_MODE (vectype);
3760
3761 /* 1. Create the reduction def-use cycle:
3762 Set the arguments of REDUCTION_PHIS, i.e., transform
3763
3764 loop:
3765 vec_def = phi <null, null> # REDUCTION_PHI
3766 VECT_DEF = vector_stmt # vectorized form of STMT
3767 ...
3768
3769 into:
3770
3771 loop:
3772 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3773 VECT_DEF = vector_stmt # vectorized form of STMT
3774 ...
3775
3776 (in case of SLP, do it for all the phis). */
3777
3778 /* Get the loop-entry arguments. */
3779 if (slp_node)
3780 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3781 NULL, slp_node, reduc_index);
3782 else
3783 {
3784 vec_initial_defs.create (1);
3785 /* For the case of reduction, vect_get_vec_def_for_operand returns
3786 the scalar def before the loop, that defines the initial value
3787 of the reduction variable. */
3788 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3789 &adjustment_def);
3790 vec_initial_defs.quick_push (vec_initial_def);
3791 }
3792
3793 /* Set phi nodes arguments. */
3794 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3795 {
3796 tree vec_init_def = vec_initial_defs[i];
3797 tree def = vect_defs[i];
3798 for (j = 0; j < ncopies; j++)
3799 {
3800 /* Set the loop-entry arg of the reduction-phi. */
3801 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3802 UNKNOWN_LOCATION);
3803
3804 /* Set the loop-latch arg for the reduction-phi. */
3805 if (j > 0)
3806 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3807
3808 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3809
3810 if (dump_enabled_p ())
3811 {
3812 dump_printf_loc (MSG_NOTE, vect_location,
3813 "transform reduction: created def-use cycle: ");
3814 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3815 dump_printf (MSG_NOTE, "\n");
3816 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3817 }
3818
3819 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3820 }
3821 }
3822
3823 vec_initial_defs.release ();
3824
3825 /* 2. Create epilog code.
3826 The reduction epilog code operates across the elements of the vector
3827 of partial results computed by the vectorized loop.
3828 The reduction epilog code consists of:
3829
3830 step 1: compute the scalar result in a vector (v_out2)
3831 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3832 step 3: adjust the scalar result (s_out3) if needed.
3833
3834 Step 1 can be accomplished using one of the following three schemes:
3835 (scheme 1) using reduc_code, if available.
3836 (scheme 2) using whole-vector shifts, if available.
3837 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3838 combined.
3839
3840 The overall epilog code looks like this:
3841
3842 s_out0 = phi <s_loop> # original EXIT_PHI
3843 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3844 v_out2 = reduce <v_out1> # step 1
3845 s_out3 = extract_field <v_out2, 0> # step 2
3846 s_out4 = adjust_result <s_out3> # step 3
3847
3848 (step 3 is optional, and steps 1 and 2 may be combined).
3849 Lastly, the uses of s_out0 are replaced by s_out4. */
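/* As a (hypothetical) scalar sketch of scheme 2 for an addition reduction
   over a 4-element integer vector, the shift-and-op sequence corresponds
   to:

     static int
     reduce_by_shifts (int v[4])
     {
       // offset = VS/2 = 2 elements: va' = vec_shift <va, 2>; va = va + va'
       v[0] += v[2];  v[1] += v[3];
       // offset = 1 element: one more shift and add
       v[0] += v[1];
       return v[0];   // step 2 extracts element 0
     }

   Scheme 3 instead extracts every element and accumulates them with
   scalar operations.  */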
3850
3851
3852 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3853 v_out1 = phi <VECT_DEF>
3854 Store them in NEW_PHIS. */
3855
3856 exit_bb = single_exit (loop)->dest;
3857 prev_phi_info = NULL;
3858 new_phis.create (vect_defs.length ());
3859 FOR_EACH_VEC_ELT (vect_defs, i, def)
3860 {
3861 for (j = 0; j < ncopies; j++)
3862 {
3863 tree new_def = copy_ssa_name (def, NULL);
3864 phi = create_phi_node (new_def, exit_bb);
3865 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3866 if (j == 0)
3867 new_phis.quick_push (phi);
3868 else
3869 {
3870 def = vect_get_vec_def_for_stmt_copy (dt, def);
3871 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3872 }
3873
3874 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3875 prev_phi_info = vinfo_for_stmt (phi);
3876 }
3877 }
3878
3879 /* The epilogue is created for the outer-loop, i.e., for the loop being
3880 vectorized. Create exit phis for the outer loop. */
3881 if (double_reduc)
3882 {
3883 loop = outer_loop;
3884 exit_bb = single_exit (loop)->dest;
3885 inner_phis.create (vect_defs.length ());
3886 FOR_EACH_VEC_ELT (new_phis, i, phi)
3887 {
3888 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3889 gimple outer_phi = create_phi_node (new_result, exit_bb);
3890 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3891 PHI_RESULT (phi));
3892 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3893 loop_vinfo, NULL));
3894 inner_phis.quick_push (phi);
3895 new_phis[i] = outer_phi;
3896 prev_phi_info = vinfo_for_stmt (outer_phi);
3897 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3898 {
3899 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3900 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3901 outer_phi = create_phi_node (new_result, exit_bb);
3902 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3903 PHI_RESULT (phi));
3904 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3905 loop_vinfo, NULL));
3906 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3907 prev_phi_info = vinfo_for_stmt (outer_phi);
3908 }
3909 }
3910 }
3911
3912 exit_gsi = gsi_after_labels (exit_bb);
3913
3914 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3915 (i.e. when reduc_code is not available) and in the final adjustment
3916 code (if needed). Also get the original scalar reduction variable as
3917 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3918 represents a reduction pattern), the tree-code and scalar-def are
3919 taken from the original stmt that the pattern-stmt (STMT) replaces.
3920 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3921 are taken from STMT. */
3922
3923 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3924 if (!orig_stmt)
3925 {
3926 /* Regular reduction */
3927 orig_stmt = stmt;
3928 }
3929 else
3930 {
3931 /* Reduction pattern */
3932 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3933 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3934 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3935 }
3936
3937 code = gimple_assign_rhs_code (orig_stmt);
3938 /* For MINUS_EXPR the initial vector is [init_val,0,...,0]; therefore,
3939 partial results are added and not subtracted. */
3940 if (code == MINUS_EXPR)
3941 code = PLUS_EXPR;
3942
3943 scalar_dest = gimple_assign_lhs (orig_stmt);
3944 scalar_type = TREE_TYPE (scalar_dest);
3945 scalar_results.create (group_size);
3946 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3947 bitsize = TYPE_SIZE (scalar_type);
3948
3949 /* In case this is a reduction in an inner-loop while vectorizing an outer
3950 loop - we don't need to extract a single scalar result at the end of the
3951 inner-loop (unless it is double reduction, i.e., the use of reduction is
3952 outside the outer-loop). The final vector of partial results will be used
3953 in the vectorized outer-loop, or reduced to a scalar result at the end of
3954 the outer-loop. */
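/* E.g. (hypothetical source code), for

     for (i = 0; i < n; i++)
       {
         s = 0;
         for (j = 0; j < m; j++)
           s += a[i][j];
         b[i] = s;
       }

   when the outer loop is the one being vectorized, each vector lane keeps
   the partial sum of a different outer iteration, and that vector is
   consumed directly by the vectorized store to b[], so no scalar result
   is extracted here.  */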
3955 if (nested_in_vect_loop && !double_reduc)
3956 goto vect_finalize_reduction;
3957
3958 /* SLP reduction without reduction chain, e.g.,
3959 # a1 = phi <a2, a0>
3960 # b1 = phi <b2, b0>
3961 a2 = operation (a1)
3962 b2 = operation (b1) */
3963 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3964
3965 /* In case of reduction chain, e.g.,
3966 # a1 = phi <a3, a0>
3967 a2 = operation (a1)
3968 a3 = operation (a2),
3969
3970 we may end up with more than one vector result. Here we reduce them to
3971 one vector. */
3972 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3973 {
3974 tree first_vect = PHI_RESULT (new_phis[0]);
3975 tree tmp;
3976 gimple new_vec_stmt = NULL;
3977
3978 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3979 for (k = 1; k < new_phis.length (); k++)
3980 {
3981 gimple next_phi = new_phis[k];
3982 tree second_vect = PHI_RESULT (next_phi);
3983
3984 tmp = build2 (code, vectype, first_vect, second_vect);
3985 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3986 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3987 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3988 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3989 }
3990
3991 new_phi_result = first_vect;
3992 if (new_vec_stmt)
3993 {
3994 new_phis.truncate (0);
3995 new_phis.safe_push (new_vec_stmt);
3996 }
3997 }
3998 else
3999 new_phi_result = PHI_RESULT (new_phis[0]);
4000
4001 /* 2.3 Create the reduction code, using one of the three schemes described
4002 above. In SLP we simply need to extract all the elements from the
4003 vector (without reducing them), so we use scalar shifts. */
4004 if (reduc_code != ERROR_MARK && !slp_reduc)
4005 {
4006 tree tmp;
4007
4008 /*** Case 1: Create:
4009 v_out2 = reduc_expr <v_out1> */
4010
4011 if (dump_enabled_p ())
4012 dump_printf_loc (MSG_NOTE, vect_location,
4013 "Reduce using direct vector reduction.");
4014
4015 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4016 tmp = build1 (reduc_code, vectype, new_phi_result);
4017 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4018 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4019 gimple_assign_set_lhs (epilog_stmt, new_temp);
4020 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4021
4022 extract_scalar_result = true;
4023 }
4024 else
4025 {
4026 enum tree_code shift_code = ERROR_MARK;
4027 bool have_whole_vector_shift = true;
4028 int bit_offset;
4029 int element_bitsize = tree_low_cst (bitsize, 1);
4030 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4031 tree vec_temp;
4032
4033 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4034 shift_code = VEC_RSHIFT_EXPR;
4035 else
4036 have_whole_vector_shift = false;
4037
4038 /* Regardless of whether we have a whole vector shift, if we're
4039 emulating the operation via tree-vect-generic, we don't want
4040 to use it. Only the first round of the reduction is likely
4041 to still be profitable via emulation. */
4042 /* ??? It might be better to emit a reduction tree code here, so that
4043 tree-vect-generic can expand the first round via bit tricks. */
4044 if (!VECTOR_MODE_P (mode))
4045 have_whole_vector_shift = false;
4046 else
4047 {
4048 optab optab = optab_for_tree_code (code, vectype, optab_default);
4049 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4050 have_whole_vector_shift = false;
4051 }
4052
4053 if (have_whole_vector_shift && !slp_reduc)
4054 {
4055 /*** Case 2: Create:
4056 for (offset = VS/2; offset >= element_size; offset/=2)
4057 {
4058 Create: va' = vec_shift <va, offset>
4059 Create: va = vop <va, va'>
4060 } */
4061
4062 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_NOTE, vect_location,
4064 "Reduce using vector shifts");
4065
4066 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4067 new_temp = new_phi_result;
4068 for (bit_offset = vec_size_in_bits/2;
4069 bit_offset >= element_bitsize;
4070 bit_offset /= 2)
4071 {
4072 tree bitpos = size_int (bit_offset);
4073
4074 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4075 vec_dest, new_temp, bitpos);
4076 new_name = make_ssa_name (vec_dest, epilog_stmt);
4077 gimple_assign_set_lhs (epilog_stmt, new_name);
4078 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4079
4080 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4081 new_name, new_temp);
4082 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4083 gimple_assign_set_lhs (epilog_stmt, new_temp);
4084 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4085 }
4086
4087 extract_scalar_result = true;
4088 }
4089 else
4090 {
4091 tree rhs;
4092
4093 /*** Case 3: Create:
4094 s = extract_field <v_out2, 0>
4095 for (offset = element_size;
4096 offset < vector_size;
4097 offset += element_size;)
4098 {
4099 Create: s' = extract_field <v_out2, offset>
4100 Create: s = op <s, s'> // For non SLP cases
4101 } */
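/* For instance, for a hypothetical V4SI vector (element_bitsize == 32,
   vec_size_in_bits == 128) and an addition reduction, the loop below
   visits bit offsets 32, 64 and 96, producing the scalar equivalent of:

     s = v[0];
     s += v[1];
     s += v[2];
     s += v[3];

   (the element at offset 0 is extracted just before the loop).  */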
4102
4103 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_NOTE, vect_location,
4105 "Reduce using scalar code. ");
4106
4107 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4108 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4109 {
4110 if (gimple_code (new_phi) == GIMPLE_PHI)
4111 vec_temp = PHI_RESULT (new_phi);
4112 else
4113 vec_temp = gimple_assign_lhs (new_phi);
4114 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4115 bitsize_zero_node);
4116 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4117 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4118 gimple_assign_set_lhs (epilog_stmt, new_temp);
4119 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4120
4121 /* In SLP we don't need to apply reduction operation, so we just
4122 collect s' values in SCALAR_RESULTS. */
4123 if (slp_reduc)
4124 scalar_results.safe_push (new_temp);
4125
4126 for (bit_offset = element_bitsize;
4127 bit_offset < vec_size_in_bits;
4128 bit_offset += element_bitsize)
4129 {
4130 tree bitpos = bitsize_int (bit_offset);
4131 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4132 bitsize, bitpos);
4133
4134 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4135 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4136 gimple_assign_set_lhs (epilog_stmt, new_name);
4137 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4138
4139 if (slp_reduc)
4140 {
4141 /* In SLP we don't need to apply reduction operation, so
4142 we just collect s' values in SCALAR_RESULTS. */
4143 new_temp = new_name;
4144 scalar_results.safe_push (new_name);
4145 }
4146 else
4147 {
4148 epilog_stmt = gimple_build_assign_with_ops (code,
4149 new_scalar_dest, new_name, new_temp);
4150 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4151 gimple_assign_set_lhs (epilog_stmt, new_temp);
4152 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4153 }
4154 }
4155 }
4156
4157 /* The only case where we need to reduce scalar results in SLP is
4158 unrolling. If the size of SCALAR_RESULTS is greater than
4159 GROUP_SIZE, we reduce them by combining elements modulo
4160 GROUP_SIZE. */
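/* E.g. (hypothetical) with GROUP_SIZE == 2 and SCALAR_RESULTS holding
   { s0, s1, s2, s3 } after unrolling, assuming an addition reduction, the
   loop below performs:

     scalar_results[0] = s0 + s2;
     scalar_results[1] = s1 + s3;

   leaving one result per group element in the first GROUP_SIZE slots.  */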
4161 if (slp_reduc)
4162 {
4163 tree res, first_res, new_res;
4164 gimple new_stmt;
4165
4166 /* Reduce multiple scalar results in case of SLP unrolling. */
4167 for (j = group_size; scalar_results.iterate (j, &res);
4168 j++)
4169 {
4170 first_res = scalar_results[j % group_size];
4171 new_stmt = gimple_build_assign_with_ops (code,
4172 new_scalar_dest, first_res, res);
4173 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4174 gimple_assign_set_lhs (new_stmt, new_res);
4175 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4176 scalar_results[j % group_size] = new_res;
4177 }
4178 }
4179 else
4180 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4181 scalar_results.safe_push (new_temp);
4182
4183 extract_scalar_result = false;
4184 }
4185 }
4186
4187 /* 2.4 Extract the final scalar result. Create:
4188 s_out3 = extract_field <v_out2, bitpos> */
4189
4190 if (extract_scalar_result)
4191 {
4192 tree rhs;
4193
4194 if (dump_enabled_p ())
4195 dump_printf_loc (MSG_NOTE, vect_location,
4196 "extract scalar result");
4197
4198 if (BYTES_BIG_ENDIAN)
4199 bitpos = size_binop (MULT_EXPR,
4200 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4201 TYPE_SIZE (scalar_type));
4202 else
4203 bitpos = bitsize_zero_node;
4204
4205 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4206 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4207 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4208 gimple_assign_set_lhs (epilog_stmt, new_temp);
4209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4210 scalar_results.safe_push (new_temp);
4211 }
4212
4213 vect_finalize_reduction:
4214
4215 if (double_reduc)
4216 loop = loop->inner;
4217
4218 /* 2.5 Adjust the final result by the initial value of the reduction
4219 variable. (When such adjustment is not needed, then
4220 'adjustment_def' is zero). For example, if code is PLUS we create:
4221 new_temp = loop_exit_def + adjustment_def */
4222
4223 if (adjustment_def)
4224 {
4225 gcc_assert (!slp_reduc);
4226 if (nested_in_vect_loop)
4227 {
4228 new_phi = new_phis[0];
4229 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4230 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4231 new_dest = vect_create_destination_var (scalar_dest, vectype);
4232 }
4233 else
4234 {
4235 new_temp = scalar_results[0];
4236 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4237 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4238 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4239 }
4240
4241 epilog_stmt = gimple_build_assign (new_dest, expr);
4242 new_temp = make_ssa_name (new_dest, epilog_stmt);
4243 gimple_assign_set_lhs (epilog_stmt, new_temp);
4244 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4245 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4246 if (nested_in_vect_loop)
4247 {
4248 set_vinfo_for_stmt (epilog_stmt,
4249 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4250 NULL));
4251 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4252 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4253
4254 if (!double_reduc)
4255 scalar_results.quick_push (new_temp);
4256 else
4257 scalar_results[0] = new_temp;
4258 }
4259 else
4260 scalar_results[0] = new_temp;
4261
4262 new_phis[0] = epilog_stmt;
4263 }
4264
4265 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4266 phis with new adjusted scalar results, i.e., replace use <s_out0>
4267 with use <s_out4>.
4268
4269 Transform:
4270 loop_exit:
4271 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4272 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4273 v_out2 = reduce <v_out1>
4274 s_out3 = extract_field <v_out2, 0>
4275 s_out4 = adjust_result <s_out3>
4276 use <s_out0>
4277 use <s_out0>
4278
4279 into:
4280
4281 loop_exit:
4282 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4283 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4284 v_out2 = reduce <v_out1>
4285 s_out3 = extract_field <v_out2, 0>
4286 s_out4 = adjust_result <s_out3>
4287 use <s_out4>
4288 use <s_out4> */
4289
4290
4291 /* In SLP reduction chain we reduce vector results into one vector if
4292 necessary; hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4293 the last stmt in the reduction chain, since we are looking for the loop
4294 exit phi node. */
4295 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4296 {
4297 scalar_dest = gimple_assign_lhs (
4298 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4299 group_size = 1;
4300 }
4301
4302 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4303 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4304 need to match SCALAR_RESULTS with corresponding statements. The first
4305 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4306 the first vector stmt, etc.
4307 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
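/* For example (hypothetical): with GROUP_SIZE == 4 and two vector stmts
   in NEW_PHIS, RATIO == 2 and the loop below pairs them up as:

     k:          0  1  2  3
     k / ratio:  0  0  1  1   ==> epilog_stmt = new_phis[k / ratio]  */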
4308 if (group_size > new_phis.length ())
4309 {
4310 ratio = group_size / new_phis.length ();
4311 gcc_assert (!(group_size % new_phis.length ()));
4312 }
4313 else
4314 ratio = 1;
4315
4316 for (k = 0; k < group_size; k++)
4317 {
4318 if (k % ratio == 0)
4319 {
4320 epilog_stmt = new_phis[k / ratio];
4321 reduction_phi = reduction_phis[k / ratio];
4322 if (double_reduc)
4323 inner_phi = inner_phis[k / ratio];
4324 }
4325
4326 if (slp_reduc)
4327 {
4328 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4329
4330 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4331 /* SLP statements can't participate in patterns. */
4332 gcc_assert (!orig_stmt);
4333 scalar_dest = gimple_assign_lhs (current_stmt);
4334 }
4335
4336 phis.create (3);
4337 /* Find the loop-closed-use at the loop exit of the original scalar
4338 result. (The reduction result is expected to have two immediate uses -
4339 one at the latch block, and one at the loop exit). */
4340 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4341 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4342 phis.safe_push (USE_STMT (use_p));
4343
4344 /* We expect to have found an exit_phi because of loop-closed-ssa
4345 form. */
4346 gcc_assert (!phis.is_empty ());
4347
4348 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4349 {
4350 if (outer_loop)
4351 {
4352 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4353 gimple vect_phi;
4354
4355 /* FORNOW. Currently not supporting the case that an inner-loop
4356 reduction is not used in the outer-loop (but only outside the
4357 outer-loop), unless it is double reduction. */
4358 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4359 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4360 || double_reduc);
4361
4362 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4363 if (!double_reduc
4364 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4365 != vect_double_reduction_def)
4366 continue;
4367
4368 /* Handle double reduction:
4369
4370 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4371 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4372 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4373 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4374
4375 At that point the regular reduction (stmt2 and stmt3) is
4376 already vectorized, as well as the exit phi node, stmt4.
4377 Here we vectorize the phi node of double reduction, stmt1, and
4378 update all relevant statements. */
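/* In source terms, a double reduction arises from e.g. (hypothetical):

     s = s0;
     for (i = 0; i < n; i++)      // outer loop: stmt1 is the phi of s
       for (j = 0; j < m; j++)    // inner loop: stmt2 is the inner phi
         s += a[i][j];            // stmt3; stmt4 closes the outer cycle

   where the accumulated value is used only after the outer loop.  */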
4379
4380 /* Go through all the uses of s2 to find double reduction phi
4381 node, i.e., stmt1 above. */
4382 orig_name = PHI_RESULT (exit_phi);
4383 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4384 {
4385 stmt_vec_info use_stmt_vinfo;
4386 stmt_vec_info new_phi_vinfo;
4387 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4388 basic_block bb = gimple_bb (use_stmt);
4389 gimple use;
4390
4391 /* Check that USE_STMT is really double reduction phi
4392 node. */
4393 if (gimple_code (use_stmt) != GIMPLE_PHI
4394 || gimple_phi_num_args (use_stmt) != 2
4395 || bb->loop_father != outer_loop)
4396 continue;
4397 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4398 if (!use_stmt_vinfo
4399 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4400 != vect_double_reduction_def)
4401 continue;
4402
4403 /* Create vector phi node for double reduction:
4404 vs1 = phi <vs0, vs2>
4405 vs1 was created previously in this function by a call to
4406 vect_get_vec_def_for_operand and is stored in
4407 vec_initial_def;
4408 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4409 vs0 is created here. */
4410
4411 /* Create vector phi node. */
4412 vect_phi = create_phi_node (vec_initial_def, bb);
4413 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4414 loop_vec_info_for_loop (outer_loop), NULL);
4415 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4416
4417 /* Create vs0 - initial def of the double reduction phi. */
4418 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4419 loop_preheader_edge (outer_loop));
4420 init_def = get_initial_def_for_reduction (stmt,
4421 preheader_arg, NULL);
4422 vect_phi_init = vect_init_vector (use_stmt, init_def,
4423 vectype, NULL);
4424
4425 /* Update phi node arguments with vs0 and vs2. */
4426 add_phi_arg (vect_phi, vect_phi_init,
4427 loop_preheader_edge (outer_loop),
4428 UNKNOWN_LOCATION);
4429 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4430 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4431 if (dump_enabled_p ())
4432 {
4433 dump_printf_loc (MSG_NOTE, vect_location,
4434 "created double reduction phi node: ");
4435 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4436 }
4437
4438 vect_phi_res = PHI_RESULT (vect_phi);
4439
4440 /* Replace the use, i.e., set the correct vs1 in the regular
4441 reduction phi node. FORNOW, NCOPIES is always 1, so the
4442 loop is redundant. */
4443 use = reduction_phi;
4444 for (j = 0; j < ncopies; j++)
4445 {
4446 edge pr_edge = loop_preheader_edge (loop);
4447 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4448 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4449 }
4450 }
4451 }
4452 }
4453
4454 phis.release ();
4455 if (nested_in_vect_loop)
4456 {
4457 if (double_reduc)
4458 loop = outer_loop;
4459 else
4460 continue;
4461 }
4462
4463 phis.create (3);
4464 /* Find the loop-closed-use at the loop exit of the original scalar
4465 result. (The reduction result is expected to have two immediate uses,
4466 one at the latch block, and one at the loop exit). For double
4467 reductions we are looking for exit phis of the outer loop. */
4468 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4469 {
4470 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4471 phis.safe_push (USE_STMT (use_p));
4472 else
4473 {
4474 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4475 {
4476 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4477
4478 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4479 {
4480 if (!flow_bb_inside_loop_p (loop,
4481 gimple_bb (USE_STMT (phi_use_p))))
4482 phis.safe_push (USE_STMT (phi_use_p));
4483 }
4484 }
4485 }
4486 }
4487
4488 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4489 {
4490 /* Replace the uses: */
4491 orig_name = PHI_RESULT (exit_phi);
4492 scalar_result = scalar_results[k];
4493 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4494 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4495 SET_USE (use_p, scalar_result);
4496 }
4497
4498 phis.release ();
4499 }
4500
4501 scalar_results.release ();
4502 inner_phis.release ();
4503 new_phis.release ();
4504 }
4505
4506
4507 /* Function vectorizable_reduction.
4508
4509 Check if STMT performs a reduction operation that can be vectorized.
4510 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4511 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4512 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4513
4514 This function also handles reduction idioms (patterns) that have been
4515 recognized in advance during vect_pattern_recog. In this case, STMT may be
4516 of this form:
4517 X = pattern_expr (arg0, arg1, ..., X)
4518 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4519 sequence that had been detected and replaced by the pattern-stmt (STMT).
4520
4521 In some cases of reduction patterns, the type of the reduction variable X is
4522 different than the type of the other arguments of STMT.
4523 In such cases, the vectype that is used when transforming STMT into a vector
4524 stmt is different than the vectype that is used to determine the
4525 vectorization factor, because it consists of a different number of elements
4526 than the actual number of elements that are being operated upon in parallel.
4527
4528 For example, consider an accumulation of shorts into an int accumulator.
4529 On some targets it's possible to vectorize this pattern operating on 8
4530 shorts at a time (hence, the vectype for purposes of determining the
4531 vectorization factor should be V8HI); on the other hand, the vectype that
4532 is used to create the vector form is actually V4SI (the type of the result).
4533
4534 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4535 indicates the actual level of parallelism (V8HI in the example), so that
4536 the right vectorization factor can be derived. This vectype
4537 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4538 be used to create the vectorized stmt. The right vectype for the vectorized
4539 stmt is obtained from the type of the result X:
4540 get_vectype_for_scalar_type (TREE_TYPE (X))
4541
4542 This means that, contrary to "regular" reductions (or "regular" stmts in
4543 general), the following equation:
4544 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4545 does *NOT* necessarily hold for reduction patterns. */
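/* For instance (hypothetical source), the short-into-int accumulation
   described above comes from code like:

     short a[N];
     int acc = 0;
     for (i = 0; i < N; i++)
       acc += a[i];        // recognized pattern: acc = widen_sum <a[i], acc>

   STMT_VINFO_VECTYPE is then V8HI (eight shorts determine the
   vectorization factor), while the vectorized statement itself is created
   with the V4SI type obtained from TREE_TYPE (acc).  */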
4546
4547 bool
4548 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4549 gimple *vec_stmt, slp_tree slp_node)
4550 {
4551 tree vec_dest;
4552 tree scalar_dest;
4553 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4554 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4555 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4556 tree vectype_in = NULL_TREE;
4557 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4558 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4559 enum tree_code code, orig_code, epilog_reduc_code;
4560 enum machine_mode vec_mode;
4561 int op_type;
4562 optab optab, reduc_optab;
4563 tree new_temp = NULL_TREE;
4564 tree def;
4565 gimple def_stmt;
4566 enum vect_def_type dt;
4567 gimple new_phi = NULL;
4568 tree scalar_type;
4569 bool is_simple_use;
4570 gimple orig_stmt;
4571 stmt_vec_info orig_stmt_info;
4572 tree expr = NULL_TREE;
4573 int i;
4574 int ncopies;
4575 int epilog_copies;
4576 stmt_vec_info prev_stmt_info, prev_phi_info;
4577 bool single_defuse_cycle = false;
4578 tree reduc_def = NULL_TREE;
4579 gimple new_stmt = NULL;
4580 int j;
4581 tree ops[3];
4582 bool nested_cycle = false, found_nested_cycle_def = false;
4583 gimple reduc_def_stmt = NULL;
4584 /* The default is that the reduction variable is the last in statement. */
4585 int reduc_index = 2;
4586 bool double_reduc = false, dummy;
4587 basic_block def_bb;
4588 struct loop * def_stmt_loop, *outer_loop = NULL;
4589 tree def_arg;
4590 gimple def_arg_stmt;
4591 vec<tree> vec_oprnds0 = vNULL;
4592 vec<tree> vec_oprnds1 = vNULL;
4593 vec<tree> vect_defs = vNULL;
4594 vec<gimple> phis = vNULL;
4595 int vec_num;
4596 tree def0, def1, tem, op0, op1 = NULL_TREE;
4597
4598 /* In case of reduction chain we switch to the first stmt in the chain, but
4599 we don't update STMT_INFO, since only the last stmt is marked as reduction
4600 and has reduction properties. */
4601 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4602 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4603
4604 if (nested_in_vect_loop_p (loop, stmt))
4605 {
4606 outer_loop = loop;
4607 loop = loop->inner;
4608 nested_cycle = true;
4609 }
4610
4611 /* 1. Is vectorizable reduction? */
4612 /* Not supportable if the reduction variable is used in the loop, unless
4613 it's a reduction chain. */
4614 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4615 && !GROUP_FIRST_ELEMENT (stmt_info))
4616 return false;
4617
4618 /* Reductions that are not used even in an enclosing outer-loop,
4619 are expected to be "live" (used out of the loop). */
4620 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4621 && !STMT_VINFO_LIVE_P (stmt_info))
4622 return false;
4623
4624 /* Make sure it was already recognized as a reduction computation. */
4625 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4626 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4627 return false;
4628
4629 /* 2. Has this been recognized as a reduction pattern?
4630
4631 Check if STMT represents a pattern that has been recognized
4632 in earlier analysis stages. For stmts that represent a pattern,
4633 the STMT_VINFO_RELATED_STMT field records the last stmt in
4634 the original sequence that constitutes the pattern. */
4635
4636 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4637 if (orig_stmt)
4638 {
4639 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4640 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4641 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4642 }
4643
4644 /* 3. Check the operands of the operation. The first operands are defined
4645 inside the loop body. The last operand is the reduction variable,
4646 which is defined by the loop-header-phi. */
4647
4648 gcc_assert (is_gimple_assign (stmt));
4649
4650 /* Flatten RHS. */
4651 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4652 {
4653 case GIMPLE_SINGLE_RHS:
4654 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4655 if (op_type == ternary_op)
4656 {
4657 tree rhs = gimple_assign_rhs1 (stmt);
4658 ops[0] = TREE_OPERAND (rhs, 0);
4659 ops[1] = TREE_OPERAND (rhs, 1);
4660 ops[2] = TREE_OPERAND (rhs, 2);
4661 code = TREE_CODE (rhs);
4662 }
4663 else
4664 return false;
4665 break;
4666
4667 case GIMPLE_BINARY_RHS:
4668 code = gimple_assign_rhs_code (stmt);
4669 op_type = TREE_CODE_LENGTH (code);
4670 gcc_assert (op_type == binary_op);
4671 ops[0] = gimple_assign_rhs1 (stmt);
4672 ops[1] = gimple_assign_rhs2 (stmt);
4673 break;
4674
4675 case GIMPLE_TERNARY_RHS:
4676 code = gimple_assign_rhs_code (stmt);
4677 op_type = TREE_CODE_LENGTH (code);
4678 gcc_assert (op_type == ternary_op);
4679 ops[0] = gimple_assign_rhs1 (stmt);
4680 ops[1] = gimple_assign_rhs2 (stmt);
4681 ops[2] = gimple_assign_rhs3 (stmt);
4682 break;
4683
4684 case GIMPLE_UNARY_RHS:
4685 return false;
4686
4687 default:
4688 gcc_unreachable ();
4689 }
4690
4691 if (code == COND_EXPR && slp_node)
4692 return false;
4693
4694 scalar_dest = gimple_assign_lhs (stmt);
4695 scalar_type = TREE_TYPE (scalar_dest);
4696 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4697 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4698 return false;
4699
4700 /* Do not try to vectorize bit-precision reductions. */
4701 if ((TYPE_PRECISION (scalar_type)
4702 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4703 return false;
4704
4705 /* All uses but the last are expected to be defined in the loop.
4706 The last use is the reduction variable. In case of nested cycle this
4707 assumption is not true: we use reduc_index to record the index of the
4708 reduction variable. */
4709 for (i = 0; i < op_type - 1; i++)
4710 {
4711 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4712 if (i == 0 && code == COND_EXPR)
4713 continue;
4714
4715 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4716 &def_stmt, &def, &dt, &tem);
4717 if (!vectype_in)
4718 vectype_in = tem;
4719 gcc_assert (is_simple_use);
4720
4721 if (dt != vect_internal_def
4722 && dt != vect_external_def
4723 && dt != vect_constant_def
4724 && dt != vect_induction_def
4725 && !(dt == vect_nested_cycle && nested_cycle))
4726 return false;
4727
4728 if (dt == vect_nested_cycle)
4729 {
4730 found_nested_cycle_def = true;
4731 reduc_def_stmt = def_stmt;
4732 reduc_index = i;
4733 }
4734 }
4735
4736 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4737 &def_stmt, &def, &dt, &tem);
4738 if (!vectype_in)
4739 vectype_in = tem;
4740 gcc_assert (is_simple_use);
4741 if (!(dt == vect_reduction_def
4742 || dt == vect_nested_cycle
4743 || ((dt == vect_internal_def || dt == vect_external_def
4744 || dt == vect_constant_def || dt == vect_induction_def)
4745 && nested_cycle && found_nested_cycle_def)))
4746 {
4747 /* For pattern recognized stmts, orig_stmt might be a reduction,
4748 but some helper statements for the pattern might not, or
4749 might be COND_EXPRs with reduction uses in the condition. */
4750 gcc_assert (orig_stmt);
4751 return false;
4752 }
4753 if (!found_nested_cycle_def)
4754 reduc_def_stmt = def_stmt;
4755
4756 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4757 if (orig_stmt)
4758 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4759 reduc_def_stmt,
4760 !nested_cycle,
4761 &dummy));
4762 else
4763 {
4764 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4765 !nested_cycle, &dummy);
4766 /* We changed STMT to be the first stmt in reduction chain, hence we
4767 check that in this case the first element in the chain is STMT. */
4768 gcc_assert (stmt == tmp
4769 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4770 }
4771
4772 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4773 return false;
4774
4775 if (slp_node || PURE_SLP_STMT (stmt_info))
4776 ncopies = 1;
4777 else
4778 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4779 / TYPE_VECTOR_SUBPARTS (vectype_in));
4780
4781 gcc_assert (ncopies >= 1);
4782
4783 vec_mode = TYPE_MODE (vectype_in);
4784
4785 if (code == COND_EXPR)
4786 {
4787 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4788 {
4789 if (dump_enabled_p ())
4790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4791 "unsupported condition in reduction");
4792
4793 return false;
4794 }
4795 }
4796 else
4797 {
4798 /* 4. Supportable by target? */
4799
4800 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4801 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4802 {
4803 /* Shifts and rotates are only supported by vectorizable_shifts,
4804 not vectorizable_reduction. */
4805 if (dump_enabled_p ())
4806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4807 "unsupported shift or rotation.");
4808 return false;
4809 }
4810
4811 /* 4.1. check support for the operation in the loop */
4812 optab = optab_for_tree_code (code, vectype_in, optab_default);
4813 if (!optab)
4814 {
4815 if (dump_enabled_p ())
4816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4817 "no optab.");
4818
4819 return false;
4820 }
4821
4822 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4823 {
4824 if (dump_enabled_p ())
4825 dump_printf (MSG_NOTE, "op not supported by target.");
4826
4827 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4828 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4829 < vect_min_worthwhile_factor (code))
4830 return false;
4831
4832 if (dump_enabled_p ())
4833 dump_printf (MSG_NOTE, "proceeding using word mode.");
4834 }
4835
4836 /* Worthwhile without SIMD support? */
4837 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4838 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4839 < vect_min_worthwhile_factor (code))
4840 {
4841 if (dump_enabled_p ())
4842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4843 "not worthwhile without SIMD support.");
4844
4845 return false;
4846 }
4847 }
4848
4849 /* 4.2. Check support for the epilog operation.
4850
4851 If STMT represents a reduction pattern, then the type of the
4852 reduction variable may be different than the type of the rest
4853 of the arguments. For example, consider the case of accumulation
4854 of shorts into an int accumulator. The original code:
4855 S1: int_a = (int) short_a;
4856 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4857
4858 was replaced with:
4859 STMT: int_acc = widen_sum <short_a, int_acc>
4860
4861 This means that:
4862 1. The tree-code that is used to create the vector operation in the
4863 epilog code (that reduces the partial results) is not the
4864 tree-code of STMT, but is rather the tree-code of the original
4865 stmt from the pattern that STMT is replacing. I.e, in the example
4866 above we want to use 'widen_sum' in the loop, but 'plus' in the
4867 epilog.
4868 2. The type (mode) we use to check available target support
4869 for the vector operation to be created in the *epilog*, is
4870 determined by the type of the reduction variable (in the example
4871 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4872 However the type (mode) we use to check available target support
4873 for the vector operation to be created *inside the loop*, is
4874 determined by the type of the other arguments to STMT (in the
4875 example we'd check this: optab_handler (widen_sum_optab,
4876 vect_short_mode)).
4877
4878 This is contrary to "regular" reductions, in which the types of all
4879 the arguments are the same as the type of the reduction variable.
4880 For "regular" reductions we can therefore use the same vector type
4881 (and also the same tree-code) when generating the epilog code and
4882 when generating the code inside the loop. */
4883
4884 if (orig_stmt)
4885 {
4886 /* This is a reduction pattern: get the vectype from the type of the
4887 reduction variable, and get the tree-code from orig_stmt. */
4888 orig_code = gimple_assign_rhs_code (orig_stmt);
4889 gcc_assert (vectype_out);
4890 vec_mode = TYPE_MODE (vectype_out);
4891 }
4892 else
4893 {
4894 /* Regular reduction: the same vectype and tree-code that are used for
4895 the vector code inside the loop can also be used for the epilog code. */
4896 orig_code = code;
4897 }
4898
4899 if (nested_cycle)
4900 {
4901 def_bb = gimple_bb (reduc_def_stmt);
4902 def_stmt_loop = def_bb->loop_father;
4903 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4904 loop_preheader_edge (def_stmt_loop));
4905 if (TREE_CODE (def_arg) == SSA_NAME
4906 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4907 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4908 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4909 && vinfo_for_stmt (def_arg_stmt)
4910 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4911 == vect_double_reduction_def)
4912 double_reduc = true;
4913 }
4914
4915 epilog_reduc_code = ERROR_MARK;
4916 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4917 {
4918 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4919 optab_default);
4920 if (!reduc_optab)
4921 {
4922 if (dump_enabled_p ())
4923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4924 "no optab for reduction.");
4925
4926 epilog_reduc_code = ERROR_MARK;
4927 }
4928
4929 if (reduc_optab
4930 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4931 {
4932 if (dump_enabled_p ())
4933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4934 "reduc op not supported by target.");
4935
4936 epilog_reduc_code = ERROR_MARK;
4937 }
4938 }
4939 else
4940 {
4941 if (!nested_cycle || double_reduc)
4942 {
4943 if (dump_enabled_p ())
4944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4945 "no reduc code for scalar code.");
4946
4947 return false;
4948 }
4949 }
4950
4951 if (double_reduc && ncopies > 1)
4952 {
4953 if (dump_enabled_p ())
4954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4955 "multiple types in double reduction");
4956
4957 return false;
4958 }
4959
4960 /* In case of widening multiplication by a constant, we update the type
4961 of the constant to be the type of the other operand. We check that the
4962 constant fits the type in the pattern recognition pass. */
4963 if (code == DOT_PROD_EXPR
4964 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4965 {
4966 if (TREE_CODE (ops[0]) == INTEGER_CST)
4967 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4968 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4969 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4970 else
4971 {
4972 if (dump_enabled_p ())
4973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4974 "invalid types in dot-prod");
4975
4976 return false;
4977 }
4978 }
4979
4980 if (!vec_stmt) /* transformation not required. */
4981 {
4982 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4983 return false;
4984 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4985 return true;
4986 }
4987
4988 /** Transform. **/
4989
4990 if (dump_enabled_p ())
4991 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4992
4993 /* FORNOW: Multiple types are not supported for condition. */
4994 if (code == COND_EXPR)
4995 gcc_assert (ncopies == 1);
4996
4997 /* Create the destination vector */
4998 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4999
5000 /* In case the vectorization factor (VF) is bigger than the number
5001 of elements that we can fit in a vectype (nunits), we have to generate
5002 more than one vector stmt - i.e - we need to "unroll" the
5003 vector stmt by a factor VF/nunits. For more details see documentation
5004 in vectorizable_operation. */
5005
5006 /* If the reduction is used in an outer loop we need to generate
5007 VF intermediate results, like so (e.g. for ncopies=2):
5008 r0 = phi (init, r0)
5009 r1 = phi (init, r1)
5010 r0 = x0 + r0;
5011 r1 = x1 + r1;
5012 (i.e. we generate VF results in 2 registers).
5013 In this case we have a separate def-use cycle for each copy, and therefore
5014 for each copy we get the vector def for the reduction variable from the
5015 respective phi node created for this copy.
5016
5017 Otherwise (the reduction is unused in the loop nest), we can combine
5018 together intermediate results, like so (e.g. for ncopies=2):
5019 r = phi (init, r)
5020 r = x0 + r;
5021 r = x1 + r;
5022 (i.e. we generate VF/2 results in a single register).
5023 In this case for each copy we get the vector def for the reduction variable
5024 from the vectorized reduction operation generated in the previous iteration.
5025 */
5026
5027 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5028 {
5029 single_defuse_cycle = true;
5030 epilog_copies = 1;
5031 }
5032 else
5033 epilog_copies = ncopies;
5034
5035 prev_stmt_info = NULL;
5036 prev_phi_info = NULL;
5037 if (slp_node)
5038 {
5039 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5040 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5041 == TYPE_VECTOR_SUBPARTS (vectype_in));
5042 }
5043 else
5044 {
5045 vec_num = 1;
5046 vec_oprnds0.create (1);
5047 if (op_type == ternary_op)
5048 vec_oprnds1.create (1);
5049 }
5050
5051 phis.create (vec_num);
5052 vect_defs.create (vec_num);
5053 if (!slp_node)
5054 vect_defs.quick_push (NULL_TREE);
5055
5056 for (j = 0; j < ncopies; j++)
5057 {
5058 if (j == 0 || !single_defuse_cycle)
5059 {
5060 for (i = 0; i < vec_num; i++)
5061 {
5062 /* Create the reduction-phi that defines the reduction
5063 operand. */
5064 new_phi = create_phi_node (vec_dest, loop->header);
5065 set_vinfo_for_stmt (new_phi,
5066 new_stmt_vec_info (new_phi, loop_vinfo,
5067 NULL));
5068 if (j == 0 || slp_node)
5069 phis.quick_push (new_phi);
5070 }
5071 }
5072
5073 if (code == COND_EXPR)
5074 {
5075 gcc_assert (!slp_node);
5076 vectorizable_condition (stmt, gsi, vec_stmt,
5077 PHI_RESULT (phis[0]),
5078 reduc_index, NULL);
5079 /* Multiple types are not supported for condition. */
5080 break;
5081 }
5082
5083 /* Handle uses. */
5084 if (j == 0)
5085 {
5086 op0 = ops[!reduc_index];
5087 if (op_type == ternary_op)
5088 {
5089 if (reduc_index == 0)
5090 op1 = ops[2];
5091 else
5092 op1 = ops[1];
5093 }
5094
5095 if (slp_node)
5096 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5097 slp_node, -1);
5098 else
5099 {
5100 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5101 stmt, NULL);
5102 vec_oprnds0.quick_push (loop_vec_def0);
5103 if (op_type == ternary_op)
5104 {
5105 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5106 NULL);
5107 vec_oprnds1.quick_push (loop_vec_def1);
5108 }
5109 }
5110 }
5111 else
5112 {
5113 if (!slp_node)
5114 {
5115 enum vect_def_type dt;
5116 gimple dummy_stmt;
5117 tree dummy;
5118
5119 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5120 &dummy_stmt, &dummy, &dt);
5121 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5122 loop_vec_def0);
5123 vec_oprnds0[0] = loop_vec_def0;
5124 if (op_type == ternary_op)
5125 {
5126 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5127 &dummy, &dt);
5128 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5129 loop_vec_def1);
5130 vec_oprnds1[0] = loop_vec_def1;
5131 }
5132 }
5133
5134 if (single_defuse_cycle)
5135 reduc_def = gimple_assign_lhs (new_stmt);
5136
5137 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5138 }
5139
5140 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5141 {
5142 if (slp_node)
5143 reduc_def = PHI_RESULT (phis[i]);
5144 else
5145 {
5146 if (!single_defuse_cycle || j == 0)
5147 reduc_def = PHI_RESULT (new_phi);
5148 }
5149
5150 def1 = ((op_type == ternary_op)
5151 ? vec_oprnds1[i] : NULL);
5152 if (op_type == binary_op)
5153 {
5154 if (reduc_index == 0)
5155 expr = build2 (code, vectype_out, reduc_def, def0);
5156 else
5157 expr = build2 (code, vectype_out, def0, reduc_def);
5158 }
5159 else
5160 {
5161 if (reduc_index == 0)
5162 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5163 else
5164 {
5165 if (reduc_index == 1)
5166 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5167 else
5168 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5169 }
5170 }
5171
5172 new_stmt = gimple_build_assign (vec_dest, expr);
5173 new_temp = make_ssa_name (vec_dest, new_stmt);
5174 gimple_assign_set_lhs (new_stmt, new_temp);
5175 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5176
5177 if (slp_node)
5178 {
5179 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5180 vect_defs.quick_push (new_temp);
5181 }
5182 else
5183 vect_defs[0] = new_temp;
5184 }
5185
5186 if (slp_node)
5187 continue;
5188
5189 if (j == 0)
5190 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5191 else
5192 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5193
5194 prev_stmt_info = vinfo_for_stmt (new_stmt);
5195 prev_phi_info = vinfo_for_stmt (new_phi);
5196 }
5197
5198 /* Finalize the reduction-phi (set its arguments) and create the
5199 epilog reduction code. */
5200 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5201 {
5202 new_temp = gimple_assign_lhs (*vec_stmt);
5203 vect_defs[0] = new_temp;
5204 }
5205
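  /* For a plain sum reduction, for example, the epilog code created here
     combines the VF partial sums held in the lanes of the final vector
     def into a single scalar (via a reduction operation or an
     extract/shift sequence) and wires that scalar to the uses of the
     original reduction result outside the loop.  */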
5206 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5207 epilog_reduc_code, phis, reduc_index,
5208 double_reduc, slp_node);
5209
5210 phis.release ();
5211 vect_defs.release ();
5212 vec_oprnds0.release ();
5213 vec_oprnds1.release ();
5214
5215 return true;
5216 }
5217
5218 /* Function vect_min_worthwhile_factor.
5219
5220 For a loop where we could vectorize the operation indicated by CODE,
5221 return the minimum vectorization factor that makes it worthwhile
5222 to use generic vectors. */
5223 int
5224 vect_min_worthwhile_factor (enum tree_code code)
5225 {
5226 switch (code)
5227 {
5228 case PLUS_EXPR:
5229 case MINUS_EXPR:
5230 case NEGATE_EXPR:
5231 return 4;
5232
5233 case BIT_AND_EXPR:
5234 case BIT_IOR_EXPR:
5235 case BIT_XOR_EXPR:
5236 case BIT_NOT_EXPR:
5237 return 2;
5238
5239 default:
5240 return INT_MAX;
5241 }
5242 }
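
/* For example, with the factors above, a caller considering generic
   (emulated) vector lowering might guard the transformation roughly as

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && vf < vect_min_worthwhile_factor (code))
       return false;

   so a two-element addition (vf == 2 < 4) would not be considered
   worthwhile.  This is only an illustrative sketch of how the returned
   threshold is meant to be consumed, not a copy of an actual call site.  */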
5243
5244
5245 /* Function vectorizable_induction
5246
5247 Check if PHI performs an induction computation that can be vectorized.
5248 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5249 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5250 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
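
/* For instance (illustrative only), with VF == 4 a scalar induction

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 1;

   is vectorized by creating a vector PHI whose initial value is the
   vector { 0, 1, 2, 3 } and whose latch value adds the step vector
   { 4, 4, 4, 4 } on every vector iteration; the details live in
   get_initial_def_for_induction.  */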
5251
5252 bool
5253 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5254 gimple *vec_stmt)
5255 {
5256 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5260 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5261 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5262 tree vec_def;
5263
5264 gcc_assert (ncopies >= 1);
5265 /* FORNOW. These restrictions should be relaxed. */
5266 if (nested_in_vect_loop_p (loop, phi))
5267 {
5268 imm_use_iterator imm_iter;
5269 use_operand_p use_p;
5270 gimple exit_phi;
5271 edge latch_e;
5272 tree loop_arg;
5273
5274 if (ncopies > 1)
5275 {
5276 if (dump_enabled_p ())
5277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5278 "multiple types in nested loop.");
5279 return false;
5280 }
5281
5282 exit_phi = NULL;
5283 latch_e = loop_latch_edge (loop->inner);
5284 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5285 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5286 {
5287 if (!flow_bb_inside_loop_p (loop->inner,
5288 gimple_bb (USE_STMT (use_p))))
5289 {
5290 exit_phi = USE_STMT (use_p);
5291 break;
5292 }
5293 }
5294 if (exit_phi)
5295 {
5296 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5297 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5298 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5299 {
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5302 "inner-loop induction only used outside "
5303 "of the outer vectorized loop.");
5304 return false;
5305 }
5306 }
5307 }
5308
5309 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5310 return false;
5311
5312 /* FORNOW: SLP not supported. */
5313 if (STMT_SLP_TYPE (stmt_info))
5314 return false;
5315
5316 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5317
5318 if (gimple_code (phi) != GIMPLE_PHI)
5319 return false;
5320
5321 if (!vec_stmt) /* transformation not required. */
5322 {
5323 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5324 if (dump_enabled_p ())
5325 dump_printf_loc (MSG_NOTE, vect_location,
5326 "=== vectorizable_induction ===");
5327 vect_model_induction_cost (stmt_info, ncopies);
5328 return true;
5329 }
5330
5331 /** Transform. **/
5332
5333 if (dump_enabled_p ())
5334 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5335
5336 vec_def = get_initial_def_for_induction (phi);
5337 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5338 return true;
5339 }
5340
5341 /* Function vectorizable_live_operation.
5342
5343 STMT computes a value that is used outside the loop. Check if
5344 it can be supported. */
5345
5346 bool
5347 vectorizable_live_operation (gimple stmt,
5348 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5349 gimple *vec_stmt ATTRIBUTE_UNUSED)
5350 {
5351 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5352 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5354 int i;
5355 int op_type;
5356 tree op;
5357 tree def;
5358 gimple def_stmt;
5359 enum vect_def_type dt;
5360 enum tree_code code;
5361 enum gimple_rhs_class rhs_class;
5362
5363 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5364
5365 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5366 return false;
5367
5368 if (!is_gimple_assign (stmt))
5369 return false;
5370
5371 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5372 return false;
5373
5374 /* FORNOW. CHECKME. */
5375 if (nested_in_vect_loop_p (loop, stmt))
5376 return false;
5377
5378 code = gimple_assign_rhs_code (stmt);
5379 op_type = TREE_CODE_LENGTH (code);
5380 rhs_class = get_gimple_rhs_class (code);
5381 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5382 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5383
5384 /* FORNOW: support only if all uses are invariant. This means
5385 that the scalar operations can remain in place, unvectorized.
5386 The original last scalar value that they compute will be used. */
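
   /* For example (a sketch), a live stmt such as

        lhs_1 = x_2 + y_3;

      where x_2 and y_3 are defined before the loop (vect_external_def)
      or are constants (vect_constant_def) passes the check below: the
      scalar stmt is simply left in place, and its last computed value is
      what the uses outside the loop see.  */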
5387
5388 for (i = 0; i < op_type; i++)
5389 {
5390 if (rhs_class == GIMPLE_SINGLE_RHS)
5391 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5392 else
5393 op = gimple_op (stmt, i + 1);
5394 if (op
5395 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5396 &dt))
5397 {
5398 if (dump_enabled_p ())
5399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5400 "use not simple.");
5401 return false;
5402 }
5403
5404 if (dt != vect_external_def && dt != vect_constant_def)
5405 return false;
5406 }
5407
5408 /* No transformation is required for the cases we currently support. */
5409 return true;
5410 }
5411
5412 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
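
/* The point is that STMT is about to be transformed (or removed) by the
   vectorizer, so a debug bind outside LOOP that still references its
   scalar SSA result would no longer describe a value the generated code
   computes; resetting the bind marks the variable as optimized away
   instead of leaving wrong debug info behind.  */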
5413
5414 static void
5415 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5416 {
5417 ssa_op_iter op_iter;
5418 imm_use_iterator imm_iter;
5419 def_operand_p def_p;
5420 gimple ustmt;
5421
5422 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5423 {
5424 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5425 {
5426 basic_block bb;
5427
5428 if (!is_gimple_debug (ustmt))
5429 continue;
5430
5431 bb = gimple_bb (ustmt);
5432
5433 if (!flow_bb_inside_loop_p (loop, bb))
5434 {
5435 if (gimple_debug_bind_p (ustmt))
5436 {
5437 if (dump_enabled_p ())
5438 dump_printf_loc (MSG_NOTE, vect_location,
5439 "killing debug use");
5440
5441 gimple_debug_bind_reset_value (ustmt);
5442 update_stmt (ustmt);
5443 }
5444 else
5445 gcc_unreachable ();
5446 }
5447 }
5448 }
5449 }
5450
5451 /* Function vect_transform_loop.
5452
5453 The analysis phase has determined that the loop is vectorizable.
5454 Vectorize the loop - create vectorized stmts to replace the scalar
5455 stmts in the loop, and update the loop exit condition. */
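
/* A rough outline of the steps performed below (for orientation only):
   1) decide whether a runtime profitability check is still required;
   2) peel for alignment and/or apply loop versioning if needed;
   3) create an epilog loop when the iteration count is symbolic or not
      a multiple of the vectorization factor;
   4) walk each basic block and vectorize the relevant PHIs and stmts
      (scheduling SLP instances when the first SLP stmt is reached);
   5) update the iteration count, the profile and the niter bounds.  */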
5456
5457 void
5458 vect_transform_loop (loop_vec_info loop_vinfo)
5459 {
5460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5461 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5462 int nbbs = loop->num_nodes;
5463 gimple_stmt_iterator si;
5464 int i;
5465 tree ratio = NULL;
5466 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5467 bool grouped_store;
5468 bool slp_scheduled = false;
5469 unsigned int nunits;
5470 gimple stmt, pattern_stmt;
5471 gimple_seq pattern_def_seq = NULL;
5472 gimple_stmt_iterator pattern_def_si = gsi_none ();
5473 bool transform_pattern_stmt = false;
5474 bool check_profitability = false;
5475 int th;
5476 /* Record the number of iterations before we start tampering with the profile. */
5477 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5478
5479 if (dump_enabled_p ())
5480 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5481
5482 /* If the profile is imprecise, we have a chance to fix it up. */
5483 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5484 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5485
5486 /* Use the more conservative vectorization threshold. If the number
5487 of iterations is constant, assume the cost check has been performed
5488 by our caller. If the threshold makes all loops profitable that
5489 run at least the vectorization factor number of times, checking
5490 is pointless, too. */
5491 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5492 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5493 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5494 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5495 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5496 {
5497 if (dump_enabled_p ())
5498 dump_printf_loc (MSG_NOTE, vect_location,
5499 "Profitability threshold is %d loop iterations.", th);
5500 check_profitability = true;
5501 }
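
  /* A worked example (illustrative): with --param min-vect-loop-bound=2
     and a vectorization factor of 4, th = 2 * 4 - 1 = 7 (unless the cost
     model's minimum is larger); since 7 >= 4 - 1, a runtime
     profitability check guarded by this threshold is emitted whenever
     the iteration count is not a compile-time constant.  */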
5502
5503 /* Peel the loop if there are data refs with unknown alignment.
5504 Only one data ref with unknown alignment is allowed. */
5505
5506 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5507 {
5508 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5509 check_profitability = false;
5510 }
5511
5512 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5513 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5514 {
5515 vect_loop_versioning (loop_vinfo, th, check_profitability);
5516 check_profitability = false;
5517 }
5518
5519 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5520 compile-time constant), or it is a constant that is not divisible by the
5521 vectorization factor, then an epilog loop needs to be created.
5522 We therefore duplicate the loop: the original loop will be vectorized,
5523 and will compute the first (n/VF) iterations. The second copy of the loop
5524 will remain scalar and will compute the remaining (n%VF) iterations.
5525 (VF is the vectorization factor). */
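
  /* For example, with n == 103 and VF == 8 the vectorized loop executes
     floor (103/8) == 12 iterations (covering 96 scalar iterations) and
     the scalar epilog loop executes the remaining 103 % 8 == 7
     iterations.  */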
5526
5527 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5528 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5529 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5530 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5531 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5532 th, check_profitability);
5533 else
5534 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5535 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5536
5537 /* 1) Make sure the loop header has exactly two entries
5538 2) Make sure we have a preheader basic block. */
5539
5540 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5541
5542 split_edge (loop_preheader_edge (loop));
5543
5544 /* FORNOW: the vectorizer supports only loops whose body consists
5545 of one basic block (header + empty latch). When the vectorizer
5546 supports more involved loop forms, the order in which the BBs are
5547 traversed needs to be reconsidered. */
5548
5549 for (i = 0; i < nbbs; i++)
5550 {
5551 basic_block bb = bbs[i];
5552 stmt_vec_info stmt_info;
5553 gimple phi;
5554
5555 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5556 {
5557 phi = gsi_stmt (si);
5558 if (dump_enabled_p ())
5559 {
5560 dump_printf_loc (MSG_NOTE, vect_location,
5561 "------>vectorizing phi: ");
5562 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5563 }
5564 stmt_info = vinfo_for_stmt (phi);
5565 if (!stmt_info)
5566 continue;
5567
5568 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5569 vect_loop_kill_debug_uses (loop, phi);
5570
5571 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5572 && !STMT_VINFO_LIVE_P (stmt_info))
5573 continue;
5574
5575 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5576 != (unsigned HOST_WIDE_INT) vectorization_factor)
5577 && dump_enabled_p ())
5578 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5579
5580 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5581 {
5582 if (dump_enabled_p ())
5583 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5584 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5585 }
5586 }
5587
5588 pattern_stmt = NULL;
5589 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5590 {
5591 bool is_store;
5592
5593 if (transform_pattern_stmt)
5594 stmt = pattern_stmt;
5595 else
5596 stmt = gsi_stmt (si);
5597
5598 if (dump_enabled_p ())
5599 {
5600 dump_printf_loc (MSG_NOTE, vect_location,
5601 "------>vectorizing statement: ");
5602 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5603 }
5604
5605 stmt_info = vinfo_for_stmt (stmt);
5606
5607 /* vector stmts created in the outer-loop during vectorization of
5608 stmts in an inner-loop may not have a stmt_info, and do not
5609 need to be vectorized. */
5610 if (!stmt_info)
5611 {
5612 gsi_next (&si);
5613 continue;
5614 }
5615
5616 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5617 vect_loop_kill_debug_uses (loop, stmt);
5618
5619 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5620 && !STMT_VINFO_LIVE_P (stmt_info))
5621 {
5622 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5623 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5624 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5625 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5626 {
5627 stmt = pattern_stmt;
5628 stmt_info = vinfo_for_stmt (stmt);
5629 }
5630 else
5631 {
5632 gsi_next (&si);
5633 continue;
5634 }
5635 }
5636 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5637 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5638 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5639 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5640 transform_pattern_stmt = true;
5641
5642 /* If pattern statement has def stmts, vectorize them too. */
5643 if (is_pattern_stmt_p (stmt_info))
5644 {
5645 if (pattern_def_seq == NULL)
5646 {
5647 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5648 pattern_def_si = gsi_start (pattern_def_seq);
5649 }
5650 else if (!gsi_end_p (pattern_def_si))
5651 gsi_next (&pattern_def_si);
5652 if (pattern_def_seq != NULL)
5653 {
5654 gimple pattern_def_stmt = NULL;
5655 stmt_vec_info pattern_def_stmt_info = NULL;
5656
5657 while (!gsi_end_p (pattern_def_si))
5658 {
5659 pattern_def_stmt = gsi_stmt (pattern_def_si);
5660 pattern_def_stmt_info
5661 = vinfo_for_stmt (pattern_def_stmt);
5662 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5663 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5664 break;
5665 gsi_next (&pattern_def_si);
5666 }
5667
5668 if (!gsi_end_p (pattern_def_si))
5669 {
5670 if (dump_enabled_p ())
5671 {
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 "==> vectorizing pattern def "
5674 "stmt: ");
5675 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5676 pattern_def_stmt, 0);
5677 }
5678
5679 stmt = pattern_def_stmt;
5680 stmt_info = pattern_def_stmt_info;
5681 }
5682 else
5683 {
5684 pattern_def_si = gsi_none ();
5685 transform_pattern_stmt = false;
5686 }
5687 }
5688 else
5689 transform_pattern_stmt = false;
5690 }
5691
5692 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5693 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5694 STMT_VINFO_VECTYPE (stmt_info));
5695 if (!STMT_SLP_TYPE (stmt_info)
5696 && nunits != (unsigned int) vectorization_factor
5697 && dump_enabled_p ())
5698 /* For SLP, VF is set according to the unrolling factor, and not to
5699 the vector size, hence for SLP this print is not valid. */
5700 dump_printf_loc (MSG_NOTE, vect_location,
5701 "multiple-types.");
5702
5703 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5704 reached. */
5705 if (STMT_SLP_TYPE (stmt_info))
5706 {
5707 if (!slp_scheduled)
5708 {
5709 slp_scheduled = true;
5710
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_NOTE, vect_location,
5713 "=== scheduling SLP instances ===");
5714
5715 vect_schedule_slp (loop_vinfo, NULL);
5716 }
5717
5718 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5719 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5720 {
5721 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5722 {
5723 pattern_def_seq = NULL;
5724 gsi_next (&si);
5725 }
5726 continue;
5727 }
5728 }
5729
5730 /* -------- vectorize statement ------------ */
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5733
5734 grouped_store = false;
5735 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5736 if (is_store)
5737 {
5738 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5739 {
5740 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5741 interleaving chain was completed - free all the stores in
5742 the chain. */
5743 gsi_next (&si);
5744 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5745 continue;
5746 }
5747 else
5748 {
5749 /* Free the attached stmt_vec_info and remove the stmt. */
5750 gimple store = gsi_stmt (si);
5751 free_stmt_vec_info (store);
5752 unlink_stmt_vdef (store);
5753 gsi_remove (&si, true);
5754 release_defs (store);
5755 continue;
5756 }
5757 }
5758
5759 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5760 {
5761 pattern_def_seq = NULL;
5762 gsi_next (&si);
5763 }
5764 } /* stmts in BB */
5765 } /* BBs in loop */
5766
5767 slpeel_make_loop_iterate_ntimes (loop, ratio);
5768
5769 /* Reduce loop iterations by the vectorization factor. */
5770 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5771 expected_iterations / vectorization_factor);
5772 loop->nb_iterations_upper_bound
5773 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5774 FLOOR_DIV_EXPR);
5775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5776 && loop->nb_iterations_upper_bound != double_int_zero)
5777 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5778 if (loop->any_estimate)
5779 {
5780 loop->nb_iterations_estimate
5781 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5782 FLOOR_DIV_EXPR);
5783 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5784 && loop->nb_iterations_estimate != double_int_zero)
5785 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5786 }
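
  /* E.g. if the recorded upper bound was 103 scalar iterations and the
     vectorization factor is 8, the vector loop's bound becomes
     floor (103/8) == 12, minus one more when peeling for gaps forces the
     last vector iteration into the scalar epilog.  */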
5787
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5790 if (loop->inner && dump_enabled_p ())
5791 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5792 "OUTER LOOP VECTORIZED.");
5793 }