1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
43
44 /* Loop Vectorization Pass.
45
46 This pass tries to vectorize loops.
47
48 For example, the vectorizer transforms the following simple loop:
49
50 short a[N]; short b[N]; short c[N]; int i;
51
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
54 }
55
56 as if it were manually vectorized by rewriting the source code into:
57
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
62
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
68 }
69
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
78 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
81
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
87
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
92
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
103
104 For example, say stmt S1 was vectorized into stmt VS1:
105
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
109
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
114
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
119
120 Operands that are not SSA_NAMEs are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
122
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127 Targets that can support different sizes of vectors will, for now, need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
130
131 Since we only vectorize operations whose vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
137
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
140 */
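
/* Illustrative sketch only, not used by the pass: the target-support check
   described above amounts to querying the optab for a given vector mode and
   comparing against CODE_FOR_nothing.  The helper name is hypothetical.  */

static inline bool
example_target_supports_v8hi_add (void)
{
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}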
141
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
188
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
192
193 for (i = 0; i < nbbs; i++)
194 {
195 basic_block bb = bbs[i];
196
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
198 {
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
202 {
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
205 }
206
207 gcc_assert (stmt_info);
208
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
210 {
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
213
214 if (dump_enabled_p ())
215 {
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
219 }
220
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
223 {
224 if (dump_enabled_p ())
225 {
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
231 }
232 return false;
233 }
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
235
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
240 }
241
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
245
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
249 }
250 }
251
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
253 {
254 tree vf_vectype;
255
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
260
261 stmt_info = vinfo_for_stmt (stmt);
262
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
268 }
269
270 gcc_assert (stmt_info);
271
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
275 {
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
277 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
280 {
281 stmt = pattern_stmt;
282 stmt_info = vinfo_for_stmt (pattern_stmt);
283 if (dump_enabled_p ())
284 {
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 }
289 }
290 else
291 {
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
294 gsi_next (&si);
295 continue;
296 }
297 }
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 analyze_pattern_stmt = true;
303
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info))
306 {
307 if (pattern_def_seq == NULL)
308 {
309 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
310 pattern_def_si = gsi_start (pattern_def_seq);
311 }
312 else if (!gsi_end_p (pattern_def_si))
313 gsi_next (&pattern_def_si);
314 if (pattern_def_seq != NULL)
315 {
316 gimple pattern_def_stmt = NULL;
317 stmt_vec_info pattern_def_stmt_info = NULL;
318
319 while (!gsi_end_p (pattern_def_si))
320 {
321 pattern_def_stmt = gsi_stmt (pattern_def_si);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
326 break;
327 gsi_next (&pattern_def_si);
328 }
329
330 if (!gsi_end_p (pattern_def_si))
331 {
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_NOTE, vect_location,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
337 pattern_def_stmt, 0);
338 }
339
340 stmt = pattern_def_stmt;
341 stmt_info = pattern_def_stmt_info;
342 }
343 else
344 {
345 pattern_def_si = gsi_none ();
346 analyze_pattern_stmt = false;
347 }
348 }
349 else
350 analyze_pattern_stmt = false;
351 }
352
353 if (gimple_get_lhs (stmt) == NULL_TREE)
354 {
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
360 0);
361 }
362 return false;
363 }
364
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
366 {
367 if (dump_enabled_p ())
368 {
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
372 }
373 return false;
374 }
375
376 if (STMT_VINFO_VECTYPE (stmt_info))
377 {
378 /* The only case in which a vectype had already been set is for stmts
379 that contain a dataref, or for "pattern-stmts" (stmts
380 generated by the vectorizer to represent/replace a certain
381 idiom). */
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
383 || is_pattern_stmt_p (stmt_info)
384 || !gsi_end_p (pattern_def_si));
385 vectype = STMT_VINFO_VECTYPE (stmt_info);
386 }
387 else
388 {
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
390 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
391 if (dump_enabled_p ())
392 {
393 dump_printf_loc (MSG_NOTE, vect_location,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
396 }
397 vectype = get_vectype_for_scalar_type (scalar_type);
398 if (!vectype)
399 {
400 if (dump_enabled_p ())
401 {
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "not vectorized: unsupported "
404 "data-type ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 scalar_type);
407 }
408 return false;
409 }
410
411 STMT_VINFO_VECTYPE (stmt_info) = vectype;
412
413 if (dump_enabled_p ())
414 {
415 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
416 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
417 }
418 }
419
420 /* The vectorization factor is determined by the smallest
421 scalar type (or the largest vector size, but we only
422 support one vector size per loop). */
423 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
424 &dummy);
425 if (dump_enabled_p ())
426 {
427 dump_printf_loc (MSG_NOTE, vect_location,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
430 }
431 vf_vectype = get_vectype_for_scalar_type (scalar_type);
432 if (!vf_vectype)
433 {
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
439 scalar_type);
440 }
441 return false;
442 }
443
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
446 {
447 if (dump_enabled_p ())
448 {
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
453 vectype);
454 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
456 vf_vectype);
457 }
458 return false;
459 }
460
461 if (dump_enabled_p ())
462 {
463 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
465 }
466
467 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
470 if (!vectorization_factor
471 || (nunits > vectorization_factor))
472 vectorization_factor = nunits;
473
474 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
475 {
476 pattern_def_seq = NULL;
477 gsi_next (&si);
478 }
479 }
480 }
481
482 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
485 vectorization_factor);
486 if (vectorization_factor <= 1)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "not vectorized: unsupported data-type");
491 return false;
492 }
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
494
495 return true;
496 }
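
/* Illustrative sketch only (hypothetical helper, never called): the
   arithmetic behind the vectorization factor computed above.  With a
   16-byte vector and 4-byte elements this yields 4.  Assumes the element
   size divides the vector size.  */

static inline unsigned int
example_vectorization_factor (unsigned int vector_size_in_bytes,
			      unsigned int element_size_in_bytes)
{
  return vector_size_in_bytes / element_size_in_bytes;
}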
497
498
499 /* Function vect_is_simple_iv_evolution.
500
501 FORNOW: A simple evolution of an induction variable in the loop is
502 considered a polynomial evolution with constant step. */
503
504 static bool
505 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
506 tree * step)
507 {
508 tree init_expr;
509 tree step_expr;
510 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
511
512 /* When there is no evolution in this loop, the evolution function
513 is not "simple". */
514 if (evolution_part == NULL_TREE)
515 return false;
516
517 /* When the evolution is a polynomial of degree >= 2
518 the evolution function is not "simple". */
519 if (tree_is_chrec (evolution_part))
520 return false;
521
522 step_expr = evolution_part;
523 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
524
525 if (dump_enabled_p ())
526 {
527 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
528 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
529 dump_printf (MSG_NOTE, ", init: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
531 }
532
533 *init = init_expr;
534 *step = step_expr;
535
536 if (TREE_CODE (step_expr) != INTEGER_CST)
537 {
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "step unknown.");
541 return false;
542 }
543
544 return true;
545 }
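
/* Illustrative sketch only (hypothetical function, never called): a loop
   whose induction variable has the kind of "simple" evolution accepted
   above.  The access function of `i' is the chrec {0, +, 1}, so INIT is 0
   and STEP is the integer constant 1.  */

static void
example_simple_iv_loop (int *a, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = i;
}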
546
547 /* Function vect_analyze_scalar_cycles_1.
548
549 Examine the cross iteration def-use cycles of scalar variables
550 in LOOP. LOOP_VINFO represents the loop that is now being
551 considered for vectorization (can be LOOP, or an outer-loop
552 enclosing LOOP). */
553
554 static void
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
556 {
557 basic_block bb = loop->header;
558 tree dumy;
559 vec<gimple> worklist;
560 worklist.create (64);
561 gimple_stmt_iterator gsi;
562 bool double_reduc;
563
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "=== vect_analyze_scalar_cycles ===");
567
568 /* First - identify all inductions. Reduction detection assumes that all the
569 inductions have been identified, therefore, this order must not be
570 changed. */
571 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
572 {
573 gimple phi = gsi_stmt (gsi);
574 tree access_fn = NULL;
575 tree def = PHI_RESULT (phi);
576 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
577
578 if (dump_enabled_p ())
579 {
580 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
582 }
583
584 /* Skip virtual phis. The data dependences that are associated with
585 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
586 if (virtual_operand_p (def))
587 continue;
588
589 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
590
591 /* Analyze the evolution function. */
592 access_fn = analyze_scalar_evolution (loop, def);
593 if (access_fn)
594 {
595 STRIP_NOPS (access_fn);
596 if (dump_enabled_p ())
597 {
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
601 }
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
603 = evolution_part_in_loop_num (access_fn, loop->num);
604 }
605
606 if (!access_fn
607 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
608 {
609 worklist.safe_push (phi);
610 continue;
611 }
612
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
614
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
618 }
619
620
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist.length () > 0)
623 {
624 gimple phi = worklist.pop ();
625 tree def = PHI_RESULT (phi);
626 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
627 gimple reduc_stmt;
628 bool nested_cycle;
629
630 if (dump_enabled_p ())
631 {
632 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
634 }
635
636 gcc_assert (!virtual_operand_p (def)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
638
639 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
640 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
641 &double_reduc);
642 if (reduc_stmt)
643 {
644 if (double_reduc)
645 {
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "Detected double reduction.");
649
650 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
652 vect_double_reduction_def;
653 }
654 else
655 {
656 if (nested_cycle)
657 {
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected vectorizable nested cycle.");
661
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_nested_cycle;
665 }
666 else
667 {
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE, vect_location,
670 "Detected reduction.");
671
672 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
674 vect_reduction_def;
675 /* Store the reduction cycles for possible vectorization in
676 loop-aware SLP. */
677 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
678 }
679 }
680 }
681 else
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
684 "Unknown def-use cycle pattern.");
685 }
686
687 worklist.release ();
688 }
689
690
691 /* Function vect_analyze_scalar_cycles.
692
693 Examine the cross iteration def-use cycles of scalar variables, by
694 analyzing the loop-header PHIs of scalar variables. Classify each
695 cycle as one of the following: invariant, induction, reduction, unknown.
696 We do that for the loop represented by LOOP_VINFO, and also for its
697 inner-loop, if it exists.
698 Examples for scalar cycles:
699
700 Example1: reduction:
701
702 loop1:
703 for (i=0; i<N; i++)
704 sum += a[i];
705
706 Example2: induction:
707
708 loop2:
709 for (i=0; i<N; i++)
710 a[i] = i; */
711
712 static void
713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
714 {
715 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
716
717 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
718
719 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
720 Reductions in such an inner-loop therefore have different properties than
721 the reductions in the nest that gets vectorized:
722 1. When vectorized, they are executed in the same order as in the original
723 scalar loop, so we can't change the order of computation when
724 vectorizing them.
725 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
726 current checks are too strict. */
727
728 if (loop->inner)
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
730 }
731
732 /* Function vect_get_loop_niters.
733
734 Determine how many iterations the loop executes.
735 If an expression that represents the number of iterations
736 can be constructed, place it in NUMBER_OF_ITERATIONS.
737 Return the loop exit condition. */
738
739 static gimple
740 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
741 {
742 tree niters;
743
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "=== get_loop_niters ===");
747 niters = number_of_exit_cond_executions (loop);
748
749 if (niters != NULL_TREE
750 && niters != chrec_dont_know)
751 {
752 *number_of_iterations = niters;
753
754 if (dump_enabled_p ())
755 {
756 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
758 }
759 }
760
761 return get_loop_exit_condition (loop);
762 }
763
764
765 /* Function bb_in_loop_p
766
767 Used as predicate for dfs order traversal of the loop bbs. */
768
769 static bool
770 bb_in_loop_p (const_basic_block bb, const void *data)
771 {
772 const struct loop *const loop = (const struct loop *)data;
773 if (flow_bb_inside_loop_p (loop, bb))
774 return true;
775 return false;
776 }
777
778
779 /* Function new_loop_vec_info.
780
781 Create and initialize a new loop_vec_info struct for LOOP, as well as
782 stmt_vec_info structs for all the stmts in LOOP. */
783
784 static loop_vec_info
785 new_loop_vec_info (struct loop *loop)
786 {
787 loop_vec_info res;
788 basic_block *bbs;
789 gimple_stmt_iterator si;
790 unsigned int i, nbbs;
791
792 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
793 LOOP_VINFO_LOOP (res) = loop;
794
795 bbs = get_loop_body (loop);
796
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i = 0; i < loop->num_nodes; i++)
799 {
800 basic_block bb = bbs[i];
801
802 /* BBs in a nested inner-loop will have been already processed (because
803 we will have called vect_analyze_loop_form for any nested inner-loop).
804 Therefore, for stmts in an inner-loop we just want to update the
805 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
806 loop_info of the outer-loop we are currently considering to vectorize
807 (instead of the loop_info of the inner-loop).
808 For stmts in other BBs we need to create a stmt_info from scratch. */
809 if (bb->loop_father != loop)
810 {
811 /* Inner-loop bb. */
812 gcc_assert (loop->inner && bb->loop_father == loop->inner);
813 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
814 {
815 gimple phi = gsi_stmt (si);
816 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
817 loop_vec_info inner_loop_vinfo =
818 STMT_VINFO_LOOP_VINFO (stmt_info);
819 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
820 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
821 }
822 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
823 {
824 gimple stmt = gsi_stmt (si);
825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
826 loop_vec_info inner_loop_vinfo =
827 STMT_VINFO_LOOP_VINFO (stmt_info);
828 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
829 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
830 }
831 }
832 else
833 {
834 /* bb in current nest. */
835 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
836 {
837 gimple phi = gsi_stmt (si);
838 gimple_set_uid (phi, 0);
839 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
840 }
841
842 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
843 {
844 gimple stmt = gsi_stmt (si);
845 gimple_set_uid (stmt, 0);
846 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
847 }
848 }
849 }
850
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
855
856 free (bbs);
857 bbs = XCNEWVEC (basic_block, loop->num_nodes);
858 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
859 bbs, loop->num_nodes, loop);
860 gcc_assert (nbbs == loop->num_nodes);
861
862 LOOP_VINFO_BBS (res) = bbs;
863 LOOP_VINFO_NITERS (res) = NULL;
864 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
868 LOOP_VINFO_VECT_FACTOR (res) = 0;
869 LOOP_VINFO_LOOP_NEST (res).create (3);
870 LOOP_VINFO_DATAREFS (res).create (10);
871 LOOP_VINFO_DDRS (res).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
877 LOOP_VINFO_GROUPED_STORES (res).create (10);
878 LOOP_VINFO_REDUCTIONS (res).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
882 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
883 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
884 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
885
886 return res;
887 }
888
889
890 /* Function destroy_loop_vec_info.
891
892 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
893 stmts in the loop. */
894
895 void
896 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
897 {
898 struct loop *loop;
899 basic_block *bbs;
900 int nbbs;
901 gimple_stmt_iterator si;
902 int j;
903 vec<slp_instance> slp_instances;
904 slp_instance instance;
905 bool swapped;
906
907 if (!loop_vinfo)
908 return;
909
910 loop = LOOP_VINFO_LOOP (loop_vinfo);
911
912 bbs = LOOP_VINFO_BBS (loop_vinfo);
913 nbbs = clean_stmts ? loop->num_nodes : 0;
914 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
915
916 for (j = 0; j < nbbs; j++)
917 {
918 basic_block bb = bbs[j];
919 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
920 free_stmt_vec_info (gsi_stmt (si));
921
922 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
923 {
924 gimple stmt = gsi_stmt (si);
925
926 /* We may have broken canonical form by moving a constant
927 into RHS1 of a commutative op. Fix such occurrences. */
928 if (swapped && is_gimple_assign (stmt))
929 {
930 enum tree_code code = gimple_assign_rhs_code (stmt);
931
932 if ((code == PLUS_EXPR
933 || code == POINTER_PLUS_EXPR
934 || code == MULT_EXPR)
935 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
936 swap_tree_operands (stmt,
937 gimple_assign_rhs1_ptr (stmt),
938 gimple_assign_rhs2_ptr (stmt));
939 }
940
941 /* Free stmt_vec_info. */
942 free_stmt_vec_info (stmt);
943 gsi_next (&si);
944 }
945 }
946
947 free (LOOP_VINFO_BBS (loop_vinfo));
948 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
949 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
950 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
951 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
952 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
953 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
954 FOR_EACH_VEC_ELT (slp_instances, j, instance)
955 vect_free_slp_instance (instance);
956
957 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
958 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
959 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
960 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
961
962 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
963 LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
964
965 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
966
967 free (loop_vinfo);
968 loop->aux = NULL;
969 }
970
971
972 /* Function vect_analyze_loop_1.
973
974 Apply a set of analyses on LOOP, and create a loop_vec_info struct
975 for it. The different analyses will record information in the
976 loop_vec_info struct. This is a subset of the analyses applied in
977 vect_analyze_loop, to be applied on an inner-loop nested in the loop
978 that is now considered for (outer-loop) vectorization. */
979
980 static loop_vec_info
981 vect_analyze_loop_1 (struct loop *loop)
982 {
983 loop_vec_info loop_vinfo;
984
985 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE, vect_location,
987 "===== analyze_loop_nest_1 =====");
988
989 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
990
991 loop_vinfo = vect_analyze_loop_form (loop);
992 if (!loop_vinfo)
993 {
994 if (dump_enabled_p ())
995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
996 "bad inner-loop form.");
997 return NULL;
998 }
999
1000 return loop_vinfo;
1001 }
1002
1003
1004 /* Function vect_analyze_loop_form.
1005
1006 Verify that certain CFG restrictions hold, including:
1007 - the loop has a pre-header
1008 - the loop has a single entry and exit
1009 - the loop exit condition is simple enough, and the number of iterations
1010 can be analyzed (a countable loop). */
1011
1012 loop_vec_info
1013 vect_analyze_loop_form (struct loop *loop)
1014 {
1015 loop_vec_info loop_vinfo;
1016 gimple loop_cond;
1017 tree number_of_iterations = NULL;
1018 loop_vec_info inner_loop_vinfo = NULL;
1019
1020 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_NOTE, vect_location,
1022 "=== vect_analyze_loop_form ===");
1023
1024 /* Different restrictions apply when we are considering an inner-most loop,
1025 vs. an outer (nested) loop.
1026 (FORNOW. May want to relax some of these restrictions in the future). */
1027
1028 if (!loop->inner)
1029 {
1030 /* Inner-most loop. We currently require that the number of BBs is
1031 exactly 2 (the header and latch). Vectorizable inner-most loops
1032 look like this:
1033
1034 (pre-header)
1035 |
1036 header <--------+
1037 | | |
1038 | +--> latch --+
1039 |
1040 (exit-bb) */
1041
1042 if (loop->num_nodes != 2)
1043 {
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "not vectorized: control flow in loop.");
1047 return NULL;
1048 }
1049
1050 if (empty_block_p (loop->header))
1051 {
1052 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "not vectorized: empty loop.");
1055 return NULL;
1056 }
1057 }
1058 else
1059 {
1060 struct loop *innerloop = loop->inner;
1061 edge entryedge;
1062
1063 /* Nested loop. We currently require that the loop is doubly-nested,
1064 contains a single inner loop, and the number of BBs is exactly 5.
1065 Vectorizable outer-loops look like this:
1066
1067 (pre-header)
1068 |
1069 header <---+
1070 | |
1071 inner-loop |
1072 | |
1073 tail ------+
1074 |
1075 (exit-bb)
1076
1077 The inner-loop has the properties expected of inner-most loops
1078 as described above. */
1079
1080 if ((loop->inner)->inner || (loop->inner)->next)
1081 {
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "not vectorized: multiple nested loops.");
1085 return NULL;
1086 }
1087
1088 /* Analyze the inner-loop. */
1089 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1090 if (!inner_loop_vinfo)
1091 {
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094 "not vectorized: Bad inner loop.");
1095 return NULL;
1096 }
1097
1098 if (!expr_invariant_in_loop_p (loop,
1099 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1100 {
1101 if (dump_enabled_p ())
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "not vectorized: inner-loop count not invariant.");
1104 destroy_loop_vec_info (inner_loop_vinfo, true);
1105 return NULL;
1106 }
1107
1108 if (loop->num_nodes != 5)
1109 {
1110 if (dump_enabled_p ())
1111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112 "not vectorized: control flow in loop.");
1113 destroy_loop_vec_info (inner_loop_vinfo, true);
1114 return NULL;
1115 }
1116
1117 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1118 entryedge = EDGE_PRED (innerloop->header, 0);
1119 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1120 entryedge = EDGE_PRED (innerloop->header, 1);
1121
1122 if (entryedge->src != loop->header
1123 || !single_exit (innerloop)
1124 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1125 {
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "not vectorized: unsupported outerloop form.");
1129 destroy_loop_vec_info (inner_loop_vinfo, true);
1130 return NULL;
1131 }
1132
1133 if (dump_enabled_p ())
1134 dump_printf_loc (MSG_NOTE, vect_location,
1135 "Considering outer-loop vectorization.");
1136 }
1137
1138 if (!single_exit (loop)
1139 || EDGE_COUNT (loop->header->preds) != 2)
1140 {
1141 if (dump_enabled_p ())
1142 {
1143 if (!single_exit (loop))
1144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1145 "not vectorized: multiple exits.");
1146 else if (EDGE_COUNT (loop->header->preds) != 2)
1147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1148 "not vectorized: too many incoming edges.");
1149 }
1150 if (inner_loop_vinfo)
1151 destroy_loop_vec_info (inner_loop_vinfo, true);
1152 return NULL;
1153 }
1154
1155 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1156 that the loop is represented as a do-while (with a proper if-guard
1157 before the loop if needed), where the loop header contains all the
1158 executable statements, and the latch is empty. */
1159 if (!empty_block_p (loop->latch)
1160 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1161 {
1162 if (dump_enabled_p ())
1163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1164 "not vectorized: latch block not empty.");
1165 if (inner_loop_vinfo)
1166 destroy_loop_vec_info (inner_loop_vinfo, true);
1167 return NULL;
1168 }
1169
1170 /* Make sure there exists a single-predecessor exit bb: */
1171 if (!single_pred_p (single_exit (loop)->dest))
1172 {
1173 edge e = single_exit (loop);
1174 if (!(e->flags & EDGE_ABNORMAL))
1175 {
1176 split_loop_exit_edge (e);
1177 if (dump_enabled_p ())
1178 dump_printf (MSG_NOTE, "split exit edge.");
1179 }
1180 else
1181 {
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1184 "not vectorized: abnormal loop exit edge.");
1185 if (inner_loop_vinfo)
1186 destroy_loop_vec_info (inner_loop_vinfo, true);
1187 return NULL;
1188 }
1189 }
1190
1191 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1192 if (!loop_cond)
1193 {
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: complicated exit condition.");
1197 if (inner_loop_vinfo)
1198 destroy_loop_vec_info (inner_loop_vinfo, true);
1199 return NULL;
1200 }
1201
1202 if (!number_of_iterations)
1203 {
1204 if (dump_enabled_p ())
1205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206 "not vectorized: number of iterations cannot be "
1207 "computed.");
1208 if (inner_loop_vinfo)
1209 destroy_loop_vec_info (inner_loop_vinfo, true);
1210 return NULL;
1211 }
1212
1213 if (chrec_contains_undetermined (number_of_iterations))
1214 {
1215 if (dump_enabled_p ())
1216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1217 "Infinite number of iterations.");
1218 if (inner_loop_vinfo)
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1221 }
1222
1223 if (!NITERS_KNOWN_P (number_of_iterations))
1224 {
1225 if (dump_enabled_p ())
1226 {
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "Symbolic number of iterations is ");
1229 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1230 }
1231 }
1232 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1233 {
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: number of iterations = 0.");
1237 if (inner_loop_vinfo)
1238 destroy_loop_vec_info (inner_loop_vinfo, true);
1239 return NULL;
1240 }
1241
1242 loop_vinfo = new_loop_vec_info (loop);
1243 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1244 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1245
1246 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1247
1248 /* CHECKME: May want to keep it around in the future. */
1249 if (inner_loop_vinfo)
1250 destroy_loop_vec_info (inner_loop_vinfo, false);
1251
1252 gcc_assert (!loop->aux);
1253 loop->aux = loop_vinfo;
1254 return loop_vinfo;
1255 }
1256
1257
1258 /* Function vect_analyze_loop_operations.
1259
1260 Scan the loop stmts and make sure they are all vectorizable. */
1261
1262 static bool
1263 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1264 {
1265 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1266 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1267 int nbbs = loop->num_nodes;
1268 gimple_stmt_iterator si;
1269 unsigned int vectorization_factor = 0;
1270 int i;
1271 gimple phi;
1272 stmt_vec_info stmt_info;
1273 bool need_to_vectorize = false;
1274 int min_profitable_iters;
1275 int min_scalar_loop_bound;
1276 unsigned int th;
1277 bool only_slp_in_loop = true, ok;
1278 HOST_WIDE_INT max_niter;
1279 HOST_WIDE_INT estimated_niter;
1280 int min_profitable_estimate;
1281
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_NOTE, vect_location,
1284 "=== vect_analyze_loop_operations ===");
1285
1286 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1287 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1288 if (slp)
1289 {
1290 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1291 vectorization factor of the loop is the unrolling factor required by
1292 the SLP instances. If that unrolling factor is 1, we say that we
1293 perform pure SLP on the loop; cross-iteration parallelism is not
1294 exploited. */
1295 for (i = 0; i < nbbs; i++)
1296 {
1297 basic_block bb = bbs[i];
1298 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1299 {
1300 gimple stmt = gsi_stmt (si);
1301 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1302 gcc_assert (stmt_info);
1303 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1304 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1305 && !PURE_SLP_STMT (stmt_info))
1306 /* STMT needs both SLP and loop-based vectorization. */
1307 only_slp_in_loop = false;
1308 }
1309 }
1310
1311 if (only_slp_in_loop)
1312 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1313 else
1314 vectorization_factor = least_common_multiple (vectorization_factor,
1315 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1316
1317 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_NOTE, vect_location,
1320 "Updating vectorization factor to %d ",
1321 vectorization_factor);
1322 }
1323
1324 for (i = 0; i < nbbs; i++)
1325 {
1326 basic_block bb = bbs[i];
1327
1328 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 phi = gsi_stmt (si);
1331 ok = true;
1332
1333 stmt_info = vinfo_for_stmt (phi);
1334 if (dump_enabled_p ())
1335 {
1336 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1337 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1338 }
1339
1340 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1341 (i.e., a phi in the tail of the outer-loop). */
1342 if (! is_loop_header_bb_p (bb))
1343 {
1344 /* FORNOW: we currently don't support the case that these phis
1345 are not used in the outer-loop (unless it is a double reduction,
1346 i.e., this phi is vect_reduction_def), because this case
1347 requires us to actually do something here. */
1348 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1349 || STMT_VINFO_LIVE_P (stmt_info))
1350 && STMT_VINFO_DEF_TYPE (stmt_info)
1351 != vect_double_reduction_def)
1352 {
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355 "Unsupported loop-closed phi in "
1356 "outer-loop.");
1357 return false;
1358 }
1359
1360 /* If PHI is used in the outer loop, we check that its operand
1361 is defined in the inner loop. */
1362 if (STMT_VINFO_RELEVANT_P (stmt_info))
1363 {
1364 tree phi_op;
1365 gimple op_def_stmt;
1366
1367 if (gimple_phi_num_args (phi) != 1)
1368 return false;
1369
1370 phi_op = PHI_ARG_DEF (phi, 0);
1371 if (TREE_CODE (phi_op) != SSA_NAME)
1372 return false;
1373
1374 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1375 if (!op_def_stmt
1376 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1377 || !vinfo_for_stmt (op_def_stmt))
1378 return false;
1379
1380 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1381 != vect_used_in_outer
1382 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1383 != vect_used_in_outer_by_reduction)
1384 return false;
1385 }
1386
1387 continue;
1388 }
1389
1390 gcc_assert (stmt_info);
1391
1392 if (STMT_VINFO_LIVE_P (stmt_info))
1393 {
1394 /* FORNOW: not yet supported. */
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: value used after loop.");
1398 return false;
1399 }
1400
1401 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1402 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1403 {
1404 /* A scalar-dependence cycle that we don't support. */
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407 "not vectorized: scalar dependence cycle.");
1408 return false;
1409 }
1410
1411 if (STMT_VINFO_RELEVANT_P (stmt_info))
1412 {
1413 need_to_vectorize = true;
1414 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1415 ok = vectorizable_induction (phi, NULL, NULL);
1416 }
1417
1418 if (!ok)
1419 {
1420 if (dump_enabled_p ())
1421 {
1422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423 "not vectorized: relevant phi not "
1424 "supported: ");
1425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1426 }
1427 return false;
1428 }
1429 }
1430
1431 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1432 {
1433 gimple stmt = gsi_stmt (si);
1434 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1435 return false;
1436 }
1437 } /* bbs */
1438
1439 /* All operations in the loop are either irrelevant (they deal with loop
1440 control, or are dead), or only used outside the loop and can be moved
1441 out of the loop (e.g. invariants, inductions). The loop can be
1442 optimized away by scalar optimizations. We're better off not
1443 touching this loop. */
1444 if (!need_to_vectorize)
1445 {
1446 if (dump_enabled_p ())
1447 dump_printf_loc (MSG_NOTE, vect_location,
1448 "All the computation can be taken out of the loop.");
1449 if (dump_enabled_p ())
1450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1451 "not vectorized: redundant loop. no profit to "
1452 "vectorize.");
1453 return false;
1454 }
1455
1456 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "vectorization_factor = %d, niters = "
1459 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1460 LOOP_VINFO_INT_NITERS (loop_vinfo));
1461
1462 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1463 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1464 || ((max_niter = max_stmt_executions_int (loop)) != -1
1465 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1466 {
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469 "not vectorized: iteration count too small.");
1470 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472 "not vectorized: iteration count smaller than "
1473 "vectorization factor.");
1474 return false;
1475 }
1476
1477 /* Analyze cost. Decide if worthwhile to vectorize. */
1478
1479 /* Once VF is set, SLP costs should be updated since the number of created
1480 vector stmts depends on VF. */
1481 vect_update_slp_costs_according_to_vf (loop_vinfo);
1482
1483 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1484 &min_profitable_estimate);
1485 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1486
1487 if (min_profitable_iters < 0)
1488 {
1489 if (dump_enabled_p ())
1490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1491 "not vectorized: vectorization not profitable.");
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "not vectorized: vector version will never be "
1495 "profitable.");
1496 return false;
1497 }
1498
1499 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1500 * vectorization_factor) - 1);
1501
1502
1503 /* Use the cost model only if it is more conservative than the user-specified
1504 threshold. */
1505
1506 th = (unsigned) min_scalar_loop_bound;
1507 if (min_profitable_iters
1508 && (!min_scalar_loop_bound
1509 || min_profitable_iters > min_scalar_loop_bound))
1510 th = (unsigned) min_profitable_iters;
1511
1512 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1513 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1514 {
1515 if (dump_enabled_p ())
1516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1517 "not vectorized: vectorization not profitable.");
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_NOTE, vect_location,
1520 "not vectorized: iteration count smaller than user "
1521 "specified loop bound parameter or minimum profitable "
1522 "iterations (whichever is more conservative).");
1523 return false;
1524 }
1525
1526 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1527 && ((unsigned HOST_WIDE_INT) estimated_niter
1528 <= MAX (th, (unsigned)min_profitable_estimate)))
1529 {
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1532 "not vectorized: estimated iteration count too "
1533 "small.");
1534 if (dump_enabled_p ())
1535 dump_printf_loc (MSG_NOTE, vect_location,
1536 "not vectorized: estimated iteration count smaller "
1537 "than specified loop bound parameter or minimum "
1538 "profitable iterations (whichever is more "
1539 "conservative).");
1540 return false;
1541 }
1542
1543 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1544 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1545 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1546 {
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1549 if (!vect_can_advance_ivs_p (loop_vinfo))
1550 {
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 "not vectorized: can't create epilog loop 1.");
1554 return false;
1555 }
1556 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1557 {
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1560 "not vectorized: can't create epilog loop 2.");
1561 return false;
1562 }
1563 }
1564
1565 return true;
1566 }
1567
1568
1569 /* Function vect_analyze_loop_2.
1570
1571 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1572 for it. The different analyses will record information in the
1573 loop_vec_info struct. */
1574 static bool
1575 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1576 {
1577 bool ok, slp = false;
1578 int max_vf = MAX_VECTORIZATION_FACTOR;
1579 int min_vf = 2;
1580
1581 /* Find all data references in the loop (which correspond to vdefs/vuses)
1582 and analyze their evolution in the loop. Also adjust the minimal
1583 vectorization factor according to the loads and stores.
1584
1585 FORNOW: Handle only simple array references whose
1586 alignment can be forced, and aligned pointer-references. */
1587
1588 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1589 if (!ok)
1590 {
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593 "bad data references.");
1594 return false;
1595 }
1596
1597 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1598 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1599
1600 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1601 if (!ok)
1602 {
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "bad data access.");
1606 return false;
1607 }
1608
1609 /* Classify all cross-iteration scalar data-flow cycles.
1610 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1611
1612 vect_analyze_scalar_cycles (loop_vinfo);
1613
1614 vect_pattern_recog (loop_vinfo, NULL);
1615
1616 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1617
1618 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1619 if (!ok)
1620 {
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "unexpected pattern.");
1624 return false;
1625 }
1626
1627 /* Analyze data dependences between the data-refs in the loop
1628 and adjust the maximum vectorization factor according to
1629 the dependences.
1630 FORNOW: fail at the first data dependence that we encounter. */
1631
1632 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1633 if (!ok
1634 || max_vf < min_vf)
1635 {
1636 if (dump_enabled_p ())
1637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1638 "bad data dependence.");
1639 return false;
1640 }
1641
1642 ok = vect_determine_vectorization_factor (loop_vinfo);
1643 if (!ok)
1644 {
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "can't determine vectorization factor.");
1648 return false;
1649 }
1650 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1651 {
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "bad data dependence.");
1655 return false;
1656 }
1657
1658 /* Analyze the alignment of the data-refs in the loop.
1659 Fail if a data reference is found that cannot be vectorized. */
1660
1661 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1662 if (!ok)
1663 {
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "bad data alignment.");
1667 return false;
1668 }
1669
1670 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1671 It is important to call pruning after vect_analyze_data_ref_accesses,
1672 since we use grouping information gathered by interleaving analysis. */
1673 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1674 if (!ok)
1675 {
1676 if (dump_enabled_p ())
1677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1678 "too long list of versioning for alias "
1679 "run-time tests.");
1680 return false;
1681 }
1682
1683 /* This pass will decide on using loop versioning and/or loop peeling in
1684 order to enhance the alignment of data references in the loop. */
1685
1686 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1687 if (!ok)
1688 {
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "bad data alignment.");
1692 return false;
1693 }
1694
1695 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1696 ok = vect_analyze_slp (loop_vinfo, NULL);
1697 if (ok)
1698 {
1699 /* Decide which possible SLP instances to SLP. */
1700 slp = vect_make_slp_decision (loop_vinfo);
1701
1702 /* Find stmts that need to be both vectorized and SLPed. */
1703 vect_detect_hybrid_slp (loop_vinfo);
1704 }
1705 else
1706 return false;
1707
1708 /* Scan all the operations in the loop and make sure they are
1709 vectorizable. */
1710
1711 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1712 if (!ok)
1713 {
1714 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1716 "bad operation or unsupported loop bound.");
1717 return false;
1718 }
1719
1720 return true;
1721 }
1722
1723 /* Function vect_analyze_loop.
1724
1725 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1726 for it. The different analyses will record information in the
1727 loop_vec_info struct. */
1728 loop_vec_info
1729 vect_analyze_loop (struct loop *loop)
1730 {
1731 loop_vec_info loop_vinfo;
1732 unsigned int vector_sizes;
1733
1734 /* Autodetect first vector size we try. */
1735 current_vector_size = 0;
1736 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1737
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "===== analyze_loop_nest =====");
1741
1742 if (loop_outer (loop)
1743 && loop_vec_info_for_loop (loop_outer (loop))
1744 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1745 {
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "outer-loop already vectorized.");
1749 return NULL;
1750 }
1751
1752 while (1)
1753 {
1754 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1755 loop_vinfo = vect_analyze_loop_form (loop);
1756 if (!loop_vinfo)
1757 {
1758 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1760 "bad loop form.");
1761 return NULL;
1762 }
1763
1764 if (vect_analyze_loop_2 (loop_vinfo))
1765 {
1766 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1767
1768 return loop_vinfo;
1769 }
1770
1771 destroy_loop_vec_info (loop_vinfo, true);
1772
1773 vector_sizes &= ~current_vector_size;
1774 if (vector_sizes == 0
1775 || current_vector_size == 0)
1776 return NULL;
1777
1778 /* Try the next biggest vector size. */
1779 current_vector_size = 1 << floor_log2 (vector_sizes);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "***** Re-trying analysis with "
1783 "vector size %d\n", current_vector_size);
1784 }
1785 }
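
/* Illustrative sketch only (hypothetical helper): how the retry loop above
   walks the bitmask of supported vector sizes.  For a mask of (32 | 16 | 8)
   with 32 already tried, clearing that bit and taking 1 << floor_log2 () of
   the remainder yields 16, then 8 on the next retry.  */

static inline unsigned int
example_next_vector_size (unsigned int sizes_mask, unsigned int tried)
{
  sizes_mask &= ~tried;
  return sizes_mask ? 1U << floor_log2 (sizes_mask) : 0;
}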
1786
1787
1788 /* Function reduction_code_for_scalar_code
1789
1790 Input:
1791 CODE - tree_code of a reduction operation.
1792
1793 Output:
1794 REDUC_CODE - the corresponding tree-code to be used to reduce the
1795 vector of partial results into a single scalar result (which
1796 will also reside in a vector) or ERROR_MARK if the operation is
1797 a supported reduction operation, but does not have such tree-code.
1798
1799 Return FALSE if CODE currently cannot be vectorized as a reduction. */
1800
1801 static bool
1802 reduction_code_for_scalar_code (enum tree_code code,
1803 enum tree_code *reduc_code)
1804 {
1805 switch (code)
1806 {
1807 case MAX_EXPR:
1808 *reduc_code = REDUC_MAX_EXPR;
1809 return true;
1810
1811 case MIN_EXPR:
1812 *reduc_code = REDUC_MIN_EXPR;
1813 return true;
1814
1815 case PLUS_EXPR:
1816 *reduc_code = REDUC_PLUS_EXPR;
1817 return true;
1818
1819 case MULT_EXPR:
1820 case MINUS_EXPR:
1821 case BIT_IOR_EXPR:
1822 case BIT_XOR_EXPR:
1823 case BIT_AND_EXPR:
1824 *reduc_code = ERROR_MARK;
1825 return true;
1826
1827 default:
1828 return false;
1829 }
1830 }
1831
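/* For example (illustrative only): for a max-reduction such as

     for (i = 0; i < N; i++)
       m = m < a[i] ? a[i] : m;

   the scalar operation is MAX_EXPR and reduction_code_for_scalar_code
   returns REDUC_MAX_EXPR, so the epilogue can reduce the vector of partial
   maxima with a single statement.  For MULT_EXPR there is no such
   tree-code, so ERROR_MARK is returned and the epilogue falls back to
   extracting the elements and combining them (see
   vect_model_reduction_cost below).  */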
1832
1833 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1834 STMT is printed with a message MSG. */
1835
1836 static void
1837 report_vect_op (int msg_type, gimple stmt, const char *msg)
1838 {
1839 dump_printf_loc (msg_type, vect_location, "%s", msg);
1840 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1841 }
1842
1843
1844 /* Detect SLP reduction of the form:
1845
1846 #a1 = phi <a5, a0>
1847 a2 = operation (a1)
1848 a3 = operation (a2)
1849 a4 = operation (a3)
1850 a5 = operation (a4)
1851
1852 #a = phi <a5>
1853
1854 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1855 FIRST_STMT is the first reduction stmt in the chain
1856 (a2 = operation (a1)).
1857
1858 Return TRUE if a reduction chain was detected. */
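/* A typical source-level example (illustrative only) that produces such a
   chain is a manually unrolled summation:

     for (i = 0; i < N; i += 4)
       sum = sum + a[i] + a[i+1] + a[i+2] + a[i+3];

   which gimplifies into a chain of four PLUS_EXPR statements (a2..a5 in
   the pattern above) feeding the reduction phi.  */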
1859
1860 static bool
1861 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1862 {
1863 struct loop *loop = (gimple_bb (phi))->loop_father;
1864 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1865 enum tree_code code;
1866 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1867 stmt_vec_info use_stmt_info, current_stmt_info;
1868 tree lhs;
1869 imm_use_iterator imm_iter;
1870 use_operand_p use_p;
1871 int nloop_uses, size = 0, n_out_of_loop_uses;
1872 bool found = false;
1873
1874 if (loop != vect_loop)
1875 return false;
1876
1877 lhs = PHI_RESULT (phi);
1878 code = gimple_assign_rhs_code (first_stmt);
1879 while (1)
1880 {
1881 nloop_uses = 0;
1882 n_out_of_loop_uses = 0;
1883 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1884 {
1885 gimple use_stmt = USE_STMT (use_p);
1886 if (is_gimple_debug (use_stmt))
1887 continue;
1890
1891 /* Check if we got back to the reduction phi. */
1892 if (use_stmt == phi)
1893 {
1894 loop_use_stmt = use_stmt;
1895 found = true;
1896 break;
1897 }
1898
1899 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1900 {
1901 if (vinfo_for_stmt (use_stmt)
1902 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1903 {
1904 loop_use_stmt = use_stmt;
1905 nloop_uses++;
1906 }
1907 }
1908 else
1909 n_out_of_loop_uses++;
1910
1911 /* There can be either a single use in the loop or two uses in
1912 phi nodes. */
1913 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1914 return false;
1915 }
1916
1917 if (found)
1918 break;
1919
1920 /* We reached a statement with no loop uses. */
1921 if (nloop_uses == 0)
1922 return false;
1923
1924 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1925 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1926 return false;
1927
1928 if (!is_gimple_assign (loop_use_stmt)
1929 || code != gimple_assign_rhs_code (loop_use_stmt)
1930 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1931 return false;
1932
1933 /* Insert USE_STMT into reduction chain. */
1934 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1935 if (current_stmt)
1936 {
1937 current_stmt_info = vinfo_for_stmt (current_stmt);
1938 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1939 GROUP_FIRST_ELEMENT (use_stmt_info)
1940 = GROUP_FIRST_ELEMENT (current_stmt_info);
1941 }
1942 else
1943 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1944
1945 lhs = gimple_assign_lhs (loop_use_stmt);
1946 current_stmt = loop_use_stmt;
1947 size++;
1948 }
1949
1950 if (!found || loop_use_stmt != phi || size < 2)
1951 return false;
1952
1953 /* Swap the operands, if needed, to make the reduction operand be the second
1954 operand. */
1955 lhs = PHI_RESULT (phi);
1956 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1957 while (next_stmt)
1958 {
1959 if (gimple_assign_rhs2 (next_stmt) == lhs)
1960 {
1961 tree op = gimple_assign_rhs1 (next_stmt);
1962 gimple def_stmt = NULL;
1963
1964 if (TREE_CODE (op) == SSA_NAME)
1965 def_stmt = SSA_NAME_DEF_STMT (op);
1966
1967 /* Check that the other def is either defined in the loop
1968 ("vect_internal_def"), or it's an induction (defined by a
1969 loop-header phi-node). */
1970 if (def_stmt
1971 && gimple_bb (def_stmt)
1972 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1973 && (is_gimple_assign (def_stmt)
1974 || is_gimple_call (def_stmt)
1975 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1976 == vect_induction_def
1977 || (gimple_code (def_stmt) == GIMPLE_PHI
1978 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1979 == vect_internal_def
1980 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1981 {
1982 lhs = gimple_assign_lhs (next_stmt);
1983 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1984 continue;
1985 }
1986
1987 return false;
1988 }
1989 else
1990 {
1991 tree op = gimple_assign_rhs2 (next_stmt);
1992 gimple def_stmt = NULL;
1993
1994 if (TREE_CODE (op) == SSA_NAME)
1995 def_stmt = SSA_NAME_DEF_STMT (op);
1996
1997 /* Check that the other def is either defined in the loop
1998 ("vect_internal_def"), or it's an induction (defined by a
1999 loop-header phi-node). */
2000 if (def_stmt
2001 && gimple_bb (def_stmt)
2002 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2003 && (is_gimple_assign (def_stmt)
2004 || is_gimple_call (def_stmt)
2005 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2006 == vect_induction_def
2007 || (gimple_code (def_stmt) == GIMPLE_PHI
2008 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2009 == vect_internal_def
2010 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2011 {
2012 if (dump_enabled_p ())
2013 {
2014 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2015 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2016 }
2017
2018 swap_tree_operands (next_stmt,
2019 gimple_assign_rhs1_ptr (next_stmt),
2020 gimple_assign_rhs2_ptr (next_stmt));
2021 update_stmt (next_stmt);
2022
2023 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2024 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2025 }
2026 else
2027 return false;
2028 }
2029
2030 lhs = gimple_assign_lhs (next_stmt);
2031 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2032 }
2033
2034 /* Save the chain for further analysis in SLP detection. */
2035 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2036 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2037 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2038
2039 return true;
2040 }
2041
2042
2043 /* Function vect_is_simple_reduction_1
2044
2045 (1) Detect a cross-iteration def-use cycle that represents a simple
2046 reduction computation. We look for the following pattern:
2047
2048 loop_header:
2049 a1 = phi < a0, a2 >
2050 a3 = ...
2051 a2 = operation (a3, a1)
2052
2053 such that:
2054 1. operation is commutative and associative and it is safe to
2055 change the order of the computation (if CHECK_REDUCTION is true)
2056 2. no uses for a2 in the loop (a2 is used out of the loop)
2057 3. no uses of a1 in the loop besides the reduction operation
2058 4. no uses of a1 outside the loop.
2059
2060 Conditions 1,4 are tested here.
2061 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2062
2063 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2064 nested cycles, if CHECK_REDUCTION is false.
2065
2066 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2067 reductions:
2068
2069 a1 = phi < a0, a2 >
2070 inner loop (def of a3)
2071 a2 = phi < a3 >
2072
2073 If MODIFY is true it also tries to rework the code in-place to enable
2074 detection of more reduction patterns.  For the time being we rewrite
2075 "res -= RHS" into "res += -RHS" when it seems worthwhile.
2076 */
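/* A minimal example (illustrative only) of case (1):

     s = 0;
     for (i = 0; i < N; i++)
       s += a[i];

   becomes in gimple

     loop_header:
       s_1 = PHI <s_0, s_2>
       ...
       s_2 = s_1 + a[i];

   where s_1/s_2 form the cross-iteration cycle detected here; the final
   value s_2 is only used after the loop.  */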
2077
2078 static gimple
2079 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2080 bool check_reduction, bool *double_reduc,
2081 bool modify)
2082 {
2083 struct loop *loop = (gimple_bb (phi))->loop_father;
2084 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2085 edge latch_e = loop_latch_edge (loop);
2086 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2087 gimple def_stmt, def1 = NULL, def2 = NULL;
2088 enum tree_code orig_code, code;
2089 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2090 tree type;
2091 int nloop_uses;
2092 tree name;
2093 imm_use_iterator imm_iter;
2094 use_operand_p use_p;
2095 bool phi_def;
2096
2097 *double_reduc = false;
2098
2099 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2100 otherwise, we assume outer loop vectorization. */
2101 gcc_assert ((check_reduction && loop == vect_loop)
2102 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2103
2104 name = PHI_RESULT (phi);
2105 nloop_uses = 0;
2106 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2107 {
2108 gimple use_stmt = USE_STMT (use_p);
2109 if (is_gimple_debug (use_stmt))
2110 continue;
2111
2112 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2113 {
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "intermediate value used outside loop.");
2117
2118 return NULL;
2119 }
2120
2121 if (vinfo_for_stmt (use_stmt)
2122 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2123 nloop_uses++;
2124 if (nloop_uses > 1)
2125 {
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "reduction used in loop.");
2129 return NULL;
2130 }
2131 }
2132
2133 if (TREE_CODE (loop_arg) != SSA_NAME)
2134 {
2135 if (dump_enabled_p ())
2136 {
2137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138 "reduction: not ssa_name: ");
2139 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2140 }
2141 return NULL;
2142 }
2143
2144 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2145 if (!def_stmt)
2146 {
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 "reduction: no def_stmt.");
2150 return NULL;
2151 }
2152
2153 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2154 {
2155 if (dump_enabled_p ())
2156 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2157 return NULL;
2158 }
2159
2160 if (is_gimple_assign (def_stmt))
2161 {
2162 name = gimple_assign_lhs (def_stmt);
2163 phi_def = false;
2164 }
2165 else
2166 {
2167 name = PHI_RESULT (def_stmt);
2168 phi_def = true;
2169 }
2170
2171 nloop_uses = 0;
2172 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2173 {
2174 gimple use_stmt = USE_STMT (use_p);
2175 if (is_gimple_debug (use_stmt))
2176 continue;
2177 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2178 && vinfo_for_stmt (use_stmt)
2179 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2180 nloop_uses++;
2181 if (nloop_uses > 1)
2182 {
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185 "reduction used in loop.");
2186 return NULL;
2187 }
2188 }
2189
2190 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2191 defined in the inner loop. */
2192 if (phi_def)
2193 {
2194 op1 = PHI_ARG_DEF (def_stmt, 0);
2195
2196 if (gimple_phi_num_args (def_stmt) != 1
2197 || TREE_CODE (op1) != SSA_NAME)
2198 {
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2201 "unsupported phi node definition.");
2202
2203 return NULL;
2204 }
2205
2206 def1 = SSA_NAME_DEF_STMT (op1);
2207 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2208 && loop->inner
2209 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2210 && is_gimple_assign (def1))
2211 {
2212 if (dump_enabled_p ())
2213 report_vect_op (MSG_NOTE, def_stmt,
2214 "detected double reduction: ");
2215
2216 *double_reduc = true;
2217 return def_stmt;
2218 }
2219
2220 return NULL;
2221 }
2222
2223 code = orig_code = gimple_assign_rhs_code (def_stmt);
2224
2225 /* We can handle "res -= x[i]", which is non-associative, by
2226 simply rewriting it into "res += -x[i]".  Avoid changing the
2227 gimple instruction for the first simple tests and only do this
2228 if we're allowed to change the code at all. */
2229 if (code == MINUS_EXPR
2230 && modify
2231 && (op1 = gimple_assign_rhs1 (def_stmt))
2232 && TREE_CODE (op1) == SSA_NAME
2233 && SSA_NAME_DEF_STMT (op1) == phi)
2234 code = PLUS_EXPR;
2235
2236 if (check_reduction
2237 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2238 {
2239 if (dump_enabled_p ())
2240 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2241 "reduction: not commutative/associative: ");
2242 return NULL;
2243 }
2244
2245 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2246 {
2247 if (code != COND_EXPR)
2248 {
2249 if (dump_enabled_p ())
2250 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2251 "reduction: not binary operation: ");
2252
2253 return NULL;
2254 }
2255
2256 op3 = gimple_assign_rhs1 (def_stmt);
2257 if (COMPARISON_CLASS_P (op3))
2258 {
2259 op4 = TREE_OPERAND (op3, 1);
2260 op3 = TREE_OPERAND (op3, 0);
2261 }
2262
2263 op1 = gimple_assign_rhs2 (def_stmt);
2264 op2 = gimple_assign_rhs3 (def_stmt);
2265
2266 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2267 {
2268 if (dump_enabled_p ())
2269 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2270 "reduction: uses not ssa_names: ");
2271
2272 return NULL;
2273 }
2274 }
2275 else
2276 {
2277 op1 = gimple_assign_rhs1 (def_stmt);
2278 op2 = gimple_assign_rhs2 (def_stmt);
2279
2280 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2281 {
2282 if (dump_enabled_p ())
2283 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2284 "reduction: uses not ssa_names: ");
2285
2286 return NULL;
2287 }
2288 }
2289
2290 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2291 if ((TREE_CODE (op1) == SSA_NAME
2292 && !types_compatible_p (type, TREE_TYPE (op1)))
2293 || (TREE_CODE (op2) == SSA_NAME
2294 && !types_compatible_p (type, TREE_TYPE (op2)))
2295 || (op3 && TREE_CODE (op3) == SSA_NAME
2296 && !types_compatible_p (type, TREE_TYPE (op3)))
2297 || (op4 && TREE_CODE (op4) == SSA_NAME
2298 && !types_compatible_p (type, TREE_TYPE (op4))))
2299 {
2300 if (dump_enabled_p ())
2301 {
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "reduction: multiple types: operation type: ");
2304 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2305 dump_printf (MSG_NOTE, ", operands types: ");
2306 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2307 TREE_TYPE (op1));
2308 dump_printf (MSG_NOTE, ",");
2309 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2310 TREE_TYPE (op2));
2311 if (op3)
2312 {
2313 dump_printf (MSG_NOTE, ",");
2314 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2315 TREE_TYPE (op3));
2316 }
2317
2318 if (op4)
2319 {
2320 dump_printf (MSG_NOTE, ",");
2321 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2322 TREE_TYPE (op4));
2323 }
2324 }
2325
2326 return NULL;
2327 }
2328
2329 /* Check that it's ok to change the order of the computation.
2330 Generally, when vectorizing a reduction we change the order of the
2331 computation. This may change the behavior of the program in some
2332 cases, so we need to check that this is ok. One exception is when
2333 vectorizing an outer-loop: the inner-loop is executed sequentially,
2334 and therefore vectorizing reductions in the inner-loop during
2335 outer-loop vectorization is safe. */
2336
2337 /* CHECKME: check for !flag_finite_math_only too? */
2338 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2339 && check_reduction)
2340 {
2341 /* Changing the order of operations changes the semantics. */
2342 if (dump_enabled_p ())
2343 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2344 "reduction: unsafe fp math optimization: ");
2345 return NULL;
2346 }
2347 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2348 && check_reduction)
2349 {
2350 /* Changing the order of operations changes the semantics. */
2351 if (dump_enabled_p ())
2352 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2353 "reduction: unsafe int math optimization: ");
2354 return NULL;
2355 }
2356 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2357 {
2358 /* Changing the order of operations changes the semantics. */
2359 if (dump_enabled_p ())
2360 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2361 "reduction: unsafe fixed-point math optimization: ");
2362 return NULL;
2363 }
2364
2365 /* If we detected "res -= x[i]" earlier, rewrite it into
2366 "res += -x[i]" now. If this turns out to be useless reassoc
2367 will clean it up again. */
2368 if (orig_code == MINUS_EXPR)
2369 {
2370 tree rhs = gimple_assign_rhs2 (def_stmt);
2371 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2372 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2373 rhs, NULL);
2374 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2375 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2376 loop_info, NULL));
2377 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2378 gimple_assign_set_rhs2 (def_stmt, negrhs);
2379 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2380 update_stmt (def_stmt);
2381 }
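/* For example (illustrative only, SSA names are hypothetical), the
   statement

     res_2 = res_1 - x_3;

   has just been rewritten into

     negrhs_4 = -x_3;
     res_2 = res_1 + negrhs_4;

   which exposes the associative PLUS_EXPR form to the rest of the
   reduction handling.  */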
2382
2383 /* Reduction is safe. We're dealing with one of the following:
2384 1) integer arithmetic and no trapv
2385 2) floating point arithmetic, and special flags permit this optimization
2386 3) nested cycle (i.e., outer loop vectorization). */
2387 if (TREE_CODE (op1) == SSA_NAME)
2388 def1 = SSA_NAME_DEF_STMT (op1);
2389
2390 if (TREE_CODE (op2) == SSA_NAME)
2391 def2 = SSA_NAME_DEF_STMT (op2);
2392
2393 if (code != COND_EXPR
2394 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2395 {
2396 if (dump_enabled_p ())
2397 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2398 return NULL;
2399 }
2400
2401 /* Check that one def is the reduction def, defined by PHI,
2402 the other def is either defined in the loop ("vect_internal_def"),
2403 or it's an induction (defined by a loop-header phi-node). */
2404
2405 if (def2 && def2 == phi
2406 && (code == COND_EXPR
2407 || !def1 || gimple_nop_p (def1)
2408 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2409 && (is_gimple_assign (def1)
2410 || is_gimple_call (def1)
2411 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2412 == vect_induction_def
2413 || (gimple_code (def1) == GIMPLE_PHI
2414 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2415 == vect_internal_def
2416 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2417 {
2418 if (dump_enabled_p ())
2419 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2420 return def_stmt;
2421 }
2422
2423 if (def1 && def1 == phi
2424 && (code == COND_EXPR
2425 || !def2 || gimple_nop_p (def2)
2426 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2427 && (is_gimple_assign (def2)
2428 || is_gimple_call (def2)
2429 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2430 == vect_induction_def
2431 || (gimple_code (def2) == GIMPLE_PHI
2432 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2433 == vect_internal_def
2434 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2435 {
2436 if (check_reduction)
2437 {
2438 /* Swap operands (just for simplicity - so that the rest of the code
2439 can assume that the reduction variable is always the last (second)
2440 argument). */
2441 if (dump_enabled_p ())
2442 report_vect_op (MSG_NOTE, def_stmt,
2443 "detected reduction: need to swap operands: ");
2444
2445 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2446 gimple_assign_rhs2_ptr (def_stmt));
2447
2448 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2449 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2450 }
2451 else
2452 {
2453 if (dump_enabled_p ())
2454 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2455 }
2456
2457 return def_stmt;
2458 }
2459
2460 /* Try to find SLP reduction chain. */
2461 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2462 {
2463 if (dump_enabled_p ())
2464 report_vect_op (MSG_NOTE, def_stmt,
2465 "reduction: detected reduction chain: ");
2466
2467 return def_stmt;
2468 }
2469
2470 if (dump_enabled_p ())
2471 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2472 "reduction: unknown pattern: ");
2473
2474 return NULL;
2475 }
2476
2477 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2478 in-place. Arguments as there. */
2479
2480 static gimple
2481 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2482 bool check_reduction, bool *double_reduc)
2483 {
2484 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2485 double_reduc, false);
2486 }
2487
2488 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2489 in-place if it enables detection of more reductions. Arguments
2490 as there. */
2491
2492 gimple
2493 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2494 bool check_reduction, bool *double_reduc)
2495 {
2496 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2497 double_reduc, true);
2498 }
2499
2500 /* Calculate the cost of one scalar iteration of the loop. */
2501 int
2502 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2503 {
2504 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2505 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2506 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2507 int innerloop_iters, i, stmt_cost;
2508
2509 /* Count statements in scalar loop. Using this as scalar cost for a single
2510 iteration for now.
2511
2512 TODO: Add outer loop support.
2513
2514 TODO: Consider assigning different costs to different scalar
2515 statements. */
2516
2517 /* FORNOW. */
2518 innerloop_iters = 1;
2519 if (loop->inner)
2520 innerloop_iters = 50; /* FIXME */
2521
2522 for (i = 0; i < nbbs; i++)
2523 {
2524 gimple_stmt_iterator si;
2525 basic_block bb = bbs[i];
2526
2527 if (bb->loop_father == loop->inner)
2528 factor = innerloop_iters;
2529 else
2530 factor = 1;
2531
2532 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2533 {
2534 gimple stmt = gsi_stmt (si);
2535 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2536
2537 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2538 continue;
2539
2540 /* Skip stmts that are not vectorized inside the loop. */
2541 if (stmt_info
2542 && !STMT_VINFO_RELEVANT_P (stmt_info)
2543 && (!STMT_VINFO_LIVE_P (stmt_info)
2544 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2545 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2546 continue;
2547
2548 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2549 {
2550 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2551 stmt_cost = vect_get_stmt_cost (scalar_load);
2552 else
2553 stmt_cost = vect_get_stmt_cost (scalar_store);
2554 }
2555 else
2556 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2557
2558 scalar_single_iter_cost += stmt_cost * factor;
2559 }
2560 }
2561 return scalar_single_iter_cost;
2562 }
2563
2564 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
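/* A worked example (hypothetical numbers): with a known trip count
   NITERS == 100, PEEL_ITERS_PROLOGUE == 3 and VF == 4, the code below
   computes *PEEL_ITERS_EPILOGUE = (100 - 3) % 4 == 1, and records
   3 * SCALAR_SINGLE_ITER_COST in *PROLOGUE_COST_VEC and
   1 * SCALAR_SINGLE_ITER_COST in *EPILOGUE_COST_VEC.  When the trip count
   is unknown, the epilogue count is instead assumed to be VF / 2 and two
   taken-branch costs are added to the prologue.  */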
2565 int
2566 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2567 int *peel_iters_epilogue,
2568 int scalar_single_iter_cost,
2569 stmt_vector_for_cost *prologue_cost_vec,
2570 stmt_vector_for_cost *epilogue_cost_vec)
2571 {
2572 int retval = 0;
2573 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2574
2575 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2576 {
2577 *peel_iters_epilogue = vf/2;
2578 if (dump_enabled_p ())
2579 dump_printf_loc (MSG_NOTE, vect_location,
2580 "cost model: epilogue peel iters set to vf/2 "
2581 "because loop iterations are unknown .");
2582
2583 /* If peeled iterations are known but the number of scalar loop
2584 iterations is unknown, count a taken branch per peeled loop. */
2585 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2586 NULL, 0, vect_prologue);
2587 }
2588 else
2589 {
2590 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2591 peel_iters_prologue = niters < peel_iters_prologue ?
2592 niters : peel_iters_prologue;
2593 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2594 /* If we need to peel for gaps, but no peeling is required, we have to
2595 peel VF iterations. */
2596 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2597 *peel_iters_epilogue = vf;
2598 }
2599
2600 if (peel_iters_prologue)
2601 retval += record_stmt_cost (prologue_cost_vec,
2602 peel_iters_prologue * scalar_single_iter_cost,
2603 scalar_stmt, NULL, 0, vect_prologue);
2604 if (*peel_iters_epilogue)
2605 retval += record_stmt_cost (epilogue_cost_vec,
2606 *peel_iters_epilogue * scalar_single_iter_cost,
2607 scalar_stmt, NULL, 0, vect_epilogue);
2608 return retval;
2609 }
2610
2611 /* Function vect_estimate_min_profitable_iters
2612
2613 Return the number of iterations required for the vector version of the
2614 loop to be profitable relative to the cost of the scalar version of the
2615 loop. */
2616
2617 static void
2618 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2619 int *ret_min_profitable_niters,
2620 int *ret_min_profitable_estimate)
2621 {
2622 int min_profitable_iters;
2623 int min_profitable_estimate;
2624 int peel_iters_prologue;
2625 int peel_iters_epilogue;
2626 unsigned vec_inside_cost = 0;
2627 int vec_outside_cost = 0;
2628 unsigned vec_prologue_cost = 0;
2629 unsigned vec_epilogue_cost = 0;
2630 int scalar_single_iter_cost = 0;
2631 int scalar_outside_cost = 0;
2632 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2633 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2634 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2635
2636 /* Cost model disabled. */
2637 if (!flag_vect_cost_model)
2638 {
2639 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2640 *ret_min_profitable_niters = 0;
2641 *ret_min_profitable_estimate = 0;
2642 return;
2643 }
2644
2645 /* Requires loop versioning tests to handle misalignment. */
2646 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2647 {
2648 /* FIXME: Make cost depend on complexity of individual check. */
2649 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2650 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2651 vect_prologue);
2652 dump_printf (MSG_NOTE,
2653 "cost model: Adding cost of checks for loop "
2654 "versioning to treat misalignment.\n");
2655 }
2656
2657 /* Requires loop versioning with alias checks. */
2658 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2659 {
2660 /* FIXME: Make cost depend on complexity of individual check. */
2661 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2662 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2663 vect_prologue);
2664 dump_printf (MSG_NOTE,
2665 "cost model: Adding cost of checks for loop "
2666 "versioning aliasing.\n");
2667 }
2668
2669 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2670 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2671 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2672 vect_prologue);
2673
2674 /* Count statements in scalar loop. Using this as scalar cost for a single
2675 iteration for now.
2676
2677 TODO: Add outer loop support.
2678
2679 TODO: Consider assigning different costs to different scalar
2680 statements. */
2681
2682 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2683
2684 /* Add additional cost for the peeled instructions in prologue and epilogue
2685 loop.
2686
2687 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2688 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2689
2690 TODO: Build an expression that represents peel_iters for prologue and
2691 epilogue to be used in a run-time test. */
2692
2693 if (npeel < 0)
2694 {
2695 peel_iters_prologue = vf/2;
2696 dump_printf (MSG_NOTE, "cost model: "
2697 "prologue peel iters set to vf/2.");
2698
2699 /* If peeling for alignment is unknown, loop bound of main loop becomes
2700 unknown. */
2701 peel_iters_epilogue = vf/2;
2702 dump_printf (MSG_NOTE, "cost model: "
2703 "epilogue peel iters set to vf/2 because "
2704 "peeling for alignment is unknown.");
2705
2706 /* If peeled iterations are unknown, count a taken branch and a not taken
2707 branch per peeled loop. Even if scalar loop iterations are known,
2708 vector iterations are not known since peeled prologue iterations are
2709 not known. Hence guards remain the same. */
2710 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2711 NULL, 0, vect_prologue);
2712 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2713 NULL, 0, vect_prologue);
2714 /* FORNOW: Don't attempt to pass individual scalar instructions to
2715 the model; just assume linear cost for scalar iterations. */
2716 (void) add_stmt_cost (target_cost_data,
2717 peel_iters_prologue * scalar_single_iter_cost,
2718 scalar_stmt, NULL, 0, vect_prologue);
2719 (void) add_stmt_cost (target_cost_data,
2720 peel_iters_epilogue * scalar_single_iter_cost,
2721 scalar_stmt, NULL, 0, vect_epilogue);
2722 }
2723 else
2724 {
2725 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2726 stmt_info_for_cost *si;
2727 int j;
2728 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2729
2730 prologue_cost_vec.create (2);
2731 epilogue_cost_vec.create (2);
2732 peel_iters_prologue = npeel;
2733
2734 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2735 &peel_iters_epilogue,
2736 scalar_single_iter_cost,
2737 &prologue_cost_vec,
2738 &epilogue_cost_vec);
2739
2740 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2741 {
2742 struct _stmt_vec_info *stmt_info
2743 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2744 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2745 si->misalign, vect_prologue);
2746 }
2747
2748 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2749 {
2750 struct _stmt_vec_info *stmt_info
2751 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2752 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2753 si->misalign, vect_epilogue);
2754 }
2755
2756 prologue_cost_vec.release ();
2757 epilogue_cost_vec.release ();
2758 }
2759
2760 /* FORNOW: The scalar outside cost is incremented in one of the
2761 following ways:
2762
2763 1. The vectorizer checks for alignment and aliasing and generates
2764 a condition that allows dynamic vectorization. A cost model
2765 check is ANDED with the versioning condition. Hence scalar code
2766 path now has the added cost of the versioning check.
2767
2768 if (cost > th & versioning_check)
2769 jmp to vector code
2770
2771 Hence run-time scalar is incremented by not-taken branch cost.
2772
2773 2. The vectorizer then checks if a prologue is required. If the
2774 cost model check was not done before during versioning, it has to
2775 be done before the prologue check.
2776
2777 if (cost <= th)
2778 prologue = scalar_iters
2779 if (prologue == 0)
2780 jmp to vector code
2781 else
2782 execute prologue
2783 if (prologue == num_iters)
2784 go to exit
2785
2786 Hence the run-time scalar cost is incremented by a taken branch,
2787 plus a not-taken branch, plus a taken branch cost.
2788
2789 3. The vectorizer then checks if an epilogue is required. If the
2790 cost model check was not done before during prologue check, it
2791 has to be done with the epilogue check.
2792
2793 if (prologue == 0)
2794 jmp to vector code
2795 else
2796 execute prologue
2797 if (prologue == num_iters)
2798 go to exit
2799 vector code:
2800 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2801 jmp to epilogue
2802
2803 Hence the run-time scalar cost should be incremented by 2 taken
2804 branches.
2805
2806 TODO: The back end may reorder the BBs differently and reverse
2807 conditions/branch directions. Change the estimates below to
2808 something more reasonable. */
2809
2810 /* If the number of iterations is known and we do not do versioning, we can
2811 decide whether to vectorize at compile time. Hence the scalar version
2812 does not carry cost model guard costs. */
2813 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2814 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2815 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2816 {
2817 /* Cost model check occurs at versioning. */
2818 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2819 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2820 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2821 else
2822 {
2823 /* Cost model check occurs at prologue generation. */
2824 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2825 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2826 + vect_get_stmt_cost (cond_branch_not_taken);
2827 /* Cost model check occurs at epilogue generation. */
2828 else
2829 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2830 }
2831 }
2832
2833 /* Complete the target-specific cost calculations. */
2834 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2835 &vec_inside_cost, &vec_epilogue_cost);
2836
2837 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2838
2839 /* Calculate number of iterations required to make the vector version
2840 profitable, relative to the loop bodies only. The following condition
2841 must hold true:
2842 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2843 where
2844 SIC = scalar iteration cost, VIC = vector iteration cost,
2845 VOC = vector outside cost, VF = vectorization factor,
2846 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
2847 SOC = scalar outside cost for run time cost model check. */
2848
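/* A worked example (hypothetical costs): with SIC == 4, VIC == 6, VF == 4,
   VOC == 20, SOC == 0 and no peeling (PL_ITERS == EP_ITERS == 0), the code
   below computes

     min_profitable_iters = (20 - 0) * 4 / (4 * 4 - 6) == 8,

   then notices that at 8 iterations the two sides are still equal
   (4 * 4 * 8 == 6 * 8 + 20 * 4 == 128) and bumps the result to 9; the
   result is later rounded up to at least VF and decremented by one to form
   the run-time guard "niters <= threshold".  */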
2849 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2850 {
2851 if (vec_outside_cost <= 0)
2852 min_profitable_iters = 1;
2853 else
2854 {
2855 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2856 - vec_inside_cost * peel_iters_prologue
2857 - vec_inside_cost * peel_iters_epilogue)
2858 / ((scalar_single_iter_cost * vf)
2859 - vec_inside_cost);
2860
2861 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2862 <= (((int) vec_inside_cost * min_profitable_iters)
2863 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2864 min_profitable_iters++;
2865 }
2866 }
2867 /* vector version will never be profitable. */
2868 else
2869 {
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2872 "cost model: the vector iteration cost = %d "
2873 "divided by the scalar iteration cost = %d "
2874 "is greater or equal to the vectorization factor = %d.",
2875 vec_inside_cost, scalar_single_iter_cost, vf);
2876 *ret_min_profitable_niters = -1;
2877 *ret_min_profitable_estimate = -1;
2878 return;
2879 }
2880
2881 if (dump_enabled_p ())
2882 {
2883 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2884 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2885 vec_inside_cost);
2886 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2887 vec_prologue_cost);
2888 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2889 vec_epilogue_cost);
2890 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2891 scalar_single_iter_cost);
2892 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2893 scalar_outside_cost);
2894 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2895 vec_outside_cost);
2896 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2897 peel_iters_prologue);
2898 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2899 peel_iters_epilogue);
2900 dump_printf (MSG_NOTE,
2901 " Calculated minimum iters for profitability: %d\n",
2902 min_profitable_iters);
2903 }
2904
2905 min_profitable_iters =
2906 min_profitable_iters < vf ? vf : min_profitable_iters;
2907
2908 /* Because the condition we create is:
2909 if (niters <= min_profitable_iters)
2910 then skip the vectorized loop. */
2911 min_profitable_iters--;
2912
2913 if (dump_enabled_p ())
2914 dump_printf_loc (MSG_NOTE, vect_location,
2915 " Runtime profitability threshold = %d\n", min_profitable_iters);
2916
2917 *ret_min_profitable_niters = min_profitable_iters;
2918
2919 /* Calculate number of iterations required to make the vector version
2920 profitable, relative to the loop bodies only.
2921
2922 The non-vectorized variant costs SIC * niters and it must win over the vector
2923 variant on the expected loop trip count.  The following condition must hold true:
2924 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2925
2926 if (vec_outside_cost <= 0)
2927 min_profitable_estimate = 1;
2928 else
2929 {
2930 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2931 - vec_inside_cost * peel_iters_prologue
2932 - vec_inside_cost * peel_iters_epilogue)
2933 / ((scalar_single_iter_cost * vf)
2934 - vec_inside_cost);
2935 }
2936 min_profitable_estimate--;
2937 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 " Static estimate profitability threshold = %d\n",
2941 min_profitable_estimate);
2942
2943 *ret_min_profitable_estimate = min_profitable_estimate;
2944 }
2945
2946
2947 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2948 functions. Design better to avoid maintenance issues. */
2949
2950 /* Function vect_model_reduction_cost.
2951
2952 Models cost for a reduction operation, including the vector ops
2953 generated within the strip-mine loop, the initial definition before
2954 the loop, and the epilogue code that must be generated. */
2955
2956 static bool
2957 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2958 int ncopies)
2959 {
2960 int prologue_cost = 0, epilogue_cost = 0;
2961 enum tree_code code;
2962 optab optab;
2963 tree vectype;
2964 gimple stmt, orig_stmt;
2965 tree reduction_op;
2966 enum machine_mode mode;
2967 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2968 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2969 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2970
2971 /* Cost of reduction op inside loop. */
2972 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2973 stmt_info, 0, vect_body);
2974 stmt = STMT_VINFO_STMT (stmt_info);
2975
2976 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2977 {
2978 case GIMPLE_SINGLE_RHS:
2979 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2980 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2981 break;
2982 case GIMPLE_UNARY_RHS:
2983 reduction_op = gimple_assign_rhs1 (stmt);
2984 break;
2985 case GIMPLE_BINARY_RHS:
2986 reduction_op = gimple_assign_rhs2 (stmt);
2987 break;
2988 case GIMPLE_TERNARY_RHS:
2989 reduction_op = gimple_assign_rhs3 (stmt);
2990 break;
2991 default:
2992 gcc_unreachable ();
2993 }
2994
2995 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2996 if (!vectype)
2997 {
2998 if (dump_enabled_p ())
2999 {
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "unsupported data-type ");
3002 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3003 TREE_TYPE (reduction_op));
3004 }
3005 return false;
3006 }
3007
3008 mode = TYPE_MODE (vectype);
3009 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3010
3011 if (!orig_stmt)
3012 orig_stmt = STMT_VINFO_STMT (stmt_info);
3013
3014 code = gimple_assign_rhs_code (orig_stmt);
3015
3016 /* Add in cost for initial definition. */
3017 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3018 stmt_info, 0, vect_prologue);
3019
3020 /* Determine cost of epilogue code.
3021
3022 We have a reduction operator that will reduce the vector in one statement.
3023 Also requires scalar extract. */
3024
3025 if (!nested_in_vect_loop_p (loop, orig_stmt))
3026 {
3027 if (reduc_code != ERROR_MARK)
3028 {
3029 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3030 stmt_info, 0, vect_epilogue);
3031 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3032 stmt_info, 0, vect_epilogue);
3033 }
3034 else
3035 {
3036 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3037 tree bitsize =
3038 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3039 int element_bitsize = tree_low_cst (bitsize, 1);
3040 int nelements = vec_size_in_bits / element_bitsize;
3041
3042 optab = optab_for_tree_code (code, vectype, optab_default);
3043
3044 /* We have a whole vector shift available. */
3045 if (VECTOR_MODE_P (mode)
3046 && optab_handler (optab, mode) != CODE_FOR_nothing
3047 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3048 {
3049 /* Final reduction via vector shifts and the reduction operator.
3050 Also requires scalar extract. */
3051 epilogue_cost += add_stmt_cost (target_cost_data,
3052 exact_log2 (nelements) * 2,
3053 vector_stmt, stmt_info, 0,
3054 vect_epilogue);
3055 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3056 vec_to_scalar, stmt_info, 0,
3057 vect_epilogue);
3058 }
3059 else
3060 /* Use extracts and reduction op for final reduction. For N
3061 elements, we have N extracts and N-1 reduction ops. */
3062 epilogue_cost += add_stmt_cost (target_cost_data,
3063 nelements + nelements - 1,
3064 vector_stmt, stmt_info, 0,
3065 vect_epilogue);
3066 }
3067 }
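/* A worked example (hypothetical target costs): for a V4SI reduction,
   vec_size_in_bits == 128 and element_bitsize == 32, so nelements == 4.
   If a whole-vector shift is available, the epilogue above is costed as
   exact_log2 (4) * 2 == 4 vector statements plus one vec_to_scalar
   extract; otherwise it is costed as 4 + 3 == 7 vector statements
   (N extracts and N-1 reduction operations).  */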
3068
3069 if (dump_enabled_p ())
3070 dump_printf (MSG_NOTE,
3071 "vect_model_reduction_cost: inside_cost = %d, "
3072 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3073 prologue_cost, epilogue_cost);
3074
3075 return true;
3076 }
3077
3078
3079 /* Function vect_model_induction_cost.
3080
3081 Models cost for induction operations. */
3082
3083 static void
3084 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3085 {
3086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3087 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3088 unsigned inside_cost, prologue_cost;
3089
3090 /* loop cost for vec_loop. */
3091 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3092 stmt_info, 0, vect_body);
3093
3094 /* prologue cost for vec_init and vec_step. */
3095 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3096 stmt_info, 0, vect_prologue);
3097
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "vect_model_induction_cost: inside_cost = %d, "
3101 "prologue_cost = %d .", inside_cost, prologue_cost);
3102 }
3103
3104
3105 /* Function get_initial_def_for_induction
3106
3107 Input:
3108 STMT - a stmt that performs an induction operation in the loop.
3109 IV_PHI - the initial value of the induction variable
3110
3111 Output:
3112 Return a vector variable, initialized with the first VF values of
3113 the induction variable. E.g., for an iv with IV_PHI='X' and
3114 evolution S, for a vector of 4 units, we want to return:
3115 [X, X + S, X + 2*S, X + 3*S]. */
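/* For example (illustrative only), with VF == 8, a V4SI vectype
   (nunits == 4, hence ncopies == 2), IV_PHI = 'X' and step S, the code
   below builds

     vec_init = [X, X + S, X + 2*S, X + 3*S]
     vec_step = [8*S, 8*S, 8*S, 8*S]        (VF * S, for the phi update)

   and, because ncopies > 1, a second per-iteration copy

     copy1 = vec_iv + [4*S, 4*S, 4*S, 4*S]  (nunits * S)

   so that the two copies together cover the 8 scalar iterations handled
   by one vector iteration.  */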
3116
3117 static tree
3118 get_initial_def_for_induction (gimple iv_phi)
3119 {
3120 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3121 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3122 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3123 tree scalar_type;
3124 tree vectype;
3125 int nunits;
3126 edge pe = loop_preheader_edge (loop);
3127 struct loop *iv_loop;
3128 basic_block new_bb;
3129 tree new_vec, vec_init, vec_step, t;
3130 tree access_fn;
3131 tree new_var;
3132 tree new_name;
3133 gimple init_stmt, induction_phi, new_stmt;
3134 tree induc_def, vec_def, vec_dest;
3135 tree init_expr, step_expr;
3136 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3137 int i;
3138 bool ok;
3139 int ncopies;
3140 tree expr;
3141 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3142 bool nested_in_vect_loop = false;
3143 gimple_seq stmts = NULL;
3144 imm_use_iterator imm_iter;
3145 use_operand_p use_p;
3146 gimple exit_phi;
3147 edge latch_e;
3148 tree loop_arg;
3149 gimple_stmt_iterator si;
3150 basic_block bb = gimple_bb (iv_phi);
3151 tree stepvectype;
3152 tree resvectype;
3153
3154 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3155 if (nested_in_vect_loop_p (loop, iv_phi))
3156 {
3157 nested_in_vect_loop = true;
3158 iv_loop = loop->inner;
3159 }
3160 else
3161 iv_loop = loop;
3162 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3163
3164 latch_e = loop_latch_edge (iv_loop);
3165 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3166
3167 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3168 gcc_assert (access_fn);
3169 STRIP_NOPS (access_fn);
3170 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3171 &init_expr, &step_expr);
3172 gcc_assert (ok);
3173 pe = loop_preheader_edge (iv_loop);
3174
3175 scalar_type = TREE_TYPE (init_expr);
3176 vectype = get_vectype_for_scalar_type (scalar_type);
3177 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3178 gcc_assert (vectype);
3179 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3180 ncopies = vf / nunits;
3181
3182 gcc_assert (phi_info);
3183 gcc_assert (ncopies >= 1);
3184
3185 /* Find the first insertion point in the BB. */
3186 si = gsi_after_labels (bb);
3187
3188 /* Create the vector that holds the initial_value of the induction. */
3189 if (nested_in_vect_loop)
3190 {
3191 /* iv_loop is nested in the loop to be vectorized.  init_expr has already
3192 been created during vectorization of previous stmts. We obtain it
3193 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3194 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3195 loop_preheader_edge (iv_loop));
3196 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3197 /* If the initial value is not of proper type, convert it. */
3198 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3199 {
3200 new_stmt = gimple_build_assign_with_ops
3201 (VIEW_CONVERT_EXPR,
3202 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3203 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3204 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3205 gimple_assign_set_lhs (new_stmt, vec_init);
3206 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3207 new_stmt);
3208 gcc_assert (!new_bb);
3209 set_vinfo_for_stmt (new_stmt,
3210 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3211 }
3212 }
3213 else
3214 {
3215 vec<constructor_elt, va_gc> *v;
3216
3217 /* iv_loop is the loop to be vectorized. Create:
3218 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3219 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3220 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3221 if (stmts)
3222 {
3223 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3224 gcc_assert (!new_bb);
3225 }
3226
3227 vec_alloc (v, nunits);
3228 bool constant_p = is_gimple_min_invariant (new_name);
3229 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3230 for (i = 1; i < nunits; i++)
3231 {
3232 /* Create: new_name_i = new_name + step_expr */
3233 enum tree_code code = POINTER_TYPE_P (scalar_type)
3234 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3235 new_name = fold_build2 (code, scalar_type, new_name, step_expr);
3236 if (!is_gimple_min_invariant (new_name))
3237 {
3238 init_stmt = gimple_build_assign (new_var, new_name);
3239 new_name = make_ssa_name (new_var, init_stmt);
3240 gimple_assign_set_lhs (init_stmt, new_name);
3241 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3242 gcc_assert (!new_bb);
3243 if (dump_enabled_p ())
3244 {
3245 dump_printf_loc (MSG_NOTE, vect_location,
3246 "created new init_stmt: ");
3247 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3248 }
3249 constant_p = false;
3250 }
3251 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3252 }
3253 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3254 if (constant_p)
3255 new_vec = build_vector_from_ctor (vectype, v);
3256 else
3257 new_vec = build_constructor (vectype, v);
3258 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3259 }
3260
3261
3262 /* Create the vector that holds the step of the induction. */
3263 if (nested_in_vect_loop)
3264 /* iv_loop is nested in the loop to be vectorized. Generate:
3265 vec_step = [S, S, S, S] */
3266 new_name = step_expr;
3267 else
3268 {
3269 /* iv_loop is the loop to be vectorized. Generate:
3270 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3271 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3272 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3273 expr, step_expr);
3274 }
3275
3276 t = unshare_expr (new_name);
3277 gcc_assert (CONSTANT_CLASS_P (new_name));
3278 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3279 gcc_assert (stepvectype);
3280 new_vec = build_vector_from_val (stepvectype, t);
3281 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3282
3283
3284 /* Create the following def-use cycle:
3285 loop prolog:
3286 vec_init = ...
3287 vec_step = ...
3288 loop:
3289 vec_iv = PHI <vec_init, vec_loop>
3290 ...
3291 STMT
3292 ...
3293 vec_loop = vec_iv + vec_step; */
3294
3295 /* Create the induction-phi that defines the induction-operand. */
3296 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3297 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3298 set_vinfo_for_stmt (induction_phi,
3299 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3300 induc_def = PHI_RESULT (induction_phi);
3301
3302 /* Create the iv update inside the loop */
3303 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3304 induc_def, vec_step);
3305 vec_def = make_ssa_name (vec_dest, new_stmt);
3306 gimple_assign_set_lhs (new_stmt, vec_def);
3307 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3308 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3309 NULL));
3310
3311 /* Set the arguments of the phi node: */
3312 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3313 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3314 UNKNOWN_LOCATION);
3315
3316
3317 /* If the vectorization factor (VF) is bigger than the number
3318 of elements that we can fit in a vectype (nunits), we have to generate
3319 more than one vector stmt - i.e., we need to "unroll" the
3320 vector stmt by a factor VF/nunits. For more details see documentation
3321 in vectorizable_operation. */
3322
3323 if (ncopies > 1)
3324 {
3325 stmt_vec_info prev_stmt_vinfo;
3326 /* FORNOW. This restriction should be relaxed. */
3327 gcc_assert (!nested_in_vect_loop);
3328
3329 /* Create the vector that holds the step of the induction. */
3330 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3331 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3332 expr, step_expr);
3333 t = unshare_expr (new_name);
3334 gcc_assert (CONSTANT_CLASS_P (new_name));
3335 new_vec = build_vector_from_val (stepvectype, t);
3336 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3337
3338 vec_def = induc_def;
3339 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3340 for (i = 1; i < ncopies; i++)
3341 {
3342 /* vec_i = vec_prev + vec_step */
3343 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3344 vec_def, vec_step);
3345 vec_def = make_ssa_name (vec_dest, new_stmt);
3346 gimple_assign_set_lhs (new_stmt, vec_def);
3347
3348 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3349 if (!useless_type_conversion_p (resvectype, vectype))
3350 {
3351 new_stmt = gimple_build_assign_with_ops
3352 (VIEW_CONVERT_EXPR,
3353 vect_get_new_vect_var (resvectype, vect_simple_var,
3354 "vec_iv_"),
3355 build1 (VIEW_CONVERT_EXPR, resvectype,
3356 gimple_assign_lhs (new_stmt)), NULL_TREE);
3357 gimple_assign_set_lhs (new_stmt,
3358 make_ssa_name
3359 (gimple_assign_lhs (new_stmt), new_stmt));
3360 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3361 }
3362 set_vinfo_for_stmt (new_stmt,
3363 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3364 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3365 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3366 }
3367 }
3368
3369 if (nested_in_vect_loop)
3370 {
3371 /* Find the loop-closed exit-phi of the induction, and record
3372 the final vector of induction results: */
3373 exit_phi = NULL;
3374 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3375 {
3376 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3377 {
3378 exit_phi = USE_STMT (use_p);
3379 break;
3380 }
3381 }
3382 if (exit_phi)
3383 {
3384 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3385 /* FORNOW.  We do not yet support the case in which an inner-loop induction
3386 is not used in the outer-loop (i.e. is used only outside the outer-loop). */
3387 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3388 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3389
3390 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3391 if (dump_enabled_p ())
3392 {
3393 dump_printf_loc (MSG_NOTE, vect_location,
3394 "vector of inductions after inner-loop:");
3395 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3396 }
3397 }
3398 }
3399
3400
3401 if (dump_enabled_p ())
3402 {
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "transform induction: created def-use cycle: ");
3405 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3406 dump_printf (MSG_NOTE, "\n");
3407 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3408 SSA_NAME_DEF_STMT (vec_def), 0);
3409 }
3410
3411 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3412 if (!useless_type_conversion_p (resvectype, vectype))
3413 {
3414 new_stmt = gimple_build_assign_with_ops
3415 (VIEW_CONVERT_EXPR,
3416 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3417 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3418 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3419 gimple_assign_set_lhs (new_stmt, induc_def);
3420 si = gsi_after_labels (bb);
3421 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3422 set_vinfo_for_stmt (new_stmt,
3423 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3424 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3425 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3426 }
3427
3428 return induc_def;
3429 }
3430
3431
3432 /* Function get_initial_def_for_reduction
3433
3434 Input:
3435 STMT - a stmt that performs a reduction operation in the loop.
3436 INIT_VAL - the initial value of the reduction variable
3437
3438 Output:
3439 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3440 of the reduction (used for adjusting the epilog - see below).
3441 Return a vector variable, initialized according to the operation that STMT
3442 performs. This vector will be used as the initial value of the
3443 vector of partial results.
3444
3445 Option1 (adjust in epilog): Initialize the vector as follows:
3446 add/bit or/xor: [0,0,...,0,0]
3447 mult/bit and: [1,1,...,1,1]
3448 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3449 and when necessary (e.g. add/mult case) let the caller know
3450 that it needs to adjust the result by init_val.
3451
3452 Option2: Initialize the vector as follows:
3453 add/bit or/xor: [init_val,0,0,...,0]
3454 mult/bit and: [init_val,1,1,...,1]
3455 min/max/cond_expr: [init_val,init_val,...,init_val]
3456 and no adjustments are needed.
3457
3458 For example, for the following code:
3459
3460 s = init_val;
3461 for (i=0;i<n;i++)
3462 s = s + a[i];
3463
3464 STMT is 's = s + a[i]', and the reduction variable is 's'.
3465 For a vector of 4 units, we want to return either [0,0,0,init_val],
3466 or [0,0,0,0] and let the caller know that it needs to adjust
3467 the result at the end by 'init_val'.
3468
3469 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3470 is not NULL, because this way the initialization vector is simpler (the
3471 same element in all entries), and Option2 otherwise.
3472
3473 A cost model should help decide between these two schemes. */
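/* For example (illustrative only), for

     s = 10;
     for (i = 0; i < n; i++)
       s *= a[i];

   with a vector of 4 units, Option1 returns [1,1,1,1] and sets
   *ADJUSTMENT_DEF to 10 (the epilogue later adjusts the final result by
   it), whereas Option2 would return a vector with one element equal to 10
   and the remaining elements equal to 1, with no adjustment needed.  */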
3474
3475 tree
3476 get_initial_def_for_reduction (gimple stmt, tree init_val,
3477 tree *adjustment_def)
3478 {
3479 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3481 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3482 tree scalar_type = TREE_TYPE (init_val);
3483 tree vectype = get_vectype_for_scalar_type (scalar_type);
3484 int nunits;
3485 enum tree_code code = gimple_assign_rhs_code (stmt);
3486 tree def_for_init;
3487 tree init_def;
3488 tree *elts;
3489 int i;
3490 bool nested_in_vect_loop = false;
3491 tree init_value;
3492 REAL_VALUE_TYPE real_init_val = dconst0;
3493 int int_init_val = 0;
3494 gimple def_stmt = NULL;
3495
3496 gcc_assert (vectype);
3497 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3498
3499 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3500 || SCALAR_FLOAT_TYPE_P (scalar_type));
3501
3502 if (nested_in_vect_loop_p (loop, stmt))
3503 nested_in_vect_loop = true;
3504 else
3505 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3506
3507 /* In case of double reduction we only create a vector variable to be put
3508 in the reduction phi node. The actual statement creation is done in
3509 vect_create_epilog_for_reduction. */
3510 if (adjustment_def && nested_in_vect_loop
3511 && TREE_CODE (init_val) == SSA_NAME
3512 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3513 && gimple_code (def_stmt) == GIMPLE_PHI
3514 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3515 && vinfo_for_stmt (def_stmt)
3516 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3517 == vect_double_reduction_def)
3518 {
3519 *adjustment_def = NULL;
3520 return vect_create_destination_var (init_val, vectype);
3521 }
3522
3523 if (TREE_CONSTANT (init_val))
3524 {
3525 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3526 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3527 else
3528 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3529 }
3530 else
3531 init_value = init_val;
3532
3533 switch (code)
3534 {
3535 case WIDEN_SUM_EXPR:
3536 case DOT_PROD_EXPR:
3537 case PLUS_EXPR:
3538 case MINUS_EXPR:
3539 case BIT_IOR_EXPR:
3540 case BIT_XOR_EXPR:
3541 case MULT_EXPR:
3542 case BIT_AND_EXPR:
3543 /* ADJUSTMENT_DEF is NULL when called from
3544 vect_create_epilog_for_reduction to vectorize double reduction. */
3545 if (adjustment_def)
3546 {
3547 if (nested_in_vect_loop)
3548 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3549 NULL);
3550 else
3551 *adjustment_def = init_val;
3552 }
3553
3554 if (code == MULT_EXPR)
3555 {
3556 real_init_val = dconst1;
3557 int_init_val = 1;
3558 }
3559
3560 if (code == BIT_AND_EXPR)
3561 int_init_val = -1;
3562
3563 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3564 def_for_init = build_real (scalar_type, real_init_val);
3565 else
3566 def_for_init = build_int_cst (scalar_type, int_init_val);
3567
3568 /* Create a vector of '0' or '1' except the first element. */
3569 elts = XALLOCAVEC (tree, nunits);
3570 for (i = nunits - 2; i >= 0; --i)
3571 elts[i + 1] = def_for_init;
3572
3573 /* Option1: the first element is '0' or '1' as well. */
3574 if (adjustment_def)
3575 {
3576 elts[0] = def_for_init;
3577 init_def = build_vector (vectype, elts);
3578 break;
3579 }
3580
3581 /* Option2: the first element is INIT_VAL. */
3582 elts[0] = init_val;
3583 if (TREE_CONSTANT (init_val))
3584 init_def = build_vector (vectype, elts);
3585 else
3586 {
3587 vec<constructor_elt, va_gc> *v;
3588 vec_alloc (v, nunits);
3589 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3590 for (i = 1; i < nunits; ++i)
3591 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3592 init_def = build_constructor (vectype, v);
3593 }
3594
3595 break;
3596
3597 case MIN_EXPR:
3598 case MAX_EXPR:
3599 case COND_EXPR:
3600 if (adjustment_def)
3601 {
3602 *adjustment_def = NULL_TREE;
3603 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3604 break;
3605 }
3606
3607 init_def = build_vector_from_val (vectype, init_value);
3608 break;
3609
3610 default:
3611 gcc_unreachable ();
3612 }
3613
3614 return init_def;
3615 }
3616
3617
3618 /* Function vect_create_epilog_for_reduction
3619
3620 Create code at the loop-epilog to finalize the result of a reduction
3621 computation.
3622
3623 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3624 reduction statements.
3625 STMT is the scalar reduction stmt that is being vectorized.
3626 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3627 number of elements that we can fit in a vectype (nunits). In this case
3628 we have to generate more than one vector stmt - i.e - we need to "unroll"
3629 the vector stmt by a factor VF/nunits. For more details see documentation
3630 in vectorizable_operation.
3631 REDUC_CODE is the tree-code for the epilog reduction.
3632 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3633 computation.
3634 REDUC_INDEX is the index of the operand in the right hand side of the
3635 statement that is defined by REDUCTION_PHI.
3636 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3637 SLP_NODE is an SLP node containing a group of reduction statements. The
3638 first one in this group is STMT.
3639
3640 This function:
3641 1. Creates the reduction def-use cycles: sets the arguments for
3642 REDUCTION_PHIS:
3643 The loop-entry argument is the vectorized initial-value of the reduction.
3644 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3645 sums.
3646 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3647 by applying the operation specified by REDUC_CODE if available, or by
3648 other means (whole-vector shifts or a scalar loop).
3649 The function also creates a new phi node at the loop exit to preserve
3650 loop-closed form, as illustrated below.
3651
3652 The flow at the entry to this function:
3653
3654 loop:
3655 vec_def = phi <null, null> # REDUCTION_PHI
3656 VECT_DEF = vector_stmt # vectorized form of STMT
3657 s_loop = scalar_stmt # (scalar) STMT
3658 loop_exit:
3659 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3660 use <s_out0>
3661 use <s_out0>
3662
3663 The above is transformed by this function into:
3664
3665 loop:
3666 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3667 VECT_DEF = vector_stmt # vectorized form of STMT
3668 s_loop = scalar_stmt # (scalar) STMT
3669 loop_exit:
3670 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3671 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3672 v_out2 = reduce <v_out1>
3673 s_out3 = extract_field <v_out2, 0>
3674 s_out4 = adjust_result <s_out3>
3675 use <s_out4>
3676 use <s_out4>
3677 */
3678
3679 static void
3680 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3681 int ncopies, enum tree_code reduc_code,
3682 vec<gimple> reduction_phis,
3683 int reduc_index, bool double_reduc,
3684 slp_tree slp_node)
3685 {
3686 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3687 stmt_vec_info prev_phi_info;
3688 tree vectype;
3689 enum machine_mode mode;
3690 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3691 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3692 basic_block exit_bb;
3693 tree scalar_dest;
3694 tree scalar_type;
3695 gimple new_phi = NULL, phi;
3696 gimple_stmt_iterator exit_gsi;
3697 tree vec_dest;
3698 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3699 gimple epilog_stmt = NULL;
3700 enum tree_code code = gimple_assign_rhs_code (stmt);
3701 gimple exit_phi;
3702 tree bitsize, bitpos;
3703 tree adjustment_def = NULL;
3704 tree vec_initial_def = NULL;
3705 tree reduction_op, expr, def;
3706 tree orig_name, scalar_result;
3707 imm_use_iterator imm_iter, phi_imm_iter;
3708 use_operand_p use_p, phi_use_p;
3709 bool extract_scalar_result = false;
3710 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3711 bool nested_in_vect_loop = false;
3712 vec<gimple> new_phis = vNULL;
3713 vec<gimple> inner_phis = vNULL;
3714 enum vect_def_type dt = vect_unknown_def_type;
3715 int j, i;
3716 vec<tree> scalar_results = vNULL;
3717 unsigned int group_size = 1, k, ratio;
3718 vec<tree> vec_initial_defs = vNULL;
3719 vec<gimple> phis;
3720 bool slp_reduc = false;
3721 tree new_phi_result;
3722 gimple inner_phi = NULL;
3723
3724 if (slp_node)
3725 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3726
3727 if (nested_in_vect_loop_p (loop, stmt))
3728 {
3729 outer_loop = loop;
3730 loop = loop->inner;
3731 nested_in_vect_loop = true;
3732 gcc_assert (!slp_node);
3733 }
3734
3735 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3736 {
3737 case GIMPLE_SINGLE_RHS:
3738 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3739 == ternary_op);
3740 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3741 break;
3742 case GIMPLE_UNARY_RHS:
3743 reduction_op = gimple_assign_rhs1 (stmt);
3744 break;
3745 case GIMPLE_BINARY_RHS:
3746 reduction_op = reduc_index ?
3747 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3748 break;
3749 case GIMPLE_TERNARY_RHS:
3750 reduction_op = gimple_op (stmt, reduc_index + 1);
3751 break;
3752 default:
3753 gcc_unreachable ();
3754 }
3755
3756 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3757 gcc_assert (vectype);
3758 mode = TYPE_MODE (vectype);
3759
3760 /* 1. Create the reduction def-use cycle:
3761 Set the arguments of REDUCTION_PHIS, i.e., transform
3762
3763 loop:
3764 vec_def = phi <null, null> # REDUCTION_PHI
3765 VECT_DEF = vector_stmt # vectorized form of STMT
3766 ...
3767
3768 into:
3769
3770 loop:
3771 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3772 VECT_DEF = vector_stmt # vectorized form of STMT
3773 ...
3774
3775 (in case of SLP, do it for all the phis). */
3776
3777 /* Get the loop-entry arguments. */
3778 if (slp_node)
3779 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3780 NULL, slp_node, reduc_index);
3781 else
3782 {
3783 vec_initial_defs.create (1);
3784 /* For the case of reduction, vect_get_vec_def_for_operand returns
3785 the scalar def before the loop, that defines the initial value
3786 of the reduction variable. */
3787 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3788 &adjustment_def);
3789 vec_initial_defs.quick_push (vec_initial_def);
3790 }
3791
3792 /* Set phi nodes arguments. */
3793 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3794 {
3795 tree vec_init_def = vec_initial_defs[i];
3796 tree def = vect_defs[i];
3797 for (j = 0; j < ncopies; j++)
3798 {
3799 /* Set the loop-entry arg of the reduction-phi. */
3800 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3801 UNKNOWN_LOCATION);
3802
3803 /* Set the loop-latch arg for the reduction-phi. */
3804 if (j > 0)
3805 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3806
3807 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3808
3809 if (dump_enabled_p ())
3810 {
3811 dump_printf_loc (MSG_NOTE, vect_location,
3812 "transform reduction: created def-use cycle: ");
3813 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3814 dump_printf (MSG_NOTE, "\n");
3815 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3816 }
3817
3818 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3819 }
3820 }
3821
3822 vec_initial_defs.release ();
3823
3824 /* 2. Create epilog code.
3825 The reduction epilog code operates across the elements of the vector
3826 of partial results computed by the vectorized loop.
3827 The reduction epilog code consists of:
3828
3829 step 1: compute the scalar result in a vector (v_out2)
3830 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3831 step 3: adjust the scalar result (s_out3) if needed.
3832
3833 Step 1 can be accomplished using one of the following three schemes:
3834 (scheme 1) using reduc_code, if available.
3835 (scheme 2) using whole-vector shifts, if available.
3836 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3837 combined.
3838
3839 The overall epilog code looks like this:
3840
3841 s_out0 = phi <s_loop> # original EXIT_PHI
3842 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3843 v_out2 = reduce <v_out1> # step 1
3844 s_out3 = extract_field <v_out2, 0> # step 2
3845 s_out4 = adjust_result <s_out3> # step 3
3846
3847 (step 3 is optional, and steps 1 and 2 may be combined).
3848 Lastly, the uses of s_out0 are replaced by s_out4. */
3849
3850
3851 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3852 v_out1 = phi <VECT_DEF>
3853 Store them in NEW_PHIS. */
3854
3855 exit_bb = single_exit (loop)->dest;
3856 prev_phi_info = NULL;
3857 new_phis.create (vect_defs.length ());
3858 FOR_EACH_VEC_ELT (vect_defs, i, def)
3859 {
3860 for (j = 0; j < ncopies; j++)
3861 {
3862 tree new_def = copy_ssa_name (def, NULL);
3863 phi = create_phi_node (new_def, exit_bb);
3864 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3865 if (j == 0)
3866 new_phis.quick_push (phi);
3867 else
3868 {
3869 def = vect_get_vec_def_for_stmt_copy (dt, def);
3870 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3871 }
3872
3873 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3874 prev_phi_info = vinfo_for_stmt (phi);
3875 }
3876 }
3877
3878 /* The epilogue is created for the outer-loop, i.e., for the loop being
3879 vectorized. Create exit phis for the outer loop. */
3880 if (double_reduc)
3881 {
3882 loop = outer_loop;
3883 exit_bb = single_exit (loop)->dest;
3884 inner_phis.create (vect_defs.length ());
3885 FOR_EACH_VEC_ELT (new_phis, i, phi)
3886 {
3887 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3888 gimple outer_phi = create_phi_node (new_result, exit_bb);
3889 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3890 PHI_RESULT (phi));
3891 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3892 loop_vinfo, NULL));
3893 inner_phis.quick_push (phi);
3894 new_phis[i] = outer_phi;
3895 prev_phi_info = vinfo_for_stmt (outer_phi);
3896 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3897 {
3898 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3899 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3900 outer_phi = create_phi_node (new_result, exit_bb);
3901 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3902 PHI_RESULT (phi));
3903 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3904 loop_vinfo, NULL));
3905 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3906 prev_phi_info = vinfo_for_stmt (outer_phi);
3907 }
3908 }
3909 }
3910
3911 exit_gsi = gsi_after_labels (exit_bb);
3912
3913 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3914 (i.e. when reduc_code is not available) and in the final adjustment
3915 code (if needed). Also get the original scalar reduction variable as
3916 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3917 represents a reduction pattern), the tree-code and scalar-def are
3918 taken from the original stmt that the pattern-stmt (STMT) replaces.
3919 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3920 are taken from STMT. */
3921
3922 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3923 if (!orig_stmt)
3924 {
3925 /* Regular reduction */
3926 orig_stmt = stmt;
3927 }
3928 else
3929 {
3930 /* Reduction pattern */
3931 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3932 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3933 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3934 }
3935
3936 code = gimple_assign_rhs_code (orig_stmt);
3937 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3938 partial results are added and not subtracted. */
3939 if (code == MINUS_EXPR)
3940 code = PLUS_EXPR;
3941
3942 scalar_dest = gimple_assign_lhs (orig_stmt);
3943 scalar_type = TREE_TYPE (scalar_dest);
3944 scalar_results.create (group_size);
3945 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3946 bitsize = TYPE_SIZE (scalar_type);
3947
3948 /* In case this is a reduction in an inner-loop while vectorizing an outer
3949 loop - we don't need to extract a single scalar result at the end of the
3950 inner-loop (unless it is double reduction, i.e., the use of reduction is
3951 outside the outer-loop). The final vector of partial results will be used
3952 in the vectorized outer-loop, or reduced to a scalar result at the end of
3953 the outer-loop. */
3954 if (nested_in_vect_loop && !double_reduc)
3955 goto vect_finalize_reduction;
3956
3957 /* SLP reduction without reduction chain, e.g.,
3958 # a1 = phi <a2, a0>
3959 # b1 = phi <b2, b0>
3960 a2 = operation (a1)
3961 b2 = operation (b1) */
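   /* At the source level this corresponds, e.g., to several independent
      reductions in the same loop that SLP groups together:
        for (i=0; i<n; i++)
          {
            suma += a[i];
            sumb += b[i];
          }  */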
3962 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3963
3964 /* In case of reduction chain, e.g.,
3965 # a1 = phi <a3, a0>
3966 a2 = operation (a1)
3967 a3 = operation (a2),
3968
3969 we may end up with more than one vector result. Here we reduce them to
3970 one vector. */
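   /* At the source level a reduction chain corresponds, e.g., to a single
      reduction that is updated more than once per iteration:
        for (i=0; i<n; i++)
          {
            s += a[2*i];
            s += a[2*i+1];
          }  */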
3971 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3972 {
3973 tree first_vect = PHI_RESULT (new_phis[0]);
3974 tree tmp;
3975 gimple new_vec_stmt = NULL;
3976
3977 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3978 for (k = 1; k < new_phis.length (); k++)
3979 {
3980 gimple next_phi = new_phis[k];
3981 tree second_vect = PHI_RESULT (next_phi);
3982
3983 tmp = build2 (code, vectype, first_vect, second_vect);
3984 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3985 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3986 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3987 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3988 }
3989
3990 new_phi_result = first_vect;
3991 if (new_vec_stmt)
3992 {
3993 new_phis.truncate (0);
3994 new_phis.safe_push (new_vec_stmt);
3995 }
3996 }
3997 else
3998 new_phi_result = PHI_RESULT (new_phis[0]);
3999
4000 /* 2.3 Create the reduction code, using one of the three schemes described
4001 above. In SLP we simply need to extract all the elements from the
4002 vector (without reducing them), so we use the scalar-code scheme (case 3). */
4003 if (reduc_code != ERROR_MARK && !slp_reduc)
4004 {
4005 tree tmp;
4006
4007 /*** Case 1: Create:
4008 v_out2 = reduc_expr <v_out1> */
4009
4010 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_NOTE, vect_location,
4012 "Reduce using direct vector reduction.");
4013
4014 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4015 tmp = build1 (reduc_code, vectype, new_phi_result);
4016 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4017 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4018 gimple_assign_set_lhs (epilog_stmt, new_temp);
4019 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4020
4021 extract_scalar_result = true;
4022 }
4023 else
4024 {
4025 enum tree_code shift_code = ERROR_MARK;
4026 bool have_whole_vector_shift = true;
4027 int bit_offset;
4028 int element_bitsize = tree_low_cst (bitsize, 1);
4029 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4030 tree vec_temp;
4031
4032 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4033 shift_code = VEC_RSHIFT_EXPR;
4034 else
4035 have_whole_vector_shift = false;
4036
4037 /* Regardless of whether we have a whole vector shift, if we're
4038 emulating the operation via tree-vect-generic, we don't want
4039 to use it. Only the first round of the reduction is likely
4040 to still be profitable via emulation. */
4041 /* ??? It might be better to emit a reduction tree code here, so that
4042 tree-vect-generic can expand the first round via bit tricks. */
4043 if (!VECTOR_MODE_P (mode))
4044 have_whole_vector_shift = false;
4045 else
4046 {
4047 optab optab = optab_for_tree_code (code, vectype, optab_default);
4048 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4049 have_whole_vector_shift = false;
4050 }
4051
4052 if (have_whole_vector_shift && !slp_reduc)
4053 {
4054 /*** Case 2: Create:
4055 for (offset = VS/2; offset >= element_size; offset/=2)
4056 {
4057 Create: va' = vec_shift <va, offset>
4058 Create: va = vop <va, va'>
4059 } */
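          /* For example, for V4SI addition (vec_size_in_bits == 128,
             element_bitsize == 32) this emits two shift/add pairs:
               va' = vec_shift <va, 64>;  va = va + va';
               va' = vec_shift <va, 32>;  va = va + va';
             after which a single element of va holds the sum of all four
             elements and is extracted in step 2.4 below.  */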
4060
4061 if (dump_enabled_p ())
4062 dump_printf_loc (MSG_NOTE, vect_location,
4063 "Reduce using vector shifts");
4064
4065 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4066 new_temp = new_phi_result;
4067 for (bit_offset = vec_size_in_bits/2;
4068 bit_offset >= element_bitsize;
4069 bit_offset /= 2)
4070 {
4071 tree bitpos = size_int (bit_offset);
4072
4073 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4074 vec_dest, new_temp, bitpos);
4075 new_name = make_ssa_name (vec_dest, epilog_stmt);
4076 gimple_assign_set_lhs (epilog_stmt, new_name);
4077 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4078
4079 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4080 new_name, new_temp);
4081 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4082 gimple_assign_set_lhs (epilog_stmt, new_temp);
4083 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4084 }
4085
4086 extract_scalar_result = true;
4087 }
4088 else
4089 {
4090 tree rhs;
4091
4092 /*** Case 3: Create:
4093 s = extract_field <v_out2, 0>
4094 for (offset = element_size;
4095 offset < vector_size;
4096 offset += element_size;)
4097 {
4098 Create: s' = extract_field <v_out2, offset>
4099 Create: s = op <s, s'> // For non SLP cases
4100 } */
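          /* For example, for V4SI addition this amounts to:
               s = v_out2[0];
               s = s + v_out2[1];
               s = s + v_out2[2];
               s = s + v_out2[3];
             where each element is extracted with a BIT_FIELD_REF.  In the
             SLP case only the extractions are emitted and the extracted
             values are collected in SCALAR_RESULTS.  */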
4101
4102 if (dump_enabled_p ())
4103 dump_printf_loc (MSG_NOTE, vect_location,
4104 "Reduce using scalar code. ");
4105
4106 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4107 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4108 {
4109 if (gimple_code (new_phi) == GIMPLE_PHI)
4110 vec_temp = PHI_RESULT (new_phi);
4111 else
4112 vec_temp = gimple_assign_lhs (new_phi);
4113 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4114 bitsize_zero_node);
4115 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4116 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4117 gimple_assign_set_lhs (epilog_stmt, new_temp);
4118 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4119
4120 /* In SLP we don't need to apply the reduction operation, so we just
4121 collect s' values in SCALAR_RESULTS. */
4122 if (slp_reduc)
4123 scalar_results.safe_push (new_temp);
4124
4125 for (bit_offset = element_bitsize;
4126 bit_offset < vec_size_in_bits;
4127 bit_offset += element_bitsize)
4128 {
4129 tree bitpos = bitsize_int (bit_offset);
4130 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4131 bitsize, bitpos);
4132
4133 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4134 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4135 gimple_assign_set_lhs (epilog_stmt, new_name);
4136 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4137
4138 if (slp_reduc)
4139 {
4140 /* In SLP we don't need to apply the reduction operation, so
4141 we just collect s' values in SCALAR_RESULTS. */
4142 new_temp = new_name;
4143 scalar_results.safe_push (new_name);
4144 }
4145 else
4146 {
4147 epilog_stmt = gimple_build_assign_with_ops (code,
4148 new_scalar_dest, new_name, new_temp);
4149 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4150 gimple_assign_set_lhs (epilog_stmt, new_temp);
4151 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4152 }
4153 }
4154 }
4155
4156 /* The only case where we need to reduce scalar results in SLP is
4157 unrolling. If the size of SCALAR_RESULTS is greater than
4158 GROUP_SIZE, we reduce them by combining elements modulo
4159 GROUP_SIZE. */
4160 if (slp_reduc)
4161 {
4162 tree res, first_res, new_res;
4163 gimple new_stmt;
4164
4165 /* Reduce multiple scalar results in case of SLP unrolling. */
4166 for (j = group_size; scalar_results.iterate (j, &res);
4167 j++)
4168 {
4169 first_res = scalar_results[j % group_size];
4170 new_stmt = gimple_build_assign_with_ops (code,
4171 new_scalar_dest, first_res, res);
4172 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4173 gimple_assign_set_lhs (new_stmt, new_res);
4174 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4175 scalar_results[j % group_size] = new_res;
4176 }
4177 }
4178 else
4179 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4180 scalar_results.safe_push (new_temp);
4181
4182 extract_scalar_result = false;
4183 }
4184 }
4185
4186 /* 2.4 Extract the final scalar result. Create:
4187 s_out3 = extract_field <v_out2, bitpos> */
4188
4189 if (extract_scalar_result)
4190 {
4191 tree rhs;
4192
4193 if (dump_enabled_p ())
4194 dump_printf_loc (MSG_NOTE, vect_location,
4195 "extract scalar result");
4196
4197 if (BYTES_BIG_ENDIAN)
4198 bitpos = size_binop (MULT_EXPR,
4199 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4200 TYPE_SIZE (scalar_type));
4201 else
4202 bitpos = bitsize_zero_node;
4203
4204 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4205 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4206 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4207 gimple_assign_set_lhs (epilog_stmt, new_temp);
4208 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4209 scalar_results.safe_push (new_temp);
4210 }
4211
4212 vect_finalize_reduction:
4213
4214 if (double_reduc)
4215 loop = loop->inner;
4216
4217 /* 2.5 Adjust the final result by the initial value of the reduction
4218 variable. (When such adjustment is not needed, then
4219 'adjustment_def' is zero). For example, if code is PLUS we create:
4220 new_temp = loop_exit_def + adjustment_def */
4221
4222 if (adjustment_def)
4223 {
4224 gcc_assert (!slp_reduc);
4225 if (nested_in_vect_loop)
4226 {
4227 new_phi = new_phis[0];
4228 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4229 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4230 new_dest = vect_create_destination_var (scalar_dest, vectype);
4231 }
4232 else
4233 {
4234 new_temp = scalar_results[0];
4235 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4236 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4237 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4238 }
4239
4240 epilog_stmt = gimple_build_assign (new_dest, expr);
4241 new_temp = make_ssa_name (new_dest, epilog_stmt);
4242 gimple_assign_set_lhs (epilog_stmt, new_temp);
4243 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4245 if (nested_in_vect_loop)
4246 {
4247 set_vinfo_for_stmt (epilog_stmt,
4248 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4249 NULL));
4250 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4251 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4252
4253 if (!double_reduc)
4254 scalar_results.quick_push (new_temp);
4255 else
4256 scalar_results[0] = new_temp;
4257 }
4258 else
4259 scalar_results[0] = new_temp;
4260
4261 new_phis[0] = epilog_stmt;
4262 }
4263
4264 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4265 phis with new adjusted scalar results, i.e., replace use <s_out0>
4266 with use <s_out4>.
4267
4268 Transform:
4269 loop_exit:
4270 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4271 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4272 v_out2 = reduce <v_out1>
4273 s_out3 = extract_field <v_out2, 0>
4274 s_out4 = adjust_result <s_out3>
4275 use <s_out0>
4276 use <s_out0>
4277
4278 into:
4279
4280 loop_exit:
4281 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4282 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4283 v_out2 = reduce <v_out1>
4284 s_out3 = extract_field <v_out2, 0>
4285 s_out4 = adjust_result <s_out3>
4286 use <s_out4>
4287 use <s_out4> */
4288
4289
4290 /* In an SLP reduction chain we reduce vector results into one vector if
4291 necessary, hence here we set GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4292 the last stmt in the reduction chain, since we are looking for the loop
4293 exit phi node. */
4294 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4295 {
4296 scalar_dest = gimple_assign_lhs (
4297 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4298 group_size = 1;
4299 }
4300
4301 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4302 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4303 need to match SCALAR_RESULTS with corresponding statements. The first
4304 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4305 the first vector stmt, etc.
4306 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4307 if (group_size > new_phis.length ())
4308 {
4309 ratio = group_size / new_phis.length ();
4310 gcc_assert (!(group_size % new_phis.length ()));
4311 }
4312 else
4313 ratio = 1;
4314
4315 for (k = 0; k < group_size; k++)
4316 {
4317 if (k % ratio == 0)
4318 {
4319 epilog_stmt = new_phis[k / ratio];
4320 reduction_phi = reduction_phis[k / ratio];
4321 if (double_reduc)
4322 inner_phi = inner_phis[k / ratio];
4323 }
4324
4325 if (slp_reduc)
4326 {
4327 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4328
4329 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4330 /* SLP statements can't participate in patterns. */
4331 gcc_assert (!orig_stmt);
4332 scalar_dest = gimple_assign_lhs (current_stmt);
4333 }
4334
4335 phis.create (3);
4336 /* Find the loop-closed-use at the loop exit of the original scalar
4337 result. (The reduction result is expected to have two immediate uses -
4338 one at the latch block, and one at the loop exit). */
4339 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4340 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4341 phis.safe_push (USE_STMT (use_p));
4342
4343 /* We expect to have found an exit_phi because of loop-closed-ssa
4344 form. */
4345 gcc_assert (!phis.is_empty ());
4346
4347 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4348 {
4349 if (outer_loop)
4350 {
4351 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4352 gimple vect_phi;
4353
4354 /* FORNOW. Currently not supporting the case that an inner-loop
4355 reduction is not used in the outer-loop (but only outside the
4356 outer-loop), unless it is a double reduction. */
4357 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4358 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4359 || double_reduc);
4360
4361 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4362 if (!double_reduc
4363 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4364 != vect_double_reduction_def)
4365 continue;
4366
4367 /* Handle double reduction:
4368
4369 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4370 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4371 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4372 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4373
4374 At that point the regular reduction (stmt2 and stmt3) is
4375 already vectorized, as well as the exit phi node, stmt4.
4376 Here we vectorize the phi node of double reduction, stmt1, and
4377 update all relevant statements. */
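              /* Such a cycle typically comes from source code like:

                   s = s0;
                   for (j = 0; j < M; j++)      <-- outer loop
                     for (i = 0; i < N; i++)    <-- inner loop
                       s += a[j][i];
                   ... s used only after the loop nest ...

                 where the result of the inner-loop reduction is carried
                 around the outer loop and used only outside it.  */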
4378
4379 /* Go through all the uses of s2 to find double reduction phi
4380 node, i.e., stmt1 above. */
4381 orig_name = PHI_RESULT (exit_phi);
4382 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4383 {
4384 stmt_vec_info use_stmt_vinfo;
4385 stmt_vec_info new_phi_vinfo;
4386 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4387 basic_block bb = gimple_bb (use_stmt);
4388 gimple use;
4389
4390 /* Check that USE_STMT is really a double reduction phi
4391 node. */
4392 if (gimple_code (use_stmt) != GIMPLE_PHI
4393 || gimple_phi_num_args (use_stmt) != 2
4394 || bb->loop_father != outer_loop)
4395 continue;
4396 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4397 if (!use_stmt_vinfo
4398 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4399 != vect_double_reduction_def)
4400 continue;
4401
4402 /* Create vector phi node for double reduction:
4403 vs1 = phi <vs0, vs2>
4404 vs1 was created previously in this function by a call to
4405 vect_get_vec_def_for_operand and is stored in
4406 vec_initial_def;
4407 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4408 vs0 is created here. */
4409
4410 /* Create vector phi node. */
4411 vect_phi = create_phi_node (vec_initial_def, bb);
4412 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4413 loop_vec_info_for_loop (outer_loop), NULL);
4414 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4415
4416 /* Create vs0 - initial def of the double reduction phi. */
4417 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4418 loop_preheader_edge (outer_loop));
4419 init_def = get_initial_def_for_reduction (stmt,
4420 preheader_arg, NULL);
4421 vect_phi_init = vect_init_vector (use_stmt, init_def,
4422 vectype, NULL);
4423
4424 /* Update phi node arguments with vs0 and vs2. */
4425 add_phi_arg (vect_phi, vect_phi_init,
4426 loop_preheader_edge (outer_loop),
4427 UNKNOWN_LOCATION);
4428 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4429 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4430 if (dump_enabled_p ())
4431 {
4432 dump_printf_loc (MSG_NOTE, vect_location,
4433 "created double reduction phi node: ");
4434 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4435 }
4436
4437 vect_phi_res = PHI_RESULT (vect_phi);
4438
4439 /* Replace the use, i.e., set the correct vs1 in the regular
4440 reduction phi node. FORNOW, NCOPIES is always 1, so the
4441 loop is redundant. */
4442 use = reduction_phi;
4443 for (j = 0; j < ncopies; j++)
4444 {
4445 edge pr_edge = loop_preheader_edge (loop);
4446 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4447 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4448 }
4449 }
4450 }
4451 }
4452
4453 phis.release ();
4454 if (nested_in_vect_loop)
4455 {
4456 if (double_reduc)
4457 loop = outer_loop;
4458 else
4459 continue;
4460 }
4461
4462 phis.create (3);
4463 /* Find the loop-closed-use at the loop exit of the original scalar
4464 result. (The reduction result is expected to have two immediate uses,
4465 one at the latch block, and one at the loop exit). For double
4466 reductions we are looking for exit phis of the outer loop. */
4467 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4468 {
4469 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4470 phis.safe_push (USE_STMT (use_p));
4471 else
4472 {
4473 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4474 {
4475 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4476
4477 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4478 {
4479 if (!flow_bb_inside_loop_p (loop,
4480 gimple_bb (USE_STMT (phi_use_p))))
4481 phis.safe_push (USE_STMT (phi_use_p));
4482 }
4483 }
4484 }
4485 }
4486
4487 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4488 {
4489 /* Replace the uses: */
4490 orig_name = PHI_RESULT (exit_phi);
4491 scalar_result = scalar_results[k];
4492 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4493 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4494 SET_USE (use_p, scalar_result);
4495 }
4496
4497 phis.release ();
4498 }
4499
4500 scalar_results.release ();
4501 inner_phis.release ();
4502 new_phis.release ();
4503 }
4504
4505
4506 /* Function vectorizable_reduction.
4507
4508 Check if STMT performs a reduction operation that can be vectorized.
4509 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4510 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4511 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4512
4513 This function also handles reduction idioms (patterns) that have been
4514 recognized in advance during vect_pattern_recog. In this case, STMT may be
4515 of this form:
4516 X = pattern_expr (arg0, arg1, ..., X)
4517 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4518 sequence that had been detected and replaced by the pattern-stmt (STMT).
4519
4520 In some cases of reduction patterns, the type of the reduction variable X is
4521 different than the type of the other arguments of STMT.
4522 In such cases, the vectype that is used when transforming STMT into a vector
4523 stmt is different than the vectype that is used to determine the
4524 vectorization factor, because it consists of a different number of elements
4525 than the actual number of elements that are being operated upon in parallel.
4526
4527 For example, consider an accumulation of shorts into an int accumulator.
4528 On some targets it's possible to vectorize this pattern operating on 8
4529 shorts at a time (hence, the vectype for purposes of determining the
4530 vectorization factor should be V8HI); on the other hand, the vectype that
4531 is used to create the vector form is actually V4SI (the type of the result).
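
   At the source level this case corresponds to, e.g.:

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   which vect_pattern_recog may replace by a WIDEN_SUM_EXPR pattern stmt.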
4532
4533 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4534 indicates the actual level of parallelism (V8HI in the example), so
4535 that the right vectorization factor is derived. This vectype
4536 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4537 be used to create the vectorized stmt. The right vectype for the vectorized
4538 stmt is obtained from the type of the result X:
4539 get_vectype_for_scalar_type (TREE_TYPE (X))
4540
4541 This means that, contrary to "regular" reductions (or "regular" stmts in
4542 general), the following equation:
4543 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4544 does *NOT* necessarily hold for reduction patterns. */
4545
4546 bool
4547 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4548 gimple *vec_stmt, slp_tree slp_node)
4549 {
4550 tree vec_dest;
4551 tree scalar_dest;
4552 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4553 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4554 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4555 tree vectype_in = NULL_TREE;
4556 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4557 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4558 enum tree_code code, orig_code, epilog_reduc_code;
4559 enum machine_mode vec_mode;
4560 int op_type;
4561 optab optab, reduc_optab;
4562 tree new_temp = NULL_TREE;
4563 tree def;
4564 gimple def_stmt;
4565 enum vect_def_type dt;
4566 gimple new_phi = NULL;
4567 tree scalar_type;
4568 bool is_simple_use;
4569 gimple orig_stmt;
4570 stmt_vec_info orig_stmt_info;
4571 tree expr = NULL_TREE;
4572 int i;
4573 int ncopies;
4574 int epilog_copies;
4575 stmt_vec_info prev_stmt_info, prev_phi_info;
4576 bool single_defuse_cycle = false;
4577 tree reduc_def = NULL_TREE;
4578 gimple new_stmt = NULL;
4579 int j;
4580 tree ops[3];
4581 bool nested_cycle = false, found_nested_cycle_def = false;
4582 gimple reduc_def_stmt = NULL;
4583 /* The default is that the reduction variable is the last operand in the statement. */
4584 int reduc_index = 2;
4585 bool double_reduc = false, dummy;
4586 basic_block def_bb;
4587 struct loop * def_stmt_loop, *outer_loop = NULL;
4588 tree def_arg;
4589 gimple def_arg_stmt;
4590 vec<tree> vec_oprnds0 = vNULL;
4591 vec<tree> vec_oprnds1 = vNULL;
4592 vec<tree> vect_defs = vNULL;
4593 vec<gimple> phis = vNULL;
4594 int vec_num;
4595 tree def0, def1, tem, op0, op1 = NULL_TREE;
4596
4597 /* In case of reduction chain we switch to the first stmt in the chain, but
4598 we don't update STMT_INFO, since only the last stmt is marked as reduction
4599 and has reduction properties. */
4600 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4601 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4602
4603 if (nested_in_vect_loop_p (loop, stmt))
4604 {
4605 outer_loop = loop;
4606 loop = loop->inner;
4607 nested_cycle = true;
4608 }
4609
4610 /* 1. Is vectorizable reduction? */
4611 /* Not supportable if the reduction variable is used in the loop, unless
4612 it's a reduction chain. */
4613 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4614 && !GROUP_FIRST_ELEMENT (stmt_info))
4615 return false;
4616
4617 /* Reductions that are not used even in an enclosing outer-loop
4618 are expected to be "live" (used out of the loop). */
4619 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4620 && !STMT_VINFO_LIVE_P (stmt_info))
4621 return false;
4622
4623 /* Make sure it was already recognized as a reduction computation. */
4624 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4625 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4626 return false;
4627
4628 /* 2. Has this been recognized as a reduction pattern?
4629
4630 Check if STMT represents a pattern that has been recognized
4631 in earlier analysis stages. For stmts that represent a pattern,
4632 the STMT_VINFO_RELATED_STMT field records the last stmt in
4633 the original sequence that constitutes the pattern. */
4634
4635 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4636 if (orig_stmt)
4637 {
4638 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4639 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4640 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4641 }
4642
4643 /* 3. Check the operands of the operation. The first operands are defined
4644 inside the loop body. The last operand is the reduction variable,
4645 which is defined by the loop-header-phi. */
4646
4647 gcc_assert (is_gimple_assign (stmt));
4648
4649 /* Flatten RHS. */
4650 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4651 {
4652 case GIMPLE_SINGLE_RHS:
4653 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4654 if (op_type == ternary_op)
4655 {
4656 tree rhs = gimple_assign_rhs1 (stmt);
4657 ops[0] = TREE_OPERAND (rhs, 0);
4658 ops[1] = TREE_OPERAND (rhs, 1);
4659 ops[2] = TREE_OPERAND (rhs, 2);
4660 code = TREE_CODE (rhs);
4661 }
4662 else
4663 return false;
4664 break;
4665
4666 case GIMPLE_BINARY_RHS:
4667 code = gimple_assign_rhs_code (stmt);
4668 op_type = TREE_CODE_LENGTH (code);
4669 gcc_assert (op_type == binary_op);
4670 ops[0] = gimple_assign_rhs1 (stmt);
4671 ops[1] = gimple_assign_rhs2 (stmt);
4672 break;
4673
4674 case GIMPLE_TERNARY_RHS:
4675 code = gimple_assign_rhs_code (stmt);
4676 op_type = TREE_CODE_LENGTH (code);
4677 gcc_assert (op_type == ternary_op);
4678 ops[0] = gimple_assign_rhs1 (stmt);
4679 ops[1] = gimple_assign_rhs2 (stmt);
4680 ops[2] = gimple_assign_rhs3 (stmt);
4681 break;
4682
4683 case GIMPLE_UNARY_RHS:
4684 return false;
4685
4686 default:
4687 gcc_unreachable ();
4688 }
4689
4690 if (code == COND_EXPR && slp_node)
4691 return false;
4692
4693 scalar_dest = gimple_assign_lhs (stmt);
4694 scalar_type = TREE_TYPE (scalar_dest);
4695 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4696 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4697 return false;
4698
4699 /* Do not try to vectorize bit-precision reductions. */
4700 if ((TYPE_PRECISION (scalar_type)
4701 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4702 return false;
4703
4704 /* All uses but the last are expected to be defined in the loop.
4705 The last use is the reduction variable. In case of nested cycle this
4706 assumption is not true: we use reduc_index to record the index of the
4707 reduction variable. */
4708 for (i = 0; i < op_type - 1; i++)
4709 {
4710 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4711 if (i == 0 && code == COND_EXPR)
4712 continue;
4713
4714 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4715 &def_stmt, &def, &dt, &tem);
4716 if (!vectype_in)
4717 vectype_in = tem;
4718 gcc_assert (is_simple_use);
4719
4720 if (dt != vect_internal_def
4721 && dt != vect_external_def
4722 && dt != vect_constant_def
4723 && dt != vect_induction_def
4724 && !(dt == vect_nested_cycle && nested_cycle))
4725 return false;
4726
4727 if (dt == vect_nested_cycle)
4728 {
4729 found_nested_cycle_def = true;
4730 reduc_def_stmt = def_stmt;
4731 reduc_index = i;
4732 }
4733 }
4734
4735 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4736 &def_stmt, &def, &dt, &tem);
4737 if (!vectype_in)
4738 vectype_in = tem;
4739 gcc_assert (is_simple_use);
4740 if (!(dt == vect_reduction_def
4741 || dt == vect_nested_cycle
4742 || ((dt == vect_internal_def || dt == vect_external_def
4743 || dt == vect_constant_def || dt == vect_induction_def)
4744 && nested_cycle && found_nested_cycle_def)))
4745 {
4746 /* For pattern recognized stmts, orig_stmt might be a reduction,
4747 but some helper statements for the pattern might not, or
4748 might be COND_EXPRs with reduction uses in the condition. */
4749 gcc_assert (orig_stmt);
4750 return false;
4751 }
4752 if (!found_nested_cycle_def)
4753 reduc_def_stmt = def_stmt;
4754
4755 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4756 if (orig_stmt)
4757 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4758 reduc_def_stmt,
4759 !nested_cycle,
4760 &dummy));
4761 else
4762 {
4763 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4764 !nested_cycle, &dummy);
4765 /* We changed STMT to be the first stmt in reduction chain, hence we
4766 check that in this case the first element in the chain is STMT. */
4767 gcc_assert (stmt == tmp
4768 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4769 }
4770
4771 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4772 return false;
4773
4774 if (slp_node || PURE_SLP_STMT (stmt_info))
4775 ncopies = 1;
4776 else
4777 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4778 / TYPE_VECTOR_SUBPARTS (vectype_in));
4779
4780 gcc_assert (ncopies >= 1);
4781
4782 vec_mode = TYPE_MODE (vectype_in);
4783
4784 if (code == COND_EXPR)
4785 {
4786 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4787 {
4788 if (dump_enabled_p ())
4789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4790 "unsupported condition in reduction");
4791
4792 return false;
4793 }
4794 }
4795 else
4796 {
4797 /* 4. Supportable by target? */
4798
4799 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4800 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4801 {
4802 /* Shifts and rotates are only supported by vectorizable_shifts,
4803 not vectorizable_reduction. */
4804 if (dump_enabled_p ())
4805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4806 "unsupported shift or rotation.");
4807 return false;
4808 }
4809
4810 /* 4.1. check support for the operation in the loop */
4811 optab = optab_for_tree_code (code, vectype_in, optab_default);
4812 if (!optab)
4813 {
4814 if (dump_enabled_p ())
4815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4816 "no optab.");
4817
4818 return false;
4819 }
4820
4821 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4822 {
4823 if (dump_enabled_p ())
4824 dump_printf (MSG_NOTE, "op not supported by target.");
4825
4826 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4827 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4828 < vect_min_worthwhile_factor (code))
4829 return false;
4830
4831 if (dump_enabled_p ())
4832 dump_printf (MSG_NOTE, "proceeding using word mode.");
4833 }
4834
4835 /* Worthwhile without SIMD support? */
4836 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4837 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4838 < vect_min_worthwhile_factor (code))
4839 {
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4842 "not worthwhile without SIMD support.");
4843
4844 return false;
4845 }
4846 }
4847
4848 /* 4.2. Check support for the epilog operation.
4849
4850 If STMT represents a reduction pattern, then the type of the
4851 reduction variable may be different than the type of the rest
4852 of the arguments. For example, consider the case of accumulation
4853 of shorts into an int accumulator; The original code:
4854 S1: int_a = (int) short_a;
4855 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4856
4857 was replaced with:
4858 STMT: int_acc = widen_sum <short_a, int_acc>
4859
4860 This means that:
4861 1. The tree-code that is used to create the vector operation in the
4862 epilog code (that reduces the partial results) is not the
4863 tree-code of STMT, but is rather the tree-code of the original
4864 stmt from the pattern that STMT is replacing. I.e, in the example
4865 above we want to use 'widen_sum' in the loop, but 'plus' in the
4866 epilog.
4867 2. The type (mode) we use to check available target support
4868 for the vector operation to be created in the *epilog*, is
4869 determined by the type of the reduction variable (in the example
4870 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4871 However the type (mode) we use to check available target support
4872 for the vector operation to be created *inside the loop*, is
4873 determined by the type of the other arguments to STMT (in the
4874 example we'd check this: optab_handler (widen_sum_optab,
4875 vect_short_mode)).
4876
4877 This is contrary to "regular" reductions, in which the types of all
4878 the arguments are the same as the type of the reduction variable.
4879 For "regular" reductions we can therefore use the same vector type
4880 (and also the same tree-code) when generating the epilog code and
4881 when generating the code inside the loop. */
4882
4883 if (orig_stmt)
4884 {
4885 /* This is a reduction pattern: get the vectype from the type of the
4886 reduction variable, and get the tree-code from orig_stmt. */
4887 orig_code = gimple_assign_rhs_code (orig_stmt);
4888 gcc_assert (vectype_out);
4889 vec_mode = TYPE_MODE (vectype_out);
4890 }
4891 else
4892 {
4893 /* Regular reduction: the same vectype and tree-code that are used for
4894 the vector code inside the loop can be used for the epilog code. */
4895 orig_code = code;
4896 }
4897
4898 if (nested_cycle)
4899 {
4900 def_bb = gimple_bb (reduc_def_stmt);
4901 def_stmt_loop = def_bb->loop_father;
4902 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4903 loop_preheader_edge (def_stmt_loop));
4904 if (TREE_CODE (def_arg) == SSA_NAME
4905 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4906 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4907 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4908 && vinfo_for_stmt (def_arg_stmt)
4909 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4910 == vect_double_reduction_def)
4911 double_reduc = true;
4912 }
4913
4914 epilog_reduc_code = ERROR_MARK;
4915 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4916 {
4917 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4918 optab_default);
4919 if (!reduc_optab)
4920 {
4921 if (dump_enabled_p ())
4922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4923 "no optab for reduction.");
4924
4925 epilog_reduc_code = ERROR_MARK;
4926 }
4927
4928 if (reduc_optab
4929 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4930 {
4931 if (dump_enabled_p ())
4932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4933 "reduc op not supported by target.");
4934
4935 epilog_reduc_code = ERROR_MARK;
4936 }
4937 }
4938 else
4939 {
4940 if (!nested_cycle || double_reduc)
4941 {
4942 if (dump_enabled_p ())
4943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4944 "no reduc code for scalar code.");
4945
4946 return false;
4947 }
4948 }
4949
4950 if (double_reduc && ncopies > 1)
4951 {
4952 if (dump_enabled_p ())
4953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4954 "multiple types in double reduction");
4955
4956 return false;
4957 }
4958
4959 /* In case of widening multiplication by a constant, we update the type
4960 of the constant to be the type of the other operand. We check that the
4961 constant fits the type in the pattern recognition pass. */
4962 if (code == DOT_PROD_EXPR
4963 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4964 {
4965 if (TREE_CODE (ops[0]) == INTEGER_CST)
4966 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4967 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4968 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4969 else
4970 {
4971 if (dump_enabled_p ())
4972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4973 "invalid types in dot-prod");
4974
4975 return false;
4976 }
4977 }
4978
4979 if (!vec_stmt) /* transformation not required. */
4980 {
4981 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4982 return false;
4983 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4984 return true;
4985 }
4986
4987 /** Transform. **/
4988
4989 if (dump_enabled_p ())
4990 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4991
4992 /* FORNOW: Multiple types are not supported for condition. */
4993 if (code == COND_EXPR)
4994 gcc_assert (ncopies == 1);
4995
4996 /* Create the destination vector */
4997 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4998
4999 /* In case the vectorization factor (VF) is bigger than the number
5000 of elements that we can fit in a vectype (nunits), we have to generate
5001 more than one vector stmt - i.e - we need to "unroll" the
5002 vector stmt by a factor VF/nunits. For more details see documentation
5003 in vectorizable_operation. */
5004
5005 /* If the reduction is used in an outer loop we need to generate
5006 VF intermediate results, like so (e.g. for ncopies=2):
5007 r0 = phi (init, r0)
5008 r1 = phi (init, r1)
5009 r0 = x0 + r0;
5010 r1 = x1 + r1;
5011 (i.e. we generate VF results in 2 registers).
5012 In this case we have a separate def-use cycle for each copy, and therefore
5013 for each copy we get the vector def for the reduction variable from the
5014 respective phi node created for this copy.
5015
5016 Otherwise (the reduction is unused in the loop nest), we can combine
5017 together intermediate results, like so (e.g. for ncopies=2):
5018 r = phi (init, r)
5019 r = x0 + r;
5020 r = x1 + r;
5021 (i.e. we generate VF/2 results in a single register).
5022 In this case for each copy we get the vector def for the reduction variable
5023 from the vectorized reduction operation generated in the previous iteration.
5024 */
5025
5026 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5027 {
5028 single_defuse_cycle = true;
5029 epilog_copies = 1;
5030 }
5031 else
5032 epilog_copies = ncopies;
5033
5034 prev_stmt_info = NULL;
5035 prev_phi_info = NULL;
5036 if (slp_node)
5037 {
5038 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5039 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5040 == TYPE_VECTOR_SUBPARTS (vectype_in));
5041 }
5042 else
5043 {
5044 vec_num = 1;
5045 vec_oprnds0.create (1);
5046 if (op_type == ternary_op)
5047 vec_oprnds1.create (1);
5048 }
5049
5050 phis.create (vec_num);
5051 vect_defs.create (vec_num);
5052 if (!slp_node)
5053 vect_defs.quick_push (NULL_TREE);
5054
5055 for (j = 0; j < ncopies; j++)
5056 {
5057 if (j == 0 || !single_defuse_cycle)
5058 {
5059 for (i = 0; i < vec_num; i++)
5060 {
5061 /* Create the reduction-phi that defines the reduction
5062 operand. */
5063 new_phi = create_phi_node (vec_dest, loop->header);
5064 set_vinfo_for_stmt (new_phi,
5065 new_stmt_vec_info (new_phi, loop_vinfo,
5066 NULL));
5067 if (j == 0 || slp_node)
5068 phis.quick_push (new_phi);
5069 }
5070 }
5071
5072 if (code == COND_EXPR)
5073 {
5074 gcc_assert (!slp_node);
5075 vectorizable_condition (stmt, gsi, vec_stmt,
5076 PHI_RESULT (phis[0]),
5077 reduc_index, NULL);
5078 /* Multiple types are not supported for condition. */
5079 break;
5080 }
5081
5082 /* Handle uses. */
5083 if (j == 0)
5084 {
5085 op0 = ops[!reduc_index];
5086 if (op_type == ternary_op)
5087 {
5088 if (reduc_index == 0)
5089 op1 = ops[2];
5090 else
5091 op1 = ops[1];
5092 }
5093
5094 if (slp_node)
5095 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5096 slp_node, -1);
5097 else
5098 {
5099 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5100 stmt, NULL);
5101 vec_oprnds0.quick_push (loop_vec_def0);
5102 if (op_type == ternary_op)
5103 {
5104 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5105 NULL);
5106 vec_oprnds1.quick_push (loop_vec_def1);
5107 }
5108 }
5109 }
5110 else
5111 {
5112 if (!slp_node)
5113 {
5114 enum vect_def_type dt;
5115 gimple dummy_stmt;
5116 tree dummy;
5117
5118 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5119 &dummy_stmt, &dummy, &dt);
5120 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5121 loop_vec_def0);
5122 vec_oprnds0[0] = loop_vec_def0;
5123 if (op_type == ternary_op)
5124 {
5125 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5126 &dummy, &dt);
5127 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5128 loop_vec_def1);
5129 vec_oprnds1[0] = loop_vec_def1;
5130 }
5131 }
5132
5133 if (single_defuse_cycle)
5134 reduc_def = gimple_assign_lhs (new_stmt);
5135
5136 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5137 }
5138
5139 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5140 {
5141 if (slp_node)
5142 reduc_def = PHI_RESULT (phis[i]);
5143 else
5144 {
5145 if (!single_defuse_cycle || j == 0)
5146 reduc_def = PHI_RESULT (new_phi);
5147 }
5148
5149 def1 = ((op_type == ternary_op)
5150 ? vec_oprnds1[i] : NULL);
5151 if (op_type == binary_op)
5152 {
5153 if (reduc_index == 0)
5154 expr = build2 (code, vectype_out, reduc_def, def0);
5155 else
5156 expr = build2 (code, vectype_out, def0, reduc_def);
5157 }
5158 else
5159 {
5160 if (reduc_index == 0)
5161 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5162 else
5163 {
5164 if (reduc_index == 1)
5165 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5166 else
5167 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5168 }
5169 }
5170
5171 new_stmt = gimple_build_assign (vec_dest, expr);
5172 new_temp = make_ssa_name (vec_dest, new_stmt);
5173 gimple_assign_set_lhs (new_stmt, new_temp);
5174 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5175
5176 if (slp_node)
5177 {
5178 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5179 vect_defs.quick_push (new_temp);
5180 }
5181 else
5182 vect_defs[0] = new_temp;
5183 }
5184
5185 if (slp_node)
5186 continue;
5187
5188 if (j == 0)
5189 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5190 else
5191 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5192
5193 prev_stmt_info = vinfo_for_stmt (new_stmt);
5194 prev_phi_info = vinfo_for_stmt (new_phi);
5195 }
5196
5197 /* Finalize the reduction-phi (set its arguments) and create the
5198 epilog reduction code. */
5199 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5200 {
5201 new_temp = gimple_assign_lhs (*vec_stmt);
5202 vect_defs[0] = new_temp;
5203 }
5204
5205 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5206 epilog_reduc_code, phis, reduc_index,
5207 double_reduc, slp_node);
5208
5209 phis.release ();
5210 vect_defs.release ();
5211 vec_oprnds0.release ();
5212 vec_oprnds1.release ();
5213
5214 return true;
5215 }
5216
5217 /* Function vect_min_worthwhile_factor.
5218
5219 For a loop where we could vectorize the operation indicated by CODE,
5220 return the minimum vectorization factor that makes it worthwhile
5221 to use generic vectors. */
5222 int
5223 vect_min_worthwhile_factor (enum tree_code code)
5224 {
5225 switch (code)
5226 {
5227 case PLUS_EXPR:
5228 case MINUS_EXPR:
5229 case NEGATE_EXPR:
5230 return 4;
5231
5232 case BIT_AND_EXPR:
5233 case BIT_IOR_EXPR:
5234 case BIT_XOR_EXPR:
5235 case BIT_NOT_EXPR:
5236 return 2;
5237
5238 default:
5239 return INT_MAX;
5240 }
5241 }
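/* For example, when the target provides no vector instruction for the
   operation and vectorization would have to fall back to generic
   word_mode vectors, a PLUS_EXPR is only considered worthwhile with a
   vectorization factor of at least 4, while a BIT_AND_EXPR already pays
   off at a factor of 2 (an illustrative reading of the table above).  */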
5242
5243
5244 /* Function vectorizable_induction
5245
5246 Check if PHI performs an induction computation that can be vectorized.
5247 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5248 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5249 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
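/* For example, in

     for (i = 0; i < n; i++)
       a[i] = i;

   the loop-header PHI carrying i is such an induction: its value advances
   by a constant step each iteration, so it can be replaced by a vector of
   consecutive values updated by a vector increment (a sketch only; the
   actual code is produced via get_initial_def_for_induction below).  */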
5250
5251 bool
5252 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5253 gimple *vec_stmt)
5254 {
5255 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5256 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5257 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5258 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5259 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5260 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5261 tree vec_def;
5262
5263 gcc_assert (ncopies >= 1);
5264 /* FORNOW. These restrictions should be relaxed. */
5265 if (nested_in_vect_loop_p (loop, phi))
5266 {
5267 imm_use_iterator imm_iter;
5268 use_operand_p use_p;
5269 gimple exit_phi;
5270 edge latch_e;
5271 tree loop_arg;
5272
5273 if (ncopies > 1)
5274 {
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5277 "multiple types in nested loop.");
5278 return false;
5279 }
5280
5281 exit_phi = NULL;
5282 latch_e = loop_latch_edge (loop->inner);
5283 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5284 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5285 {
5286 if (!flow_bb_inside_loop_p (loop->inner,
5287 gimple_bb (USE_STMT (use_p))))
5288 {
5289 exit_phi = USE_STMT (use_p);
5290 break;
5291 }
5292 }
5293 if (exit_phi)
5294 {
5295 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5296 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5297 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5298 {
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5301 "inner-loop induction only used outside "
5302 "of the outer vectorized loop.");
5303 return false;
5304 }
5305 }
5306 }
5307
5308 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5309 return false;
5310
5311 /* FORNOW: SLP not supported. */
5312 if (STMT_SLP_TYPE (stmt_info))
5313 return false;
5314
5315 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5316
5317 if (gimple_code (phi) != GIMPLE_PHI)
5318 return false;
5319
5320 if (!vec_stmt) /* transformation not required. */
5321 {
5322 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5323 if (dump_enabled_p ())
5324 dump_printf_loc (MSG_NOTE, vect_location,
5325 "=== vectorizable_induction ===");
5326 vect_model_induction_cost (stmt_info, ncopies);
5327 return true;
5328 }
5329
5330 /** Transform. **/
5331
5332 if (dump_enabled_p ())
5333 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5334
5335 vec_def = get_initial_def_for_induction (phi);
5336 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5337 return true;
5338 }
5339
5340 /* Function vectorizable_live_operation.
5341
5342 STMT computes a value that is used outside the loop. Check if
5343 it can be supported. */
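/* For example, an assignment such as t_5 = x_3 + 1 inside the loop whose
   result is only used after the loop is "live".  Since, FORNOW, only the
   case where every operand is loop invariant or constant is accepted, the
   scalar statement can simply stay in place and the last value it computes
   is what the out-of-loop uses see (the names here are illustrative).  */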
5344
5345 bool
5346 vectorizable_live_operation (gimple stmt,
5347 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5348 gimple *vec_stmt ATTRIBUTE_UNUSED)
5349 {
5350 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5351 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5353 int i;
5354 int op_type;
5355 tree op;
5356 tree def;
5357 gimple def_stmt;
5358 enum vect_def_type dt;
5359 enum tree_code code;
5360 enum gimple_rhs_class rhs_class;
5361
5362 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5363
5364 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5365 return false;
5366
5367 if (!is_gimple_assign (stmt))
5368 return false;
5369
5370 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5371 return false;
5372
5373 /* FORNOW. CHECKME. */
5374 if (nested_in_vect_loop_p (loop, stmt))
5375 return false;
5376
5377 code = gimple_assign_rhs_code (stmt);
5378 op_type = TREE_CODE_LENGTH (code);
5379 rhs_class = get_gimple_rhs_class (code);
5380 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5381 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5382
5383 /* FORNOW: support only if all uses are invariant. This means
5384 that the scalar operations can remain in place, unvectorized.
5385 The original last scalar value that they compute will be used. */
5386
5387 for (i = 0; i < op_type; i++)
5388 {
5389 if (rhs_class == GIMPLE_SINGLE_RHS)
5390 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5391 else
5392 op = gimple_op (stmt, i + 1);
5393 if (op
5394 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5395 &dt))
5396 {
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 "use not simple.");
5400 return false;
5401 }
5402
5403 if (dt != vect_external_def && dt != vect_constant_def)
5404 return false;
5405 }
5406
5407 /* No transformation is required for the cases we currently support. */
5408 return true;
5409 }
5410
5411 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
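/* After vectorization the scalar SSA names defined by such statements may
   never receive their final values, so a debug bind outside the loop that
   still refers to them is reset here; the debugger then reports the
   variable as optimized out instead of showing a stale value.  */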
5412
5413 static void
5414 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5415 {
5416 ssa_op_iter op_iter;
5417 imm_use_iterator imm_iter;
5418 def_operand_p def_p;
5419 gimple ustmt;
5420
5421 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5422 {
5423 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5424 {
5425 basic_block bb;
5426
5427 if (!is_gimple_debug (ustmt))
5428 continue;
5429
5430 bb = gimple_bb (ustmt);
5431
5432 if (!flow_bb_inside_loop_p (loop, bb))
5433 {
5434 if (gimple_debug_bind_p (ustmt))
5435 {
5436 if (dump_enabled_p ())
5437 dump_printf_loc (MSG_NOTE, vect_location,
5438 "killing debug use");
5439
5440 gimple_debug_bind_reset_value (ustmt);
5441 update_stmt (ustmt);
5442 }
5443 else
5444 gcc_unreachable ();
5445 }
5446 }
5447 }
5448 }
5449
5450 /* Function vect_transform_loop.
5451
5452 The analysis phase has determined that the loop is vectorizable.
5453 Vectorize the loop - create vectorized stmts to replace the scalar
5454 stmts in the loop, and update the loop exit condition. */
5455
5456 void
5457 vect_transform_loop (loop_vec_info loop_vinfo)
5458 {
5459 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5460 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5461 int nbbs = loop->num_nodes;
5462 gimple_stmt_iterator si;
5463 int i;
5464 tree ratio = NULL;
5465 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5466 bool grouped_store;
5467 bool slp_scheduled = false;
5468 unsigned int nunits;
5469 gimple stmt, pattern_stmt;
5470 gimple_seq pattern_def_seq = NULL;
5471 gimple_stmt_iterator pattern_def_si = gsi_none ();
5472 bool transform_pattern_stmt = false;
5473 bool check_profitability = false;
5474 int th;
5475 /* Record number of iterations before we started tampering with the profile. */
5476 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5477
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===");
5480
5481 /* If the profile is imprecise, we have a chance to fix it up. */
5482 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5483 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5484
5485 /* Use the more conservative vectorization threshold. If the number
5486 of iterations is constant assume the cost check has been performed
5487 by our caller. If the threshold makes all loops profitable that
5488 run at least the vectorization factor number of times, checking
5489 is pointless, too. */
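/* For example, with --param min-vect-loop-bound=1 and a vectorization
   factor of 4, th starts out as 1 * 4 - 1 = 3 and is then raised to the
   cost model's minimum iteration count if that is larger; the runtime
   profitability check below is only kept when that threshold is at least
   VF - 1 and the iteration count is not known at compile time (the
   numbers here are purely illustrative).  */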
5490 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5491 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5492 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5493 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5494 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5495 {
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "Profitability threshold is %d loop iterations.", th);
5499 check_profitability = true;
5500 }
5501
5502 /* Version the loop first, if required, so the profitability check
5503 comes first. */
5504
5505 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5506 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5507 {
5508 vect_loop_versioning (loop_vinfo, th, check_profitability);
5509 check_profitability = false;
5510 }
5511
5512 /* Peel the loop if there are data refs with unknown alignment.
5513 Only one store data ref with unknown alignment is allowed. */
5514
5515 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5516 {
5517 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5518 check_profitability = false;
5519 }
5520
5521 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5522 compile time constant), or it is a constant that is not divisible by the
5523 vectorization factor, then an epilog loop needs to be created.
5524 We therefore duplicate the loop: the original loop will be vectorized,
5525 and will compute the first (n/VF) iterations. The second copy of the loop
5526 will remain scalar and will compute the remaining (n%VF) iterations.
5527 (VF is the vectorization factor). */
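/* For example, assuming VF == 4 and a symbolic trip count n, the result
   looks roughly like this (a sketch, not the exact generated code):

     for (i = 0; i < (n / 4) * 4; i += 4)
       ... vectorized body, 4 scalar iterations at a time ...
     for (; i < n; i++)
       ... original scalar body, the remaining n % 4 iterations ...  */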
5528
5529 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5530 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5531 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5532 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5533 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5534 th, check_profitability);
5535 else
5536 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5537 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5538
5539 /* 1) Make sure the loop header has exactly two entries
5540 2) Make sure we have a preheader basic block. */
5541
5542 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5543
5544 split_edge (loop_preheader_edge (loop));
5545
5546 /* FORNOW: the vectorizer supports only loops whose body consists
5547 of one basic block (header + empty latch). When the vectorizer
5548 supports more involved loop forms, the order in which the BBs are
5549 traversed needs to be reconsidered. */
5550
5551 for (i = 0; i < nbbs; i++)
5552 {
5553 basic_block bb = bbs[i];
5554 stmt_vec_info stmt_info;
5555 gimple phi;
5556
5557 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5558 {
5559 phi = gsi_stmt (si);
5560 if (dump_enabled_p ())
5561 {
5562 dump_printf_loc (MSG_NOTE, vect_location,
5563 "------>vectorizing phi: ");
5564 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5565 }
5566 stmt_info = vinfo_for_stmt (phi);
5567 if (!stmt_info)
5568 continue;
5569
5570 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5571 vect_loop_kill_debug_uses (loop, phi);
5572
5573 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5574 && !STMT_VINFO_LIVE_P (stmt_info))
5575 continue;
5576
5577 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5578 != (unsigned HOST_WIDE_INT) vectorization_factor)
5579 && dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5581
5582 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5583 {
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5586 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5587 }
5588 }
5589
5590 pattern_stmt = NULL;
5591 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5592 {
5593 bool is_store;
5594
5595 if (transform_pattern_stmt)
5596 stmt = pattern_stmt;
5597 else
5598 stmt = gsi_stmt (si);
5599
5600 if (dump_enabled_p ())
5601 {
5602 dump_printf_loc (MSG_NOTE, vect_location,
5603 "------>vectorizing statement: ");
5604 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5605 }
5606
5607 stmt_info = vinfo_for_stmt (stmt);
5608
5609 /* vector stmts created in the outer-loop during vectorization of
5610 stmts in an inner-loop may not have a stmt_info, and do not
5611 need to be vectorized. */
5612 if (!stmt_info)
5613 {
5614 gsi_next (&si);
5615 continue;
5616 }
5617
5618 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5619 vect_loop_kill_debug_uses (loop, stmt);
5620
5621 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5622 && !STMT_VINFO_LIVE_P (stmt_info))
5623 {
5624 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5625 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5626 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5627 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5628 {
5629 stmt = pattern_stmt;
5630 stmt_info = vinfo_for_stmt (stmt);
5631 }
5632 else
5633 {
5634 gsi_next (&si);
5635 continue;
5636 }
5637 }
5638 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5639 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5640 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5641 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5642 transform_pattern_stmt = true;
5643
5644 /* If pattern statement has def stmts, vectorize them too. */
5645 if (is_pattern_stmt_p (stmt_info))
5646 {
5647 if (pattern_def_seq == NULL)
5648 {
5649 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5650 pattern_def_si = gsi_start (pattern_def_seq);
5651 }
5652 else if (!gsi_end_p (pattern_def_si))
5653 gsi_next (&pattern_def_si);
5654 if (pattern_def_seq != NULL)
5655 {
5656 gimple pattern_def_stmt = NULL;
5657 stmt_vec_info pattern_def_stmt_info = NULL;
5658
5659 while (!gsi_end_p (pattern_def_si))
5660 {
5661 pattern_def_stmt = gsi_stmt (pattern_def_si);
5662 pattern_def_stmt_info
5663 = vinfo_for_stmt (pattern_def_stmt);
5664 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5665 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5666 break;
5667 gsi_next (&pattern_def_si);
5668 }
5669
5670 if (!gsi_end_p (pattern_def_si))
5671 {
5672 if (dump_enabled_p ())
5673 {
5674 dump_printf_loc (MSG_NOTE, vect_location,
5675 "==> vectorizing pattern def "
5676 "stmt: ");
5677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5678 pattern_def_stmt, 0);
5679 }
5680
5681 stmt = pattern_def_stmt;
5682 stmt_info = pattern_def_stmt_info;
5683 }
5684 else
5685 {
5686 pattern_def_si = gsi_none ();
5687 transform_pattern_stmt = false;
5688 }
5689 }
5690 else
5691 transform_pattern_stmt = false;
5692 }
5693
5694 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5695 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5696 STMT_VINFO_VECTYPE (stmt_info));
5697 if (!STMT_SLP_TYPE (stmt_info)
5698 && nunits != (unsigned int) vectorization_factor
5699 && dump_enabled_p ())
5700 /* For SLP, VF is set according to the unrolling factor, and not to
5701 the vector size, hence for SLP this print is not valid. */
5702 dump_printf_loc (MSG_NOTE, vect_location,
5703 "multiple-types.");
5704
5705 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5706 reached. */
5707 if (STMT_SLP_TYPE (stmt_info))
5708 {
5709 if (!slp_scheduled)
5710 {
5711 slp_scheduled = true;
5712
5713 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_NOTE, vect_location,
5715 "=== scheduling SLP instances ===");
5716
5717 vect_schedule_slp (loop_vinfo, NULL);
5718 }
5719
5720 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5721 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5722 {
5723 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5724 {
5725 pattern_def_seq = NULL;
5726 gsi_next (&si);
5727 }
5728 continue;
5729 }
5730 }
5731
5732 /* -------- vectorize statement ------------ */
5733 if (dump_enabled_p ())
5734 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5735
5736 grouped_store = false;
5737 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5738 if (is_store)
5739 {
5740 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5741 {
5742 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5743 interleaving chain was completed - free all the stores in
5744 the chain. */
5745 gsi_next (&si);
5746 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5747 continue;
5748 }
5749 else
5750 {
5751 /* Free the attached stmt_vec_info and remove the stmt. */
5752 gimple store = gsi_stmt (si);
5753 free_stmt_vec_info (store);
5754 unlink_stmt_vdef (store);
5755 gsi_remove (&si, true);
5756 release_defs (store);
5757 continue;
5758 }
5759 }
5760
5761 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5762 {
5763 pattern_def_seq = NULL;
5764 gsi_next (&si);
5765 }
5766 } /* stmts in BB */
5767 } /* BBs in loop */
5768
5769 slpeel_make_loop_iterate_ntimes (loop, ratio);
5770
5771 /* Reduce loop iterations by the vectorization factor. */
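/* For example, with a vectorization factor of 4 and a recorded upper bound
   of 103 iterations, the vector loop's bound becomes 103 / 4 = 25; when
   peeling for gaps is required the last vector iteration is instead done by
   the scalar epilog loop, so one more iteration is subtracted below (the
   numbers here are illustrative).  */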
5772 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5773 expected_iterations / vectorization_factor);
5774 loop->nb_iterations_upper_bound
5775 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5776 FLOOR_DIV_EXPR);
5777 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5778 && loop->nb_iterations_upper_bound != double_int_zero)
5779 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5780 if (loop->any_estimate)
5781 {
5782 loop->nb_iterations_estimate
5783 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5784 FLOOR_DIV_EXPR);
5785 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5786 && loop->nb_iterations_estimate != double_int_zero)
5787 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5788 }
5789
5790 if (dump_enabled_p ())
5791 {
5792 dump_printf_loc (MSG_NOTE, vect_location,
5793 "LOOP VECTORIZED\n");
5794 if (loop->inner)
5795 dump_printf_loc (MSG_NOTE, vect_location,
5796 "OUTER LOOP VECTORIZED\n");
5797 }
5798 }