1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
45
46 /* Loop Vectorization Pass.
47
48 This pass tries to vectorize loops.
49
50 For example, the vectorizer transforms the following simple loop:
51
52 short a[N]; short b[N]; short c[N]; int i;
53
54 for (i=0; i<N; i++){
55 a[i] = b[i] + c[i];
56 }
57
58 as if it had been manually vectorized by rewriting the source code into:
59
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 v8hi va, vb, vc;
64
65 for (i=0; i<N/8; i++){
66 vb = pb[i];
67 vc = pc[i];
68 va = vb + vc;
69 pa[i] = va;
70 }
71
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
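
For instance (an illustration, not an exhaustive list): an access like a[i],
or *p where p advances by one element per iteration, has such a consecutive
pattern, whereas an indirect access like a[b[i]] is not handled by this pass
at present.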
83
84 Analysis phase:
85 ===============
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
89
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
94
95 Transformation phase:
96 =====================
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
105
106 For example, say stmt S1 was vectorized into stmt VS1:
107
108 VS1: vb = px[i];
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b;
111
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
116
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb;
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
121
122 Operands that are not SSA_NAMEs are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
124
125 Target modeling:
126 =================
127 Currently the only target-specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different vector sizes will, for now, need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
132
133 Since we only vectorize operations whose vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
139
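As a rough illustration of this optab check (a sketch, not code in this
file; "vectype" stands for the vector type being queried), asking whether
the target supports a vector addition could look like:

  optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
  if (!op || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
    return false;

where the early return corresponds to "no target support" above.
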
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 */
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
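
/* A concrete illustration (assuming a 16-byte vector size): a stmt operating
   on "short" gets an 8-element vector type (V8HImode), so nunits = 8, while
   one operating on "int" gets a 4-element vector type (V4SImode), so
   nunits = 4.  The VF recorded for the loop is the maximum nunits seen over
   all relevant stmts - 8 in this example.  */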
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
188
189 if (vect_print_dump_info (REPORT_DETAILS))
190 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
191
192 for (i = 0; i < nbbs; i++)
193 {
194 basic_block bb = bbs[i];
195
196 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
197 {
198 phi = gsi_stmt (si);
199 stmt_info = vinfo_for_stmt (phi);
200 if (vect_print_dump_info (REPORT_DETAILS))
201 {
202 fprintf (vect_dump, "==> examining phi: ");
203 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
204 }
205
206 gcc_assert (stmt_info);
207
208 if (STMT_VINFO_RELEVANT_P (stmt_info))
209 {
210 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
211 scalar_type = TREE_TYPE (PHI_RESULT (phi));
212
213 if (vect_print_dump_info (REPORT_DETAILS))
214 {
215 fprintf (vect_dump, "get vectype for scalar type: ");
216 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
217 }
218
219 vectype = get_vectype_for_scalar_type (scalar_type);
220 if (!vectype)
221 {
222 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
223 {
224 fprintf (vect_dump,
225 "not vectorized: unsupported data-type ");
226 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
227 }
228 return false;
229 }
230 STMT_VINFO_VECTYPE (stmt_info) = vectype;
231
232 if (vect_print_dump_info (REPORT_DETAILS))
233 {
234 fprintf (vect_dump, "vectype: ");
235 print_generic_expr (vect_dump, vectype, TDF_SLIM);
236 }
237
238 nunits = TYPE_VECTOR_SUBPARTS (vectype);
239 if (vect_print_dump_info (REPORT_DETAILS))
240 fprintf (vect_dump, "nunits = %d", nunits);
241
242 if (!vectorization_factor
243 || (nunits > vectorization_factor))
244 vectorization_factor = nunits;
245 }
246 }
247
248 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
249 {
250 tree vf_vectype;
251
252 if (analyze_pattern_stmt)
253 stmt = pattern_stmt;
254 else
255 stmt = gsi_stmt (si);
256
257 stmt_info = vinfo_for_stmt (stmt);
258
259 if (vect_print_dump_info (REPORT_DETAILS))
260 {
261 fprintf (vect_dump, "==> examining statement: ");
262 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
263 }
264
265 gcc_assert (stmt_info);
266
267 /* Skip stmts which do not need to be vectorized. */
268 if (!STMT_VINFO_RELEVANT_P (stmt_info)
269 && !STMT_VINFO_LIVE_P (stmt_info))
270 {
271 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
272 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
273 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
274 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
275 {
276 stmt = pattern_stmt;
277 stmt_info = vinfo_for_stmt (pattern_stmt);
278 if (vect_print_dump_info (REPORT_DETAILS))
279 {
280 fprintf (vect_dump, "==> examining pattern statement: ");
281 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
282 }
283 }
284 else
285 {
286 if (vect_print_dump_info (REPORT_DETAILS))
287 fprintf (vect_dump, "skip.");
288 gsi_next (&si);
289 continue;
290 }
291 }
292 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
293 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
294 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
295 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
296 analyze_pattern_stmt = true;
297
298 /* If a pattern statement has def stmts, analyze them too. */
299 if (is_pattern_stmt_p (stmt_info))
300 {
301 if (pattern_def_seq == NULL)
302 {
303 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
304 pattern_def_si = gsi_start (pattern_def_seq);
305 }
306 else if (!gsi_end_p (pattern_def_si))
307 gsi_next (&pattern_def_si);
308 if (pattern_def_seq != NULL)
309 {
310 gimple pattern_def_stmt = NULL;
311 stmt_vec_info pattern_def_stmt_info = NULL;
312
313 while (!gsi_end_p (pattern_def_si))
314 {
315 pattern_def_stmt = gsi_stmt (pattern_def_si);
316 pattern_def_stmt_info
317 = vinfo_for_stmt (pattern_def_stmt);
318 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
319 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
320 break;
321 gsi_next (&pattern_def_si);
322 }
323
324 if (!gsi_end_p (pattern_def_si))
325 {
326 if (vect_print_dump_info (REPORT_DETAILS))
327 {
328 fprintf (vect_dump,
329 "==> examining pattern def stmt: ");
330 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
331 TDF_SLIM);
332 }
333
334 stmt = pattern_def_stmt;
335 stmt_info = pattern_def_stmt_info;
336 }
337 else
338 {
339 pattern_def_si = gsi_none ();
340 analyze_pattern_stmt = false;
341 }
342 }
343 else
344 analyze_pattern_stmt = false;
345 }
346
347 if (gimple_get_lhs (stmt) == NULL_TREE)
348 {
349 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
350 {
351 fprintf (vect_dump, "not vectorized: irregular stmt.");
352 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
353 }
354 return false;
355 }
356
357 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
358 {
359 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
360 {
361 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
362 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
363 }
364 return false;
365 }
366
367 if (STMT_VINFO_VECTYPE (stmt_info))
368 {
369 /* The only case when a vectype has already been set is for stmts
370 that contain a dataref, or for "pattern-stmts" (stmts
371 generated by the vectorizer to represent/replace a certain
372 idiom). */
373 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
374 || is_pattern_stmt_p (stmt_info)
375 || !gsi_end_p (pattern_def_si));
376 vectype = STMT_VINFO_VECTYPE (stmt_info);
377 }
378 else
379 {
380 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
381 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
382 if (vect_print_dump_info (REPORT_DETAILS))
383 {
384 fprintf (vect_dump, "get vectype for scalar type: ");
385 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
386 }
387 vectype = get_vectype_for_scalar_type (scalar_type);
388 if (!vectype)
389 {
390 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
391 {
392 fprintf (vect_dump,
393 "not vectorized: unsupported data-type ");
394 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
395 }
396 return false;
397 }
398
399 STMT_VINFO_VECTYPE (stmt_info) = vectype;
400 }
401
402 /* The vectorization factor is determined by the smallest
403 scalar type (or the largest vector size, but we only
404 support one vector size per loop). */
405 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
406 &dummy);
407 if (vect_print_dump_info (REPORT_DETAILS))
408 {
409 fprintf (vect_dump, "get vectype for scalar type: ");
410 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
411 }
412 vf_vectype = get_vectype_for_scalar_type (scalar_type);
413 if (!vf_vectype)
414 {
415 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
416 {
417 fprintf (vect_dump,
418 "not vectorized: unsupported data-type ");
419 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
420 }
421 return false;
422 }
423
424 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
425 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
426 {
427 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
428 {
429 fprintf (vect_dump,
430 "not vectorized: different sized vector "
431 "types in statement, ");
432 print_generic_expr (vect_dump, vectype, TDF_SLIM);
433 fprintf (vect_dump, " and ");
434 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
435 }
436 return false;
437 }
438
439 if (vect_print_dump_info (REPORT_DETAILS))
440 {
441 fprintf (vect_dump, "vectype: ");
442 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
443 }
444
445 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
446 if (vect_print_dump_info (REPORT_DETAILS))
447 fprintf (vect_dump, "nunits = %d", nunits);
448
449 if (!vectorization_factor
450 || (nunits > vectorization_factor))
451 vectorization_factor = nunits;
452
453 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 {
455 pattern_def_seq = NULL;
456 gsi_next (&si);
457 }
458 }
459 }
460
461 /* TODO: Analyze cost. Decide if worth while to vectorize. */
462 if (vect_print_dump_info (REPORT_DETAILS))
463 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
464 if (vectorization_factor <= 1)
465 {
466 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
467 fprintf (vect_dump, "not vectorized: unsupported data-type");
468 return false;
469 }
470 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
471
472 return true;
473 }
474
475
476 /* Function vect_is_simple_iv_evolution.
477
478 FORNOW: A simple evolution of an induction variable in the loop is
479 considered a polynomial evolution with constant step. */
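
/* For instance (illustrative): for "i" in "for (i = 0; i < n; i += 4)" the
   scalar evolution is the chrec {0, +, 4}_1 (with 1 standing for the loop
   number); initial_condition_in_loop_num returns 0 and
   evolution_part_in_loop_num returns 4, so this is a "simple" evolution
   with init 0 and step 4.  */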
480
481 static bool
482 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
483 tree * step)
484 {
485 tree init_expr;
486 tree step_expr;
487 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
488
489 /* When there is no evolution in this loop, the evolution function
490 is not "simple". */
491 if (evolution_part == NULL_TREE)
492 return false;
493
494 /* When the evolution is a polynomial of degree >= 2
495 the evolution function is not "simple". */
496 if (tree_is_chrec (evolution_part))
497 return false;
498
499 step_expr = evolution_part;
500 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
501
502 if (vect_print_dump_info (REPORT_DETAILS))
503 {
504 fprintf (vect_dump, "step: ");
505 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
506 fprintf (vect_dump, ", init: ");
507 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
508 }
509
510 *init = init_expr;
511 *step = step_expr;
512
513 if (TREE_CODE (step_expr) != INTEGER_CST)
514 {
515 if (vect_print_dump_info (REPORT_DETAILS))
516 fprintf (vect_dump, "step unknown.");
517 return false;
518 }
519
520 return true;
521 }
522
523 /* Function vect_analyze_scalar_cycles_1.
524
525 Examine the cross iteration def-use cycles of scalar variables
526 in LOOP. LOOP_VINFO represents the loop that is now being
527 considered for vectorization (can be LOOP, or an outer-loop
528 enclosing LOOP). */
529
530 static void
531 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
532 {
533 basic_block bb = loop->header;
534 tree dumy;
535 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
536 gimple_stmt_iterator gsi;
537 bool double_reduc;
538
539 if (vect_print_dump_info (REPORT_DETAILS))
540 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
541
542 /* First - identify all inductions. Reduction detection assumes that all the
543 inductions have been identified, therefore, this order must not be
544 changed. */
545 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
546 {
547 gimple phi = gsi_stmt (gsi);
548 tree access_fn = NULL;
549 tree def = PHI_RESULT (phi);
550 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
551
552 if (vect_print_dump_info (REPORT_DETAILS))
553 {
554 fprintf (vect_dump, "Analyze phi: ");
555 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
556 }
557
558 /* Skip virtual phis. The data dependences that are associated with
559 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
560 if (!is_gimple_reg (SSA_NAME_VAR (def)))
561 continue;
562
563 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
564
565 /* Analyze the evolution function. */
566 access_fn = analyze_scalar_evolution (loop, def);
567 if (access_fn)
568 {
569 STRIP_NOPS (access_fn);
570 if (vect_print_dump_info (REPORT_DETAILS))
571 {
572 fprintf (vect_dump, "Access function of PHI: ");
573 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
574 }
575 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
576 = evolution_part_in_loop_num (access_fn, loop->num);
577 }
578
579 if (!access_fn
580 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
581 {
582 VEC_safe_push (gimple, heap, worklist, phi);
583 continue;
584 }
585
586 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
587
588 if (vect_print_dump_info (REPORT_DETAILS))
589 fprintf (vect_dump, "Detected induction.");
590 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
591 }
592
593
594 /* Second - identify all reductions and nested cycles. */
595 while (VEC_length (gimple, worklist) > 0)
596 {
597 gimple phi = VEC_pop (gimple, worklist);
598 tree def = PHI_RESULT (phi);
599 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
600 gimple reduc_stmt;
601 bool nested_cycle;
602
603 if (vect_print_dump_info (REPORT_DETAILS))
604 {
605 fprintf (vect_dump, "Analyze phi: ");
606 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
607 }
608
609 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
610 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
611
612 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
613 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
614 &double_reduc);
615 if (reduc_stmt)
616 {
617 if (double_reduc)
618 {
619 if (vect_print_dump_info (REPORT_DETAILS))
620 fprintf (vect_dump, "Detected double reduction.");
621
622 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
623 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
624 vect_double_reduction_def;
625 }
626 else
627 {
628 if (nested_cycle)
629 {
630 if (vect_print_dump_info (REPORT_DETAILS))
631 fprintf (vect_dump, "Detected vectorizable nested cycle.");
632
633 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
634 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
635 vect_nested_cycle;
636 }
637 else
638 {
639 if (vect_print_dump_info (REPORT_DETAILS))
640 fprintf (vect_dump, "Detected reduction.");
641
642 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
643 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
644 vect_reduction_def;
645 /* Store the reduction cycles for possible vectorization in
646 loop-aware SLP. */
647 VEC_safe_push (gimple, heap,
648 LOOP_VINFO_REDUCTIONS (loop_vinfo),
649 reduc_stmt);
650 }
651 }
652 }
653 else
654 if (vect_print_dump_info (REPORT_DETAILS))
655 fprintf (vect_dump, "Unknown def-use cycle pattern.");
656 }
657
658 VEC_free (gimple, heap, worklist);
659 }
660
661
662 /* Function vect_analyze_scalar_cycles.
663
664 Examine the cross iteration def-use cycles of scalar variables, by
665 analyzing the loop-header PHIs of scalar variables. Classify each
666 cycle as one of the following: invariant, induction, reduction, unknown.
667 We do that for the loop represented by LOOP_VINFO, and also for its
668 inner-loop, if it exists.
669 Examples for scalar cycles:
670
671 Example1: reduction:
672
673 loop1:
674 for (i=0; i<N; i++)
675 sum += a[i];
676
677 Example2: induction:
678
679 loop2:
680 for (i=0; i<N; i++)
681 a[i] = i; */
682
683 static void
684 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
685 {
686 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
687
688 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
689
690 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
691 Reductions in such an inner-loop therefore have different properties than
692 the reductions in the nest that gets vectorized:
693 1. When vectorized, they are executed in the same order as in the original
694 scalar loop, so we can't change the order of computation when
695 vectorizing them.
696 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
697 current checks are too strict. */
698
699 if (loop->inner)
700 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
701 }
702
703 /* Function vect_get_loop_niters.
704
705 Determine how many iterations the loop is executed.
706 If an expression that represents the number of iterations
707 can be constructed, place it in NUMBER_OF_ITERATIONS.
708 Return the loop exit condition. */
709
710 static gimple
711 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
712 {
713 tree niters;
714
715 if (vect_print_dump_info (REPORT_DETAILS))
716 fprintf (vect_dump, "=== get_loop_niters ===");
717
718 niters = number_of_exit_cond_executions (loop);
719
720 if (niters != NULL_TREE
721 && niters != chrec_dont_know)
722 {
723 *number_of_iterations = niters;
724
725 if (vect_print_dump_info (REPORT_DETAILS))
726 {
727 fprintf (vect_dump, "==> get_loop_niters:" );
728 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
729 }
730 }
731
732 return get_loop_exit_condition (loop);
733 }
734
735
736 /* Function bb_in_loop_p
737
738 Used as predicate for dfs order traversal of the loop bbs. */
739
740 static bool
741 bb_in_loop_p (const_basic_block bb, const void *data)
742 {
743 const struct loop *const loop = (const struct loop *)data;
744 if (flow_bb_inside_loop_p (loop, bb))
745 return true;
746 return false;
747 }
748
749
750 /* Function new_loop_vec_info.
751
752 Create and initialize a new loop_vec_info struct for LOOP, as well as
753 stmt_vec_info structs for all the stmts in LOOP. */
754
755 static loop_vec_info
756 new_loop_vec_info (struct loop *loop)
757 {
758 loop_vec_info res;
759 basic_block *bbs;
760 gimple_stmt_iterator si;
761 unsigned int i, nbbs;
762
763 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
764 LOOP_VINFO_LOOP (res) = loop;
765
766 bbs = get_loop_body (loop);
767
768 /* Create/Update stmt_info for all stmts in the loop. */
769 for (i = 0; i < loop->num_nodes; i++)
770 {
771 basic_block bb = bbs[i];
772
773 /* BBs in a nested inner-loop will have been already processed (because
774 we will have called vect_analyze_loop_form for any nested inner-loop).
775 Therefore, for stmts in an inner-loop we just want to update the
776 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
777 loop_info of the outer-loop we are currently considering to vectorize
778 (instead of the loop_info of the inner-loop).
779 For stmts in other BBs we need to create a stmt_info from scratch. */
780 if (bb->loop_father != loop)
781 {
782 /* Inner-loop bb. */
783 gcc_assert (loop->inner && bb->loop_father == loop->inner);
784 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
785 {
786 gimple phi = gsi_stmt (si);
787 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
788 loop_vec_info inner_loop_vinfo =
789 STMT_VINFO_LOOP_VINFO (stmt_info);
790 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
791 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
792 }
793 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
794 {
795 gimple stmt = gsi_stmt (si);
796 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
797 loop_vec_info inner_loop_vinfo =
798 STMT_VINFO_LOOP_VINFO (stmt_info);
799 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
800 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
801 }
802 }
803 else
804 {
805 /* bb in current nest. */
806 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
807 {
808 gimple phi = gsi_stmt (si);
809 gimple_set_uid (phi, 0);
810 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
811 }
812
813 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
814 {
815 gimple stmt = gsi_stmt (si);
816 gimple_set_uid (stmt, 0);
817 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
818 }
819 }
820 }
821
822 /* CHECKME: We want to visit all BBs before their successors (except for
823 latch blocks, for which this assertion wouldn't hold). In the simple
824 case of the loop forms we allow, a dfs order of the BBs would be the same
825 as reversed postorder traversal, so we are safe. */
826
827 free (bbs);
828 bbs = XCNEWVEC (basic_block, loop->num_nodes);
829 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
830 bbs, loop->num_nodes, loop);
831 gcc_assert (nbbs == loop->num_nodes);
832
833 LOOP_VINFO_BBS (res) = bbs;
834 LOOP_VINFO_NITERS (res) = NULL;
835 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
836 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
837 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
838 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
839 LOOP_VINFO_VECT_FACTOR (res) = 0;
840 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
841 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
842 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
843 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
844 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
845 VEC_alloc (gimple, heap,
846 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
847 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
848 VEC_alloc (ddr_p, heap,
849 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
850 LOOP_VINFO_GROUPED_STORES (res) = VEC_alloc (gimple, heap, 10);
851 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
852 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
853 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
854 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
855 LOOP_VINFO_PEELING_HTAB (res) = NULL;
856 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
857
858 return res;
859 }
860
861
862 /* Function destroy_loop_vec_info.
863
864 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
865 stmts in the loop. */
866
867 void
868 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
869 {
870 struct loop *loop;
871 basic_block *bbs;
872 int nbbs;
873 gimple_stmt_iterator si;
874 int j;
875 VEC (slp_instance, heap) *slp_instances;
876 slp_instance instance;
877
878 if (!loop_vinfo)
879 return;
880
881 loop = LOOP_VINFO_LOOP (loop_vinfo);
882
883 bbs = LOOP_VINFO_BBS (loop_vinfo);
884 nbbs = loop->num_nodes;
885
886 if (!clean_stmts)
887 {
888 free (LOOP_VINFO_BBS (loop_vinfo));
889 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
890 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
891 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
892 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
893 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
894
895 free (loop_vinfo);
896 loop->aux = NULL;
897 return;
898 }
899
900 for (j = 0; j < nbbs; j++)
901 {
902 basic_block bb = bbs[j];
903 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
904 free_stmt_vec_info (gsi_stmt (si));
905
906 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
907 {
908 gimple stmt = gsi_stmt (si);
909 /* Free stmt_vec_info. */
910 free_stmt_vec_info (stmt);
911 gsi_next (&si);
912 }
913 }
914
915 free (LOOP_VINFO_BBS (loop_vinfo));
916 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
917 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
918 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
919 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
920 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
921 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
922 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
923 vect_free_slp_instance (instance);
924
925 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
926 VEC_free (gimple, heap, LOOP_VINFO_GROUPED_STORES (loop_vinfo));
927 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
928 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
929
930 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
931 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
932
933 free (loop_vinfo);
934 loop->aux = NULL;
935 }
936
937
938 /* Function vect_analyze_loop_1.
939
940 Apply a set of analyses on LOOP, and create a loop_vec_info struct
941 for it. The different analyses will record information in the
942 loop_vec_info struct. This is a subset of the analyses applied in
943 vect_analyze_loop, to be applied on an inner-loop nested in the loop
944 that is now considered for (outer-loop) vectorization. */
945
946 static loop_vec_info
947 vect_analyze_loop_1 (struct loop *loop)
948 {
949 loop_vec_info loop_vinfo;
950
951 if (vect_print_dump_info (REPORT_DETAILS))
952 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
953
954 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
955
956 loop_vinfo = vect_analyze_loop_form (loop);
957 if (!loop_vinfo)
958 {
959 if (vect_print_dump_info (REPORT_DETAILS))
960 fprintf (vect_dump, "bad inner-loop form.");
961 return NULL;
962 }
963
964 return loop_vinfo;
965 }
966
967
968 /* Function vect_analyze_loop_form.
969
970 Verify that certain CFG restrictions hold, including:
971 - the loop has a pre-header
972 - the loop has a single entry and exit
973 - the loop exit condition is simple enough, and the number of iterations
974 can be analyzed (a countable loop). */
975
976 loop_vec_info
977 vect_analyze_loop_form (struct loop *loop)
978 {
979 loop_vec_info loop_vinfo;
980 gimple loop_cond;
981 tree number_of_iterations = NULL;
982 loop_vec_info inner_loop_vinfo = NULL;
983
984 if (vect_print_dump_info (REPORT_DETAILS))
985 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
986
987 /* Different restrictions apply when we are considering an inner-most loop,
988 vs. an outer (nested) loop.
989 (FORNOW. May want to relax some of these restrictions in the future). */
990
991 if (!loop->inner)
992 {
993 /* Inner-most loop. We currently require that the number of BBs is
994 exactly 2 (the header and latch). Vectorizable inner-most loops
995 look like this:
996
997 (pre-header)
998 |
999 header <--------+
1000 | | |
1001 | +--> latch --+
1002 |
1003 (exit-bb) */
1004
1005 if (loop->num_nodes != 2)
1006 {
1007 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1008 fprintf (vect_dump, "not vectorized: control flow in loop.");
1009 return NULL;
1010 }
1011
1012 if (empty_block_p (loop->header))
1013 {
1014 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1015 fprintf (vect_dump, "not vectorized: empty loop.");
1016 return NULL;
1017 }
1018 }
1019 else
1020 {
1021 struct loop *innerloop = loop->inner;
1022 edge entryedge;
1023
1024 /* Nested loop. We currently require that the loop is doubly-nested,
1025 contains a single inner loop, and the number of BBs is exactly 5.
1026 Vectorizable outer-loops look like this:
1027
1028 (pre-header)
1029 |
1030 header <---+
1031 | |
1032 inner-loop |
1033 | |
1034 tail ------+
1035 |
1036 (exit-bb)
1037
1038 The inner-loop has the properties expected of inner-most loops
1039 as described above. */
1040
1041 if ((loop->inner)->inner || (loop->inner)->next)
1042 {
1043 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1044 fprintf (vect_dump, "not vectorized: multiple nested loops.");
1045 return NULL;
1046 }
1047
1048 /* Analyze the inner-loop. */
1049 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1050 if (!inner_loop_vinfo)
1051 {
1052 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1053 fprintf (vect_dump, "not vectorized: Bad inner loop.");
1054 return NULL;
1055 }
1056
1057 if (!expr_invariant_in_loop_p (loop,
1058 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1059 {
1060 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1061 fprintf (vect_dump,
1062 "not vectorized: inner-loop count not invariant.");
1063 destroy_loop_vec_info (inner_loop_vinfo, true);
1064 return NULL;
1065 }
1066
1067 if (loop->num_nodes != 5)
1068 {
1069 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1070 fprintf (vect_dump, "not vectorized: control flow in loop.");
1071 destroy_loop_vec_info (inner_loop_vinfo, true);
1072 return NULL;
1073 }
1074
1075 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1076 entryedge = EDGE_PRED (innerloop->header, 0);
1077 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1078 entryedge = EDGE_PRED (innerloop->header, 1);
1079
1080 if (entryedge->src != loop->header
1081 || !single_exit (innerloop)
1082 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1083 {
1084 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1085 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1086 destroy_loop_vec_info (inner_loop_vinfo, true);
1087 return NULL;
1088 }
1089
1090 if (vect_print_dump_info (REPORT_DETAILS))
1091 fprintf (vect_dump, "Considering outer-loop vectorization.");
1092 }
1093
1094 if (!single_exit (loop)
1095 || EDGE_COUNT (loop->header->preds) != 2)
1096 {
1097 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1098 {
1099 if (!single_exit (loop))
1100 fprintf (vect_dump, "not vectorized: multiple exits.");
1101 else if (EDGE_COUNT (loop->header->preds) != 2)
1102 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1103 }
1104 if (inner_loop_vinfo)
1105 destroy_loop_vec_info (inner_loop_vinfo, true);
1106 return NULL;
1107 }
1108
1109 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1110 that the loop is represented as a do-while (with a proper if-guard
1111 before the loop if needed), where the loop header contains all the
1112 executable statements, and the latch is empty. */
1113 if (!empty_block_p (loop->latch)
1114 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1115 {
1116 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1117 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1118 if (inner_loop_vinfo)
1119 destroy_loop_vec_info (inner_loop_vinfo, true);
1120 return NULL;
1121 }
1122
1123 /* Make sure there exists a single-predecessor exit bb: */
1124 if (!single_pred_p (single_exit (loop)->dest))
1125 {
1126 edge e = single_exit (loop);
1127 if (!(e->flags & EDGE_ABNORMAL))
1128 {
1129 split_loop_exit_edge (e);
1130 if (vect_print_dump_info (REPORT_DETAILS))
1131 fprintf (vect_dump, "split exit edge.");
1132 }
1133 else
1134 {
1135 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1136 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1137 if (inner_loop_vinfo)
1138 destroy_loop_vec_info (inner_loop_vinfo, true);
1139 return NULL;
1140 }
1141 }
1142
1143 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1144 if (!loop_cond)
1145 {
1146 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1147 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1148 if (inner_loop_vinfo)
1149 destroy_loop_vec_info (inner_loop_vinfo, true);
1150 return NULL;
1151 }
1152
1153 if (!number_of_iterations)
1154 {
1155 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1156 fprintf (vect_dump,
1157 "not vectorized: number of iterations cannot be computed.");
1158 if (inner_loop_vinfo)
1159 destroy_loop_vec_info (inner_loop_vinfo, true);
1160 return NULL;
1161 }
1162
1163 if (chrec_contains_undetermined (number_of_iterations))
1164 {
1165 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1166 fprintf (vect_dump, "Infinite number of iterations.");
1167 if (inner_loop_vinfo)
1168 destroy_loop_vec_info (inner_loop_vinfo, true);
1169 return NULL;
1170 }
1171
1172 if (!NITERS_KNOWN_P (number_of_iterations))
1173 {
1174 if (vect_print_dump_info (REPORT_DETAILS))
1175 {
1176 fprintf (vect_dump, "Symbolic number of iterations is ");
1177 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1178 }
1179 }
1180 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1181 {
1182 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1183 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1184 if (inner_loop_vinfo)
1185 destroy_loop_vec_info (inner_loop_vinfo, false);
1186 return NULL;
1187 }
1188
1189 loop_vinfo = new_loop_vec_info (loop);
1190 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1191 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1192
1193 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1194
1195 /* CHECKME: May want to keep it around in the future. */
1196 if (inner_loop_vinfo)
1197 destroy_loop_vec_info (inner_loop_vinfo, false);
1198
1199 gcc_assert (!loop->aux);
1200 loop->aux = loop_vinfo;
1201 return loop_vinfo;
1202 }
1203
1204
1205 /* Get cost by calling cost target builtin. */
1206
1207 static inline int
1208 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1209 {
1210 tree dummy_type = NULL;
1211 int dummy = 0;
1212
1213 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1214 dummy_type, dummy);
1215 }
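
/* For example (illustration only): vect_get_cost (cond_branch_taken) asks the
   target for its relative cost estimate of a taken branch; the cost model
   uses such estimates when weighing runtime checks against the expected
   benefit of the vector loop.  */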
1216
1217
1218 /* Function vect_analyze_loop_operations.
1219
1220 Scan the loop stmts and make sure they are all vectorizable. */
1221
1222 static bool
1223 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1224 {
1225 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1226 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1227 int nbbs = loop->num_nodes;
1228 gimple_stmt_iterator si;
1229 unsigned int vectorization_factor = 0;
1230 int i;
1231 gimple phi;
1232 stmt_vec_info stmt_info;
1233 bool need_to_vectorize = false;
1234 int min_profitable_iters;
1235 int min_scalar_loop_bound;
1236 unsigned int th;
1237 bool only_slp_in_loop = true, ok;
1238 HOST_WIDE_INT max_niter;
1239
1240 if (vect_print_dump_info (REPORT_DETAILS))
1241 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1242
1243 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1244 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1245 if (slp)
1246 {
1247 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1248 vectorization factor of the loop is the unrolling factor required by
1249 the SLP instances. If that unrolling factor is 1, we say that we
1250 perform pure SLP on the loop - cross-iteration parallelism is not
1251 exploited. */
1252 for (i = 0; i < nbbs; i++)
1253 {
1254 basic_block bb = bbs[i];
1255 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1256 {
1257 gimple stmt = gsi_stmt (si);
1258 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1259 gcc_assert (stmt_info);
1260 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1261 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1262 && !PURE_SLP_STMT (stmt_info))
1263 /* STMT needs both SLP and loop-based vectorization. */
1264 only_slp_in_loop = false;
1265 }
1266 }
1267
1268 if (only_slp_in_loop)
1269 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1270 else
1271 vectorization_factor = least_common_multiple (vectorization_factor,
1272 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1273
1274 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1275 if (vect_print_dump_info (REPORT_DETAILS))
1276 fprintf (vect_dump, "Updating vectorization factor to %d ",
1277 vectorization_factor);
1278 }
1279
1280 for (i = 0; i < nbbs; i++)
1281 {
1282 basic_block bb = bbs[i];
1283
1284 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1285 {
1286 phi = gsi_stmt (si);
1287 ok = true;
1288
1289 stmt_info = vinfo_for_stmt (phi);
1290 if (vect_print_dump_info (REPORT_DETAILS))
1291 {
1292 fprintf (vect_dump, "examining phi: ");
1293 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1294 }
1295
1296 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1297 (i.e., a phi in the tail of the outer-loop). */
1298 if (! is_loop_header_bb_p (bb))
1299 {
1300 /* FORNOW: we currently don't support the case that these phis
1301 are not used in the outer-loop (unless it is a double reduction,
1302 i.e., this phi is vect_reduction_def), because this case
1303 requires us to actually do something here. */
1304 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1305 || STMT_VINFO_LIVE_P (stmt_info))
1306 && STMT_VINFO_DEF_TYPE (stmt_info)
1307 != vect_double_reduction_def)
1308 {
1309 if (vect_print_dump_info (REPORT_DETAILS))
1310 fprintf (vect_dump,
1311 "Unsupported loop-closed phi in outer-loop.");
1312 return false;
1313 }
1314
1315 /* If PHI is used in the outer loop, we check that its operand
1316 is defined in the inner loop. */
1317 if (STMT_VINFO_RELEVANT_P (stmt_info))
1318 {
1319 tree phi_op;
1320 gimple op_def_stmt;
1321
1322 if (gimple_phi_num_args (phi) != 1)
1323 return false;
1324
1325 phi_op = PHI_ARG_DEF (phi, 0);
1326 if (TREE_CODE (phi_op) != SSA_NAME)
1327 return false;
1328
1329 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1330 if (!op_def_stmt
1331 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1332 || !vinfo_for_stmt (op_def_stmt))
1333 return false;
1334
1335 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1336 != vect_used_in_outer
1337 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1338 != vect_used_in_outer_by_reduction)
1339 return false;
1340 }
1341
1342 continue;
1343 }
1344
1345 gcc_assert (stmt_info);
1346
1347 if (STMT_VINFO_LIVE_P (stmt_info))
1348 {
1349 /* FORNOW: not yet supported. */
1350 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1351 fprintf (vect_dump, "not vectorized: value used after loop.");
1352 return false;
1353 }
1354
1355 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1356 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1357 {
1358 /* A scalar-dependence cycle that we don't support. */
1359 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1360 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1361 return false;
1362 }
1363
1364 if (STMT_VINFO_RELEVANT_P (stmt_info))
1365 {
1366 need_to_vectorize = true;
1367 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1368 ok = vectorizable_induction (phi, NULL, NULL);
1369 }
1370
1371 if (!ok)
1372 {
1373 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1374 {
1375 fprintf (vect_dump,
1376 "not vectorized: relevant phi not supported: ");
1377 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1378 }
1379 return false;
1380 }
1381 }
1382
1383 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1384 {
1385 gimple stmt = gsi_stmt (si);
1386 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1387 return false;
1388 }
1389 } /* bbs */
1390
1391 /* All operations in the loop are either irrelevant (they deal with loop
1392 control, or are dead), or are only used outside the loop and can be moved
1393 out of the loop (e.g. invariants, inductions). The loop can be
1394 optimized away by scalar optimizations. We're better off not
1395 touching this loop. */
1396 if (!need_to_vectorize)
1397 {
1398 if (vect_print_dump_info (REPORT_DETAILS))
1399 fprintf (vect_dump,
1400 "All the computation can be taken out of the loop.");
1401 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1402 fprintf (vect_dump,
1403 "not vectorized: redundant loop. no profit to vectorize.");
1404 return false;
1405 }
1406
1407 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1408 && vect_print_dump_info (REPORT_DETAILS))
1409 fprintf (vect_dump,
1410 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1411 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1412
1413 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1414 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1415 || ((max_niter = max_stmt_executions_int (loop)) != -1
1416 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1417 {
1418 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1419 fprintf (vect_dump, "not vectorized: iteration count too small.");
1420 if (vect_print_dump_info (REPORT_DETAILS))
1421 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1422 "vectorization factor.");
1423 return false;
1424 }
1425
1426 /* Analyze cost. Decide if worth while to vectorize. */
1427
1428 /* Once VF is set, SLP costs should be updated since the number of created
1429 vector stmts depends on VF. */
1430 vect_update_slp_costs_according_to_vf (loop_vinfo);
1431
1432 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1433 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1434
1435 if (min_profitable_iters < 0)
1436 {
1437 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1438 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1439 if (vect_print_dump_info (REPORT_DETAILS))
1440 fprintf (vect_dump, "not vectorized: vector version will never be "
1441 "profitable.");
1442 return false;
1443 }
1444
1445 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1446 * vectorization_factor) - 1);
1447
1448 /* Use the cost model only if it is more conservative than the
1449 user-specified threshold. */
1450
1451 th = (unsigned) min_scalar_loop_bound;
1452 if (min_profitable_iters
1453 && (!min_scalar_loop_bound
1454 || min_profitable_iters > min_scalar_loop_bound))
1455 th = (unsigned) min_profitable_iters;
1456
1457 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1458 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1459 {
1460 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1461 fprintf (vect_dump, "not vectorized: vectorization not "
1462 "profitable.");
1463 if (vect_print_dump_info (REPORT_DETAILS))
1464 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1465 "user specified loop bound parameter or minimum "
1466 "profitable iterations (whichever is more conservative).");
1467 return false;
1468 }
1469
1470 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1471 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1472 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1473 {
1474 if (vect_print_dump_info (REPORT_DETAILS))
1475 fprintf (vect_dump, "epilog loop required.");
1476 if (!vect_can_advance_ivs_p (loop_vinfo))
1477 {
1478 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1479 fprintf (vect_dump,
1480 "not vectorized: can't create epilog loop 1.");
1481 return false;
1482 }
1483 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1484 {
1485 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1486 fprintf (vect_dump,
1487 "not vectorized: can't create epilog loop 2.");
1488 return false;
1489 }
1490 }
1491
1492 return true;
1493 }
1494
1495
1496 /* Function vect_analyze_loop_2.
1497
1498 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1499 for it. The different analyses will record information in the
1500 loop_vec_info struct. */
1501 static bool
1502 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1503 {
1504 bool ok, slp = false;
1505 int max_vf = MAX_VECTORIZATION_FACTOR;
1506 int min_vf = 2;
1507
1508 /* Find all data references in the loop (which correspond to vdefs/vuses)
1509 and analyze their evolution in the loop. Also adjust the minimal
1510 vectorization factor according to the loads and stores.
1511
1512 FORNOW: Handle only simple array references whose
1513 alignment can be forced, and aligned pointer-references. */
1514
1515 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1516 if (!ok)
1517 {
1518 if (vect_print_dump_info (REPORT_DETAILS))
1519 fprintf (vect_dump, "bad data references.");
1520 return false;
1521 }
1522
1523 /* Classify all cross-iteration scalar data-flow cycles.
1524 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1525
1526 vect_analyze_scalar_cycles (loop_vinfo);
1527
1528 vect_pattern_recog (loop_vinfo, NULL);
1529
1530 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1531
1532 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1533 if (!ok)
1534 {
1535 if (vect_print_dump_info (REPORT_DETAILS))
1536 fprintf (vect_dump, "unexpected pattern.");
1537 return false;
1538 }
1539
1540 /* Analyze data dependences between the data-refs in the loop
1541 and adjust the maximum vectorization factor according to
1542 the dependences.
1543 FORNOW: fail at the first data dependence that we encounter. */
1544
1545 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1546 if (!ok
1547 || max_vf < min_vf)
1548 {
1549 if (vect_print_dump_info (REPORT_DETAILS))
1550 fprintf (vect_dump, "bad data dependence.");
1551 return false;
1552 }
1553
1554 ok = vect_determine_vectorization_factor (loop_vinfo);
1555 if (!ok)
1556 {
1557 if (vect_print_dump_info (REPORT_DETAILS))
1558 fprintf (vect_dump, "can't determine vectorization factor.");
1559 return false;
1560 }
1561 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1562 {
1563 if (vect_print_dump_info (REPORT_DETAILS))
1564 fprintf (vect_dump, "bad data dependence.");
1565 return false;
1566 }
1567
1568 /* Analyze the alignment of the data-refs in the loop.
1569 Fail if a data reference is found that cannot be vectorized. */
1570
1571 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1572 if (!ok)
1573 {
1574 if (vect_print_dump_info (REPORT_DETAILS))
1575 fprintf (vect_dump, "bad data alignment.");
1576 return false;
1577 }
1578
1579 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1580 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1581
1582 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1583 if (!ok)
1584 {
1585 if (vect_print_dump_info (REPORT_DETAILS))
1586 fprintf (vect_dump, "bad data access.");
1587 return false;
1588 }
1589
1590 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1591 It is important to call pruning after vect_analyze_data_ref_accesses,
1592 since we use grouping information gathered by interleaving analysis. */
1593 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1594 if (!ok)
1595 {
1596 if (vect_print_dump_info (REPORT_DETAILS))
1597 fprintf (vect_dump, "too long list of versioning for alias "
1598 "run-time tests.");
1599 return false;
1600 }
1601
1602 /* This pass will decide on using loop versioning and/or loop peeling in
1603 order to enhance the alignment of data references in the loop. */
1604
1605 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1606 if (!ok)
1607 {
1608 if (vect_print_dump_info (REPORT_DETAILS))
1609 fprintf (vect_dump, "bad data alignment.");
1610 return false;
1611 }
1612
1613 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1614 ok = vect_analyze_slp (loop_vinfo, NULL);
1615 if (ok)
1616 {
1617 /* Decide which possible SLP instances to SLP. */
1618 slp = vect_make_slp_decision (loop_vinfo);
1619
1620 /* Find stmts that need to be both vectorized and SLPed. */
1621 vect_detect_hybrid_slp (loop_vinfo);
1622 }
1623 else
1624 return false;
1625
1626 /* Scan all the operations in the loop and make sure they are
1627 vectorizable. */
1628
1629 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1630 if (!ok)
1631 {
1632 if (vect_print_dump_info (REPORT_DETAILS))
1633 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1634 return false;
1635 }
1636
1637 return true;
1638 }
1639
1640 /* Function vect_analyze_loop.
1641
1642 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1643 for it. The different analyses will record information in the
1644 loop_vec_info struct. */
1645 loop_vec_info
1646 vect_analyze_loop (struct loop *loop)
1647 {
1648 loop_vec_info loop_vinfo;
1649 unsigned int vector_sizes;
1650
1651 /* Autodetect first vector size we try. */
1652 current_vector_size = 0;
1653 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1654
1655 if (vect_print_dump_info (REPORT_DETAILS))
1656 fprintf (vect_dump, "===== analyze_loop_nest =====");
1657
1658 if (loop_outer (loop)
1659 && loop_vec_info_for_loop (loop_outer (loop))
1660 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1661 {
1662 if (vect_print_dump_info (REPORT_DETAILS))
1663 fprintf (vect_dump, "outer-loop already vectorized.");
1664 return NULL;
1665 }
1666
1667 while (1)
1668 {
1669 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1670 loop_vinfo = vect_analyze_loop_form (loop);
1671 if (!loop_vinfo)
1672 {
1673 if (vect_print_dump_info (REPORT_DETAILS))
1674 fprintf (vect_dump, "bad loop form.");
1675 return NULL;
1676 }
1677
1678 if (vect_analyze_loop_2 (loop_vinfo))
1679 {
1680 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1681
1682 return loop_vinfo;
1683 }
1684
1685 destroy_loop_vec_info (loop_vinfo, true);
1686
1687 vector_sizes &= ~current_vector_size;
1688 if (vector_sizes == 0
1689 || current_vector_size == 0)
1690 return NULL;
1691
1692 /* Try the next biggest vector size. */
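/* For instance (illustration): if the target advertises 32-byte and 16-byte
   vectors, vector_sizes starts out as 0x30; after a failed attempt at
   32 bytes the mask becomes 0x10 and the next iteration retries the
   analysis with current_vector_size == 16.  */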
1693 current_vector_size = 1 << floor_log2 (vector_sizes);
1694 if (vect_print_dump_info (REPORT_DETAILS))
1695 fprintf (vect_dump, "***** Re-trying analysis with "
1696 "vector size %d\n", current_vector_size);
1697 }
1698 }
1699
1700
1701 /* Function reduction_code_for_scalar_code
1702
1703 Input:
1704 CODE - tree_code of a reduction operation.
1705
1706 Output:
1707 REDUC_CODE - the corresponding tree-code to be used to reduce the
1708 vector of partial results into a single scalar result (which
1709 will also reside in a vector) or ERROR_MARK if the operation is
1710 a supported reduction operation, but does not have such tree-code.
1711
1712 Return FALSE if CODE currently cannot be vectorized as reduction. */
1713
1714 static bool
1715 reduction_code_for_scalar_code (enum tree_code code,
1716 enum tree_code *reduc_code)
1717 {
1718 switch (code)
1719 {
1720 case MAX_EXPR:
1721 *reduc_code = REDUC_MAX_EXPR;
1722 return true;
1723
1724 case MIN_EXPR:
1725 *reduc_code = REDUC_MIN_EXPR;
1726 return true;
1727
1728 case PLUS_EXPR:
1729 *reduc_code = REDUC_PLUS_EXPR;
1730 return true;
1731
1732 case MULT_EXPR:
1733 case MINUS_EXPR:
1734 case BIT_IOR_EXPR:
1735 case BIT_XOR_EXPR:
1736 case BIT_AND_EXPR:
1737 *reduc_code = ERROR_MARK;
1738 return true;
1739
1740 default:
1741 return false;
1742 }
1743 }
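
/* Illustrative use of the above (a sketch, not code from this file): for a
   sum reduction a caller would do something like

     enum tree_code reduc_code;
     if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
       ... use reduc_code (REDUC_PLUS_EXPR here) to fold the vector of
           partial sums into a single scalar result ...
*/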
1744
1745
1746 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1747 STMT is printed with a message MSG. */
1748
1749 static void
1750 report_vect_op (gimple stmt, const char *msg)
1751 {
1752 fprintf (vect_dump, "%s", msg);
1753 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1754 }
1755
1756
1757 /* Detect SLP reduction of the form:
1758
1759 #a1 = phi <a5, a0>
1760 a2 = operation (a1)
1761 a3 = operation (a2)
1762 a4 = operation (a3)
1763 a5 = operation (a4)
1764
1765 #a = phi <a5>
1766
1767 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1768 FIRST_STMT is the first reduction stmt in the chain
1769 (a2 = operation (a1)).
1770
1771 Return TRUE if a reduction chain was detected. */
1772
1773 static bool
1774 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1775 {
1776 struct loop *loop = (gimple_bb (phi))->loop_father;
1777 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1778 enum tree_code code;
1779 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1780 stmt_vec_info use_stmt_info, current_stmt_info;
1781 tree lhs;
1782 imm_use_iterator imm_iter;
1783 use_operand_p use_p;
1784 int nloop_uses, size = 0, n_out_of_loop_uses;
1785 bool found = false;
1786
1787 if (loop != vect_loop)
1788 return false;
1789
1790 lhs = PHI_RESULT (phi);
1791 code = gimple_assign_rhs_code (first_stmt);
1792 while (1)
1793 {
1794 nloop_uses = 0;
1795 n_out_of_loop_uses = 0;
1796 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1797 {
1798 gimple use_stmt = USE_STMT (use_p);
1799 if (is_gimple_debug (use_stmt))
1800 continue;
1801
1802 use_stmt = USE_STMT (use_p);
1803
1804 /* Check if we got back to the reduction phi. */
1805 if (use_stmt == phi)
1806 {
1807 loop_use_stmt = use_stmt;
1808 found = true;
1809 break;
1810 }
1811
1812 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1813 {
1814 if (vinfo_for_stmt (use_stmt)
1815 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1816 {
1817 loop_use_stmt = use_stmt;
1818 nloop_uses++;
1819 }
1820 }
1821 else
1822 n_out_of_loop_uses++;
1823
1824 /* There can be either a single use in the loop or two uses in
1825 phi nodes. */
1826 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1827 return false;
1828 }
1829
1830 if (found)
1831 break;
1832
1833 /* We reached a statement with no loop uses. */
1834 if (nloop_uses == 0)
1835 return false;
1836
1837 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1838 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1839 return false;
1840
1841 if (!is_gimple_assign (loop_use_stmt)
1842 || code != gimple_assign_rhs_code (loop_use_stmt)
1843 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1844 return false;
1845
1846 /* Insert USE_STMT into reduction chain. */
1847 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1848 if (current_stmt)
1849 {
1850 current_stmt_info = vinfo_for_stmt (current_stmt);
1851 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1852 GROUP_FIRST_ELEMENT (use_stmt_info)
1853 = GROUP_FIRST_ELEMENT (current_stmt_info);
1854 }
1855 else
1856 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1857
1858 lhs = gimple_assign_lhs (loop_use_stmt);
1859 current_stmt = loop_use_stmt;
1860 size++;
1861 }
1862
1863 if (!found || loop_use_stmt != phi || size < 2)
1864 return false;
1865
1866 /* Swap the operands, if needed, to make the reduction operand be the second
1867 operand. */
1868 lhs = PHI_RESULT (phi);
1869 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1870 while (next_stmt)
1871 {
1872 if (gimple_assign_rhs2 (next_stmt) == lhs)
1873 {
1874 tree op = gimple_assign_rhs1 (next_stmt);
1875 gimple def_stmt = NULL;
1876
1877 if (TREE_CODE (op) == SSA_NAME)
1878 def_stmt = SSA_NAME_DEF_STMT (op);
1879
1880 /* Check that the other def is either defined in the loop
1881 ("vect_internal_def"), or it's an induction (defined by a
1882 loop-header phi-node). */
1883 if (def_stmt
1884 && gimple_bb (def_stmt)
1885 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1886 && (is_gimple_assign (def_stmt)
1887 || is_gimple_call (def_stmt)
1888 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1889 == vect_induction_def
1890 || (gimple_code (def_stmt) == GIMPLE_PHI
1891 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1892 == vect_internal_def
1893 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1894 {
1895 lhs = gimple_assign_lhs (next_stmt);
1896 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1897 continue;
1898 }
1899
1900 return false;
1901 }
1902 else
1903 {
1904 tree op = gimple_assign_rhs2 (next_stmt);
1905 gimple def_stmt = NULL;
1906
1907 if (TREE_CODE (op) == SSA_NAME)
1908 def_stmt = SSA_NAME_DEF_STMT (op);
1909
1910 /* Check that the other def is either defined in the loop
1911 ("vect_internal_def"), or it's an induction (defined by a
1912 loop-header phi-node). */
1913 if (def_stmt
1914 && gimple_bb (def_stmt)
1915 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1916 && (is_gimple_assign (def_stmt)
1917 || is_gimple_call (def_stmt)
1918 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1919 == vect_induction_def
1920 || (gimple_code (def_stmt) == GIMPLE_PHI
1921 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1922 == vect_internal_def
1923 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1924 {
1925 if (vect_print_dump_info (REPORT_DETAILS))
1926 {
1927 fprintf (vect_dump, "swapping oprnds: ");
1928 print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
1929 }
1930
1931 swap_tree_operands (next_stmt,
1932 gimple_assign_rhs1_ptr (next_stmt),
1933 gimple_assign_rhs2_ptr (next_stmt));
1934 update_stmt (next_stmt);
1935 }
1936 else
1937 return false;
1938 }
1939
1940 lhs = gimple_assign_lhs (next_stmt);
1941 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1942 }
1943
1944 /* Save the chain for further analysis in SLP detection. */
1945 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1946 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1947 GROUP_SIZE (vinfo_for_stmt (first)) = size;
1948
1949 return true;
1950 }
1951
1952
1953 /* Function vect_is_simple_reduction_1
1954
1955 (1) Detect a cross-iteration def-use cycle that represents a simple
1956 reduction computation. We look for the following pattern:
1957
1958 loop_header:
1959 a1 = phi < a0, a2 >
1960 a3 = ...
1961 a2 = operation (a3, a1)
1962
1963 such that:
1964 1. operation is commutative and associative and it is safe to
1965 change the order of the computation (if CHECK_REDUCTION is true)
1966 2. no uses for a2 in the loop (a2 is used out of the loop)
1967 3. no uses of a1 in the loop besides the reduction operation
1968 4. no uses of a1 outside the loop.
1969
1970 Conditions 1,4 are tested here.
1971 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1972
1973 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1974 nested cycles, if CHECK_REDUCTION is false.
1975
1976 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1977 reductions:
1978
1979 a1 = phi < a0, a2 >
1980 inner loop (def of a3)
1981 a2 = phi < a3 >
1982
1983 If MODIFY is true it tries also to rework the code in-place to enable
1984 detection of more reduction patterns. For the time being we rewrite
1985 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1986 */
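/* As an illustrative example of case (3), a double reduction at the
   source level (when vectorizing the outer loop) can look like:

     s = 0;
     for (j = 0; j < M; j++)
       for (i = 0; i < N; i++)
         s += a[j][i];

   Here the outer-loop phi for S receives its latch value from the
   loop-closed phi of the inner loop.  */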
1987
1988 static gimple
1989 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1990 bool check_reduction, bool *double_reduc,
1991 bool modify)
1992 {
1993 struct loop *loop = (gimple_bb (phi))->loop_father;
1994 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1995 edge latch_e = loop_latch_edge (loop);
1996 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1997 gimple def_stmt, def1 = NULL, def2 = NULL;
1998 enum tree_code orig_code, code;
1999 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2000 tree type;
2001 int nloop_uses;
2002 tree name;
2003 imm_use_iterator imm_iter;
2004 use_operand_p use_p;
2005 bool phi_def;
2006
2007 *double_reduc = false;
2008
2009 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization;
2010 otherwise, we assume outer loop vectorization. */
2011 gcc_assert ((check_reduction && loop == vect_loop)
2012 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2013
2014 name = PHI_RESULT (phi);
2015 nloop_uses = 0;
2016 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2017 {
2018 gimple use_stmt = USE_STMT (use_p);
2019 if (is_gimple_debug (use_stmt))
2020 continue;
2021
2022 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2023 {
2024 if (vect_print_dump_info (REPORT_DETAILS))
2025 fprintf (vect_dump, "intermediate value used outside loop.");
2026
2027 return NULL;
2028 }
2029
2030 if (vinfo_for_stmt (use_stmt)
2031 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2032 nloop_uses++;
2033 if (nloop_uses > 1)
2034 {
2035 if (vect_print_dump_info (REPORT_DETAILS))
2036 fprintf (vect_dump, "reduction used in loop.");
2037 return NULL;
2038 }
2039 }
2040
2041 if (TREE_CODE (loop_arg) != SSA_NAME)
2042 {
2043 if (vect_print_dump_info (REPORT_DETAILS))
2044 {
2045 fprintf (vect_dump, "reduction: not ssa_name: ");
2046 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2047 }
2048 return NULL;
2049 }
2050
2051 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2052 if (!def_stmt)
2053 {
2054 if (vect_print_dump_info (REPORT_DETAILS))
2055 fprintf (vect_dump, "reduction: no def_stmt.");
2056 return NULL;
2057 }
2058
2059 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2060 {
2061 if (vect_print_dump_info (REPORT_DETAILS))
2062 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2063 return NULL;
2064 }
2065
2066 if (is_gimple_assign (def_stmt))
2067 {
2068 name = gimple_assign_lhs (def_stmt);
2069 phi_def = false;
2070 }
2071 else
2072 {
2073 name = PHI_RESULT (def_stmt);
2074 phi_def = true;
2075 }
2076
2077 nloop_uses = 0;
2078 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2079 {
2080 gimple use_stmt = USE_STMT (use_p);
2081 if (is_gimple_debug (use_stmt))
2082 continue;
2083 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2084 && vinfo_for_stmt (use_stmt)
2085 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2086 nloop_uses++;
2087 if (nloop_uses > 1)
2088 {
2089 if (vect_print_dump_info (REPORT_DETAILS))
2090 fprintf (vect_dump, "reduction used in loop.");
2091 return NULL;
2092 }
2093 }
2094
2095 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2096 defined in the inner loop. */
2097 if (phi_def)
2098 {
2099 op1 = PHI_ARG_DEF (def_stmt, 0);
2100
2101 if (gimple_phi_num_args (def_stmt) != 1
2102 || TREE_CODE (op1) != SSA_NAME)
2103 {
2104 if (vect_print_dump_info (REPORT_DETAILS))
2105 fprintf (vect_dump, "unsupported phi node definition.");
2106
2107 return NULL;
2108 }
2109
2110 def1 = SSA_NAME_DEF_STMT (op1);
2111 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2112 && loop->inner
2113 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2114 && is_gimple_assign (def1))
2115 {
2116 if (vect_print_dump_info (REPORT_DETAILS))
2117 report_vect_op (def_stmt, "detected double reduction: ");
2118
2119 *double_reduc = true;
2120 return def_stmt;
2121 }
2122
2123 return NULL;
2124 }
2125
2126 code = orig_code = gimple_assign_rhs_code (def_stmt);
2127
2128 /* We can handle "res -= x[i]", which is non-associative, by
2129 simply rewriting it into "res += -x[i]". Avoid changing the
2130 gimple instruction for the first simple tests and only do this
2131 if we're allowed to change code at all. */
2132 if (code == MINUS_EXPR
2133 && modify
2134 && (op1 = gimple_assign_rhs1 (def_stmt))
2135 && TREE_CODE (op1) == SSA_NAME
2136 && SSA_NAME_DEF_STMT (op1) == phi)
2137 code = PLUS_EXPR;
2138
2139 if (check_reduction
2140 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2141 {
2142 if (vect_print_dump_info (REPORT_DETAILS))
2143 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2144 return NULL;
2145 }
2146
2147 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2148 {
2149 if (code != COND_EXPR)
2150 {
2151 if (vect_print_dump_info (REPORT_DETAILS))
2152 report_vect_op (def_stmt, "reduction: not binary operation: ");
2153
2154 return NULL;
2155 }
2156
2157 op3 = gimple_assign_rhs1 (def_stmt);
2158 if (COMPARISON_CLASS_P (op3))
2159 {
2160 op4 = TREE_OPERAND (op3, 1);
2161 op3 = TREE_OPERAND (op3, 0);
2162 }
2163
2164 op1 = gimple_assign_rhs2 (def_stmt);
2165 op2 = gimple_assign_rhs3 (def_stmt);
2166
2167 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2168 {
2169 if (vect_print_dump_info (REPORT_DETAILS))
2170 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2171
2172 return NULL;
2173 }
2174 }
2175 else
2176 {
2177 op1 = gimple_assign_rhs1 (def_stmt);
2178 op2 = gimple_assign_rhs2 (def_stmt);
2179
2180 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2181 {
2182 if (vect_print_dump_info (REPORT_DETAILS))
2183 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2184
2185 return NULL;
2186 }
2187 }
2188
2189 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2190 if ((TREE_CODE (op1) == SSA_NAME
2191 && !types_compatible_p (type,TREE_TYPE (op1)))
2192 || (TREE_CODE (op2) == SSA_NAME
2193 && !types_compatible_p (type, TREE_TYPE (op2)))
2194 || (op3 && TREE_CODE (op3) == SSA_NAME
2195 && !types_compatible_p (type, TREE_TYPE (op3)))
2196 || (op4 && TREE_CODE (op4) == SSA_NAME
2197 && !types_compatible_p (type, TREE_TYPE (op4))))
2198 {
2199 if (vect_print_dump_info (REPORT_DETAILS))
2200 {
2201 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2202 print_generic_expr (vect_dump, type, TDF_SLIM);
2203 fprintf (vect_dump, ", operands types: ");
2204 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2205 fprintf (vect_dump, ",");
2206 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2207 if (op3)
2208 {
2209 fprintf (vect_dump, ",");
2210 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2211 }
2212
2213 if (op4)
2214 {
2215 fprintf (vect_dump, ",");
2216 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2217 }
2218 }
2219
2220 return NULL;
2221 }
2222
2223 /* Check that it's ok to change the order of the computation.
2224 Generally, when vectorizing a reduction we change the order of the
2225 computation. This may change the behavior of the program in some
2226 cases, so we need to check that this is ok. One exception is when
2227 vectorizing an outer-loop: the inner-loop is executed sequentially,
2228 and therefore vectorizing reductions in the inner-loop during
2229 outer-loop vectorization is safe. */
2230
2231 /* CHECKME: check for !flag_finite_math_only too? */
2232 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2233 && check_reduction)
2234 {
2235 /* Changing the order of operations changes the semantics. */
2236 if (vect_print_dump_info (REPORT_DETAILS))
2237 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2238 return NULL;
2239 }
2240 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2241 && check_reduction)
2242 {
2243 /* Changing the order of operations changes the semantics. */
2244 if (vect_print_dump_info (REPORT_DETAILS))
2245 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2246 return NULL;
2247 }
2248 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2249 {
2250 /* Changing the order of operations changes the semantics. */
2251 if (vect_print_dump_info (REPORT_DETAILS))
2252 report_vect_op (def_stmt,
2253 "reduction: unsafe fixed-point math optimization: ");
2254 return NULL;
2255 }
2256
2257 /* If we detected "res -= x[i]" earlier, rewrite it into
2258 "res += -x[i]" now. If this turns out to be useless reassoc
2259 will clean it up again. */
2260 if (orig_code == MINUS_EXPR)
2261 {
2262 tree rhs = gimple_assign_rhs2 (def_stmt);
2263 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2264 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2265 rhs, NULL);
2266 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2267 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2268 loop_info, NULL));
2269 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2270 gimple_assign_set_rhs2 (def_stmt, negrhs);
2271 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2272 update_stmt (def_stmt);
2273 }
2274
2275 /* Reduction is safe. We're dealing with one of the following:
2276 1) integer arithmetic and no trapv
2277 2) floating point arithmetic, and special flags permit this optimization
2278 3) nested cycle (i.e., outer loop vectorization). */
2279 if (TREE_CODE (op1) == SSA_NAME)
2280 def1 = SSA_NAME_DEF_STMT (op1);
2281
2282 if (TREE_CODE (op2) == SSA_NAME)
2283 def2 = SSA_NAME_DEF_STMT (op2);
2284
2285 if (code != COND_EXPR
2286 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2287 {
2288 if (vect_print_dump_info (REPORT_DETAILS))
2289 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2290 return NULL;
2291 }
2292
2293 /* Check that one def is the reduction def, defined by PHI,
2294 the other def is either defined in the loop ("vect_internal_def"),
2295 or it's an induction (defined by a loop-header phi-node). */
2296
2297 if (def2 && def2 == phi
2298 && (code == COND_EXPR
2299 || !def1 || gimple_nop_p (def1)
2300 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2301 && (is_gimple_assign (def1)
2302 || is_gimple_call (def1)
2303 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2304 == vect_induction_def
2305 || (gimple_code (def1) == GIMPLE_PHI
2306 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2307 == vect_internal_def
2308 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2309 {
2310 if (vect_print_dump_info (REPORT_DETAILS))
2311 report_vect_op (def_stmt, "detected reduction: ");
2312 return def_stmt;
2313 }
2314
2315 if (def1 && def1 == phi
2316 && (code == COND_EXPR
2317 || !def2 || gimple_nop_p (def2)
2318 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2319 && (is_gimple_assign (def2)
2320 || is_gimple_call (def2)
2321 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2322 == vect_induction_def
2323 || (gimple_code (def2) == GIMPLE_PHI
2324 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2325 == vect_internal_def
2326 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2327 {
2328 if (check_reduction)
2329 {
2330 /* Swap operands (just for simplicity - so that the rest of the code
2331 can assume that the reduction variable is always the last (second)
2332 argument). */
2333 if (vect_print_dump_info (REPORT_DETAILS))
2334 report_vect_op (def_stmt,
2335 "detected reduction: need to swap operands: ");
2336
2337 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2338 gimple_assign_rhs2_ptr (def_stmt));
2339 }
2340 else
2341 {
2342 if (vect_print_dump_info (REPORT_DETAILS))
2343 report_vect_op (def_stmt, "detected reduction: ");
2344 }
2345
2346 return def_stmt;
2347 }
2348
2349 /* Try to find SLP reduction chain. */
2350 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2351 {
2352 if (vect_print_dump_info (REPORT_DETAILS))
2353 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2354
2355 return def_stmt;
2356 }
2357
2358 if (vect_print_dump_info (REPORT_DETAILS))
2359 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2360
2361 return NULL;
2362 }
2363
2364 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2365 in-place. Arguments as there. */
2366
2367 static gimple
2368 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2369 bool check_reduction, bool *double_reduc)
2370 {
2371 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2372 double_reduc, false);
2373 }
2374
2375 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2376 in-place if it enables detection of more reductions. Arguments
2377 as there. */
2378
2379 gimple
2380 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2381 bool check_reduction, bool *double_reduc)
2382 {
2383 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2384 double_reduc, true);
2385 }
2386
2387 /* Calculate the cost of one scalar iteration of the loop. */
2388 int
2389 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2390 {
2391 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2392 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2393 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2394 int innerloop_iters, i, stmt_cost;
2395
2396 /* Count statements in scalar loop. Using this as scalar cost for a single
2397 iteration for now.
2398
2399 TODO: Add outer loop support.
2400
2401 TODO: Consider assigning different costs to different scalar
2402 statements. */
2403
2404 /* FORNOW. */
2405 innerloop_iters = 1;
2406 if (loop->inner)
2407 innerloop_iters = 50; /* FIXME */
2408
2409 for (i = 0; i < nbbs; i++)
2410 {
2411 gimple_stmt_iterator si;
2412 basic_block bb = bbs[i];
2413
2414 if (bb->loop_father == loop->inner)
2415 factor = innerloop_iters;
2416 else
2417 factor = 1;
2418
2419 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2420 {
2421 gimple stmt = gsi_stmt (si);
2422 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2423
2424 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2425 continue;
2426
2427 /* Skip stmts that are not vectorized inside the loop. */
2428 if (stmt_info
2429 && !STMT_VINFO_RELEVANT_P (stmt_info)
2430 && (!STMT_VINFO_LIVE_P (stmt_info)
2431 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2432 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2433 continue;
2434
2435 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2436 {
2437 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2438 stmt_cost = vect_get_cost (scalar_load);
2439 else
2440 stmt_cost = vect_get_cost (scalar_store);
2441 }
2442 else
2443 stmt_cost = vect_get_cost (scalar_stmt);
2444
2445 scalar_single_iter_cost += stmt_cost * factor;
2446 }
2447 }
2448 return scalar_single_iter_cost;
2449 }
2450
2451 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
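/* For illustration (made-up numbers): with a known iteration count of 20,
   VF = 4 and 3 prologue iterations, the epilogue gets (20 - 3) % 4 = 1
   iteration, so the returned cost is (3 + 1) * SCALAR_SINGLE_ITER_COST;
   no guard branches are counted when the iteration count is known.  */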
2452 int
2453 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2454 int *peel_iters_epilogue,
2455 int scalar_single_iter_cost)
2456 {
2457 int peel_guard_costs = 0;
2458 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2459
2460 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2461 {
2462 *peel_iters_epilogue = vf/2;
2463 if (vect_print_dump_info (REPORT_COST))
2464 fprintf (vect_dump, "cost model: "
2465 "epilogue peel iters set to vf/2 because "
2466 "loop iterations are unknown .");
2467
2468 /* If peeled iterations are known but the number of scalar loop
2469 iterations is unknown, count a taken branch per peeled loop. */
2470 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2471 }
2472 else
2473 {
2474 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2475 peel_iters_prologue = niters < peel_iters_prologue ?
2476 niters : peel_iters_prologue;
2477 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2478 /* If we need to peel for gaps, but no peeling is required, we have to
2479 peel VF iterations. */
2480 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2481 *peel_iters_epilogue = vf;
2482 }
2483
2484 return (peel_iters_prologue * scalar_single_iter_cost)
2485 + (*peel_iters_epilogue * scalar_single_iter_cost)
2486 + peel_guard_costs;
2487 }
2488
2489 /* Function vect_estimate_min_profitable_iters
2490
2491 Return the number of iterations required for the vector version of the
2492 loop to be profitable relative to the cost of the scalar version of the
2493 loop.
2494
2495 TODO: Take profile info into account before making vectorization
2496 decisions, if available. */
2497
2498 int
2499 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2500 {
2501 int i;
2502 int min_profitable_iters;
2503 int peel_iters_prologue;
2504 int peel_iters_epilogue;
2505 int vec_inside_cost = 0;
2506 int vec_outside_cost = 0;
2507 int scalar_single_iter_cost = 0;
2508 int scalar_outside_cost = 0;
2509 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2510 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2511 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2512 int nbbs = loop->num_nodes;
2513 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2514 int peel_guard_costs = 0;
2515 int innerloop_iters = 0, factor;
2516 VEC (slp_instance, heap) *slp_instances;
2517 slp_instance instance;
2518
2519 /* Cost model disabled. */
2520 if (!flag_vect_cost_model)
2521 {
2522 if (vect_print_dump_info (REPORT_COST))
2523 fprintf (vect_dump, "cost model disabled.");
2524 return 0;
2525 }
2526
2527 /* Requires loop versioning tests to handle misalignment. */
2528 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2529 {
2530 /* FIXME: Make cost depend on complexity of individual check. */
2531 vec_outside_cost +=
2532 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2533 if (vect_print_dump_info (REPORT_COST))
2534 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2535 "versioning to treat misalignment.\n");
2536 }
2537
2538 /* Requires loop versioning with alias checks. */
2539 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2540 {
2541 /* FIXME: Make cost depend on complexity of individual check. */
2542 vec_outside_cost +=
2543 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2544 if (vect_print_dump_info (REPORT_COST))
2545 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2546 "versioning aliasing.\n");
2547 }
2548
2549 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2550 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2551 vec_outside_cost += vect_get_cost (cond_branch_taken);
2552
2553 /* Count statements in scalar loop. Using this as scalar cost for a single
2554 iteration for now.
2555
2556 TODO: Add outer loop support.
2557
2558 TODO: Consider assigning different costs to different scalar
2559 statements. */
2560
2561 /* FORNOW. */
2562 if (loop->inner)
2563 innerloop_iters = 50; /* FIXME */
2564
2565 for (i = 0; i < nbbs; i++)
2566 {
2567 gimple_stmt_iterator si;
2568 basic_block bb = bbs[i];
2569
2570 if (bb->loop_father == loop->inner)
2571 factor = innerloop_iters;
2572 else
2573 factor = 1;
2574
2575 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2576 {
2577 gimple stmt = gsi_stmt (si);
2578 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2579
2580 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2581 {
2582 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2583 stmt_info = vinfo_for_stmt (stmt);
2584 }
2585
2586 /* Skip stmts that are not vectorized inside the loop. */
2587 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2588 && (!STMT_VINFO_LIVE_P (stmt_info)
2589 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
2590 continue;
2591
2592 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2593 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2594 some of the "outside" costs are generated inside the outer-loop. */
2595 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2596 if (is_pattern_stmt_p (stmt_info)
2597 && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info))
2598 {
2599 gimple_stmt_iterator gsi;
2600
2601 for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2602 !gsi_end_p (gsi); gsi_next (&gsi))
2603 {
2604 gimple pattern_def_stmt = gsi_stmt (gsi);
2605 stmt_vec_info pattern_def_stmt_info
2606 = vinfo_for_stmt (pattern_def_stmt);
2607 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
2608 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
2609 {
2610 vec_inside_cost
2611 += STMT_VINFO_INSIDE_OF_LOOP_COST
2612 (pattern_def_stmt_info) * factor;
2613 vec_outside_cost
2614 += STMT_VINFO_OUTSIDE_OF_LOOP_COST
2615 (pattern_def_stmt_info);
2616 }
2617 }
2618 }
2619 }
2620 }
2621
2622 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2623
2624 /* Add additional cost for the peeled instructions in prologue and epilogue
2625 loop.
2626
2627 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2628 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2629
2630 TODO: Build an expression that represents peel_iters for prologue and
2631 epilogue to be used in a run-time test. */
2632
2633 if (npeel < 0)
2634 {
2635 peel_iters_prologue = vf/2;
2636 if (vect_print_dump_info (REPORT_COST))
2637 fprintf (vect_dump, "cost model: "
2638 "prologue peel iters set to vf/2.");
2639
2640 /* If peeling for alignment is unknown, loop bound of main loop becomes
2641 unknown. */
2642 peel_iters_epilogue = vf/2;
2643 if (vect_print_dump_info (REPORT_COST))
2644 fprintf (vect_dump, "cost model: "
2645 "epilogue peel iters set to vf/2 because "
2646 "peeling for alignment is unknown .");
2647
2648 /* If peeled iterations are unknown, count a taken branch and a not taken
2649 branch per peeled loop. Even if scalar loop iterations are known,
2650 vector iterations are not known since peeled prologue iterations are
2651 not known. Hence guards remain the same. */
2652 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2653 + vect_get_cost (cond_branch_not_taken));
2654 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2655 + (peel_iters_epilogue * scalar_single_iter_cost)
2656 + peel_guard_costs;
2657 }
2658 else
2659 {
2660 peel_iters_prologue = npeel;
2661 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2662 peel_iters_prologue, &peel_iters_epilogue,
2663 scalar_single_iter_cost);
2664 }
2665
2666 /* FORNOW: The scalar outside cost is incremented in one of the
2667 following ways:
2668
2669 1. The vectorizer checks for alignment and aliasing and generates
2670 a condition that allows dynamic vectorization. A cost model
2671 check is ANDED with the versioning condition. Hence scalar code
2672 path now has the added cost of the versioning check.
2673
2674 if (cost > th & versioning_check)
2675 jmp to vector code
2676
2677 Hence run-time scalar is incremented by not-taken branch cost.
2678
2679 2. The vectorizer then checks if a prologue is required. If the
2680 cost model check was not done before during versioning, it has to
2681 be done before the prologue check.
2682
2683 if (cost <= th)
2684 prologue = scalar_iters
2685 if (prologue == 0)
2686 jmp to vector code
2687 else
2688 execute prologue
2689 if (prologue == num_iters)
2690 go to exit
2691
2692 Hence the run-time scalar cost is incremented by a taken branch,
2693 plus a not-taken branch, plus a taken branch cost.
2694
2695 3. The vectorizer then checks if an epilogue is required. If the
2696 cost model check was not done before during prologue check, it
2697 has to be done with the epilogue check.
2698
2699 if (prologue == 0)
2700 jmp to vector code
2701 else
2702 execute prologue
2703 if (prologue == num_iters)
2704 go to exit
2705 vector code:
2706 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2707 jmp to epilogue
2708
2709 Hence the run-time scalar cost should be incremented by 2 taken
2710 branches.
2711
2712 TODO: The back end may reorder the BBs differently and reverse
2713 conditions/branch directions. Change the estimates below to
2714 something more reasonable. */
2715
2716 /* If the number of iterations is known and we do not do versioning, we can
2717 decide whether to vectorize at compile time. Hence the scalar version
2718 does not carry cost model guard costs. */
2719 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2720 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2721 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2722 {
2723 /* Cost model check occurs at versioning. */
2724 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2725 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2726 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2727 else
2728 {
2729 /* Cost model check occurs at prologue generation. */
2730 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2731 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2732 + vect_get_cost (cond_branch_not_taken);
2733 /* Cost model check occurs at epilogue generation. */
2734 else
2735 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2736 }
2737 }
2738
2739 /* Add SLP costs. */
2740 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2741 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2742 {
2743 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2744 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2745 }
2746
2747 /* Calculate number of iterations required to make the vector version
2748 profitable, relative to the loop bodies only. The following condition
2749 must hold true:
2750 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2751 where
2752 SIC = scalar iteration cost, VIC = vector iteration cost,
2753 VOC = vector outside cost, VF = vectorization factor,
2754 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2755 SOC = scalar outside cost for run time cost model check. */
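/* Rearranging the condition above (an illustrative derivation, valid when
   SIC * VF > VIC, which is checked below):

     niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   which is what the computation below evaluates, with the follow-up check
   compensating for integer truncation.  */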
2756
2757 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2758 {
2759 if (vec_outside_cost <= 0)
2760 min_profitable_iters = 1;
2761 else
2762 {
2763 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2764 - vec_inside_cost * peel_iters_prologue
2765 - vec_inside_cost * peel_iters_epilogue)
2766 / ((scalar_single_iter_cost * vf)
2767 - vec_inside_cost);
2768
2769 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2770 <= ((vec_inside_cost * min_profitable_iters)
2771 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2772 min_profitable_iters++;
2773 }
2774 }
2775 /* vector version will never be profitable. */
2776 else
2777 {
2778 if (vect_print_dump_info (REPORT_COST))
2779 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2780 "divided by the scalar iteration cost = %d "
2781 "is greater or equal to the vectorization factor = %d.",
2782 vec_inside_cost, scalar_single_iter_cost, vf);
2783 return -1;
2784 }
2785
2786 if (vect_print_dump_info (REPORT_COST))
2787 {
2788 fprintf (vect_dump, "Cost model analysis: \n");
2789 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2790 vec_inside_cost);
2791 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2792 vec_outside_cost);
2793 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2794 scalar_single_iter_cost);
2795 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2796 fprintf (vect_dump, " prologue iterations: %d\n",
2797 peel_iters_prologue);
2798 fprintf (vect_dump, " epilogue iterations: %d\n",
2799 peel_iters_epilogue);
2800 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2801 min_profitable_iters);
2802 }
2803
2804 min_profitable_iters =
2805 min_profitable_iters < vf ? vf : min_profitable_iters;
2806
2807 /* Because the condition we create is:
2808 if (niters <= min_profitable_iters)
2809 then skip the vectorized loop. */
2810 min_profitable_iters--;
2811
2812 if (vect_print_dump_info (REPORT_COST))
2813 fprintf (vect_dump, " Profitability threshold = %d\n",
2814 min_profitable_iters);
2815
2816 return min_profitable_iters;
2817 }
2818
2819
2820 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2821 functions. Design better to avoid maintenance issues. */
2822
2823 /* Function vect_model_reduction_cost.
2824
2825 Models cost for a reduction operation, including the vector ops
2826 generated within the strip-mine loop, the initial definition before
2827 the loop, and the epilogue code that must be generated. */
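/* As an illustrative data point for the epilogue cost below: reducing a
   V4SI accumulator when the target has no matching REDUC_* operation
   costs 2 * log2(4) = 4 vector statements plus one vector-to-scalar
   extract if whole-vector shifts are available, and 4 + 3 = 7 vector
   statements if individual element extracts must be used instead.  */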
2828
2829 static bool
2830 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2831 int ncopies)
2832 {
2833 int outer_cost = 0;
2834 enum tree_code code;
2835 optab optab;
2836 tree vectype;
2837 gimple stmt, orig_stmt;
2838 tree reduction_op;
2839 enum machine_mode mode;
2840 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2841 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2842
2843
2844 /* Cost of reduction op inside loop. */
2845 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2846 += ncopies * vect_get_cost (vector_stmt);
2847
2848 stmt = STMT_VINFO_STMT (stmt_info);
2849
2850 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2851 {
2852 case GIMPLE_SINGLE_RHS:
2853 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2854 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2855 break;
2856 case GIMPLE_UNARY_RHS:
2857 reduction_op = gimple_assign_rhs1 (stmt);
2858 break;
2859 case GIMPLE_BINARY_RHS:
2860 reduction_op = gimple_assign_rhs2 (stmt);
2861 break;
2862 case GIMPLE_TERNARY_RHS:
2863 reduction_op = gimple_assign_rhs3 (stmt);
2864 break;
2865 default:
2866 gcc_unreachable ();
2867 }
2868
2869 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2870 if (!vectype)
2871 {
2872 if (vect_print_dump_info (REPORT_COST))
2873 {
2874 fprintf (vect_dump, "unsupported data-type ");
2875 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2876 }
2877 return false;
2878 }
2879
2880 mode = TYPE_MODE (vectype);
2881 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2882
2883 if (!orig_stmt)
2884 orig_stmt = STMT_VINFO_STMT (stmt_info);
2885
2886 code = gimple_assign_rhs_code (orig_stmt);
2887
2888 /* Add in cost for initial definition. */
2889 outer_cost += vect_get_cost (scalar_to_vec);
2890
2891 /* Determine cost of epilogue code.
2892
2893 We have a reduction operator that will reduce the vector in one statement.
2894 Also requires scalar extract. */
2895
2896 if (!nested_in_vect_loop_p (loop, orig_stmt))
2897 {
2898 if (reduc_code != ERROR_MARK)
2899 outer_cost += vect_get_cost (vector_stmt)
2900 + vect_get_cost (vec_to_scalar);
2901 else
2902 {
2903 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2904 tree bitsize =
2905 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2906 int element_bitsize = tree_low_cst (bitsize, 1);
2907 int nelements = vec_size_in_bits / element_bitsize;
2908
2909 optab = optab_for_tree_code (code, vectype, optab_default);
2910
2911 /* We have a whole vector shift available. */
2912 if (VECTOR_MODE_P (mode)
2913 && optab_handler (optab, mode) != CODE_FOR_nothing
2914 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2915 /* Final reduction via vector shifts and the reduction operator. Also
2916 requires scalar extract. */
2917 outer_cost += ((exact_log2(nelements) * 2)
2918 * vect_get_cost (vector_stmt)
2919 + vect_get_cost (vec_to_scalar));
2920 else
2921 /* Use extracts and reduction op for final reduction. For N elements,
2922 we have N extracts and N-1 reduction ops. */
2923 outer_cost += ((nelements + nelements - 1)
2924 * vect_get_cost (vector_stmt));
2925 }
2926 }
2927
2928 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2929
2930 if (vect_print_dump_info (REPORT_COST))
2931 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2932 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2933 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2934
2935 return true;
2936 }
2937
2938
2939 /* Function vect_model_induction_cost.
2940
2941 Models cost for induction operations. */
2942
2943 static void
2944 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2945 {
2946 /* loop cost for vec_loop. */
2947 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2948 = ncopies * vect_get_cost (vector_stmt);
2949 /* prologue cost for vec_init and vec_step. */
2950 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2951 = 2 * vect_get_cost (scalar_to_vec);
2952
2953 if (vect_print_dump_info (REPORT_COST))
2954 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2955 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2956 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2957 }
2958
2959
2960 /* Function get_initial_def_for_induction
2961
2962 Input:
2963 STMT - a stmt that performs an induction operation in the loop.
2964 IV_PHI - the initial value of the induction variable
2965
2966 Output:
2967 Return a vector variable, initialized with the first VF values of
2968 the induction variable. E.g., for an iv with IV_PHI='X' and
2969 evolution S, for a vector of 4 units, we want to return:
2970 [X, X + S, X + 2*S, X + 3*S]. */
2971
2972 static tree
2973 get_initial_def_for_induction (gimple iv_phi)
2974 {
2975 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2976 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2978 tree scalar_type;
2979 tree vectype;
2980 int nunits;
2981 edge pe = loop_preheader_edge (loop);
2982 struct loop *iv_loop;
2983 basic_block new_bb;
2984 tree vec, vec_init, vec_step, t;
2985 tree access_fn;
2986 tree new_var;
2987 tree new_name;
2988 gimple init_stmt, induction_phi, new_stmt;
2989 tree induc_def, vec_def, vec_dest;
2990 tree init_expr, step_expr;
2991 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2992 int i;
2993 bool ok;
2994 int ncopies;
2995 tree expr;
2996 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2997 bool nested_in_vect_loop = false;
2998 gimple_seq stmts = NULL;
2999 imm_use_iterator imm_iter;
3000 use_operand_p use_p;
3001 gimple exit_phi;
3002 edge latch_e;
3003 tree loop_arg;
3004 gimple_stmt_iterator si;
3005 basic_block bb = gimple_bb (iv_phi);
3006 tree stepvectype;
3007 tree resvectype;
3008
3009 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3010 if (nested_in_vect_loop_p (loop, iv_phi))
3011 {
3012 nested_in_vect_loop = true;
3013 iv_loop = loop->inner;
3014 }
3015 else
3016 iv_loop = loop;
3017 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3018
3019 latch_e = loop_latch_edge (iv_loop);
3020 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3021
3022 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3023 gcc_assert (access_fn);
3024 STRIP_NOPS (access_fn);
3025 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3026 &init_expr, &step_expr);
3027 gcc_assert (ok);
3028 pe = loop_preheader_edge (iv_loop);
3029
3030 scalar_type = TREE_TYPE (init_expr);
3031 vectype = get_vectype_for_scalar_type (scalar_type);
3032 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3033 gcc_assert (vectype);
3034 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3035 ncopies = vf / nunits;
3036
3037 gcc_assert (phi_info);
3038 gcc_assert (ncopies >= 1);
3039
3040 /* Find the first insertion point in the BB. */
3041 si = gsi_after_labels (bb);
3042
3043 /* Create the vector that holds the initial_value of the induction. */
3044 if (nested_in_vect_loop)
3045 {
3046 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3047 been created during vectorization of previous stmts. We obtain it
3048 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3049 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3050 loop_preheader_edge (iv_loop));
3051 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3052 }
3053 else
3054 {
3055 VEC(constructor_elt,gc) *v;
3056
3057 /* iv_loop is the loop to be vectorized. Create:
3058 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3059 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3060 add_referenced_var (new_var);
3061
3062 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3063 if (stmts)
3064 {
3065 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3066 gcc_assert (!new_bb);
3067 }
3068
3069 v = VEC_alloc (constructor_elt, gc, nunits);
3070 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3071 for (i = 1; i < nunits; i++)
3072 {
3073 /* Create: new_name_i = new_name + step_expr */
3074 enum tree_code code = POINTER_TYPE_P (scalar_type)
3075 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3076 init_stmt = gimple_build_assign_with_ops (code, new_var,
3077 new_name, step_expr);
3078 new_name = make_ssa_name (new_var, init_stmt);
3079 gimple_assign_set_lhs (init_stmt, new_name);
3080
3081 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3082 gcc_assert (!new_bb);
3083
3084 if (vect_print_dump_info (REPORT_DETAILS))
3085 {
3086 fprintf (vect_dump, "created new init_stmt: ");
3087 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
3088 }
3089 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3090 }
3091 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3092 vec = build_constructor (vectype, v);
3093 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3094 }
3095
3096
3097 /* Create the vector that holds the step of the induction. */
3098 if (nested_in_vect_loop)
3099 /* iv_loop is nested in the loop to be vectorized. Generate:
3100 vec_step = [S, S, S, S] */
3101 new_name = step_expr;
3102 else
3103 {
3104 /* iv_loop is the loop to be vectorized. Generate:
3105 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3106 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3107 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3108 expr, step_expr);
3109 }
3110
3111 t = unshare_expr (new_name);
3112 gcc_assert (CONSTANT_CLASS_P (new_name));
3113 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3114 gcc_assert (stepvectype);
3115 vec = build_vector_from_val (stepvectype, t);
3116 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3117
3118
3119 /* Create the following def-use cycle:
3120 loop prolog:
3121 vec_init = ...
3122 vec_step = ...
3123 loop:
3124 vec_iv = PHI <vec_init, vec_loop>
3125 ...
3126 STMT
3127 ...
3128 vec_loop = vec_iv + vec_step; */
3129
3130 /* Create the induction-phi that defines the induction-operand. */
3131 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3132 add_referenced_var (vec_dest);
3133 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3134 set_vinfo_for_stmt (induction_phi,
3135 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3136 induc_def = PHI_RESULT (induction_phi);
3137
3138 /* Create the iv update inside the loop */
3139 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3140 induc_def, vec_step);
3141 vec_def = make_ssa_name (vec_dest, new_stmt);
3142 gimple_assign_set_lhs (new_stmt, vec_def);
3143 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3144 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3145 NULL));
3146
3147 /* Set the arguments of the phi node: */
3148 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3149 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3150 UNKNOWN_LOCATION);
3151
3152
3153 /* In case that vectorization factor (VF) is bigger than the number
3154 of elements that we can fit in a vectype (nunits), we have to generate
3155 more than one vector stmt - i.e. - we need to "unroll" the
3156 vector stmt by a factor VF/nunits. For more details see documentation
3157 in vectorizable_operation. */
3158
3159 if (ncopies > 1)
3160 {
3161 stmt_vec_info prev_stmt_vinfo;
3162 /* FORNOW. This restriction should be relaxed. */
3163 gcc_assert (!nested_in_vect_loop);
3164
3165 /* Create the vector that holds the step of the induction. */
3166 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3167 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3168 expr, step_expr);
3169 t = unshare_expr (new_name);
3170 gcc_assert (CONSTANT_CLASS_P (new_name));
3171 vec = build_vector_from_val (stepvectype, t);
3172 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3173
3174 vec_def = induc_def;
3175 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3176 for (i = 1; i < ncopies; i++)
3177 {
3178 /* vec_i = vec_prev + vec_step */
3179 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3180 vec_def, vec_step);
3181 vec_def = make_ssa_name (vec_dest, new_stmt);
3182 gimple_assign_set_lhs (new_stmt, vec_def);
3183
3184 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3185 if (!useless_type_conversion_p (resvectype, vectype))
3186 {
3187 new_stmt = gimple_build_assign_with_ops
3188 (VIEW_CONVERT_EXPR,
3189 vect_get_new_vect_var (resvectype, vect_simple_var,
3190 "vec_iv_"),
3191 build1 (VIEW_CONVERT_EXPR, resvectype,
3192 gimple_assign_lhs (new_stmt)), NULL_TREE);
3193 gimple_assign_set_lhs (new_stmt,
3194 make_ssa_name
3195 (gimple_assign_lhs (new_stmt), new_stmt));
3196 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3197 }
3198 set_vinfo_for_stmt (new_stmt,
3199 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3200 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3201 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3202 }
3203 }
3204
3205 if (nested_in_vect_loop)
3206 {
3207 /* Find the loop-closed exit-phi of the induction, and record
3208 the final vector of induction results: */
3209 exit_phi = NULL;
3210 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3211 {
3212 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3213 {
3214 exit_phi = USE_STMT (use_p);
3215 break;
3216 }
3217 }
3218 if (exit_phi)
3219 {
3220 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3221 /* FORNOW. Currently not supporting the case that an inner-loop induction
3222 is not used in the outer-loop (i.e. only outside the outer-loop). */
3223 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3224 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3225
3226 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3227 if (vect_print_dump_info (REPORT_DETAILS))
3228 {
3229 fprintf (vect_dump, "vector of inductions after inner-loop:");
3230 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3231 }
3232 }
3233 }
3234
3235
3236 if (vect_print_dump_info (REPORT_DETAILS))
3237 {
3238 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3239 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3240 fprintf (vect_dump, "\n");
3241 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3242 }
3243
3244 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3245 if (!useless_type_conversion_p (resvectype, vectype))
3246 {
3247 new_stmt = gimple_build_assign_with_ops
3248 (VIEW_CONVERT_EXPR,
3249 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3250 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3251 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3252 gimple_assign_set_lhs (new_stmt, induc_def);
3253 si = gsi_start_bb (bb);
3254 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3255 set_vinfo_for_stmt (new_stmt,
3256 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3257 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3258 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3259 }
3260
3261 return induc_def;
3262 }
3263
3264
3265 /* Function get_initial_def_for_reduction
3266
3267 Input:
3268 STMT - a stmt that performs a reduction operation in the loop.
3269 INIT_VAL - the initial value of the reduction variable
3270
3271 Output:
3272 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3273 of the reduction (used for adjusting the epilog - see below).
3274 Return a vector variable, initialized according to the operation that STMT
3275 performs. This vector will be used as the initial value of the
3276 vector of partial results.
3277
3278 Option1 (adjust in epilog): Initialize the vector as follows:
3279 add/bit or/xor: [0,0,...,0,0]
3280 mult/bit and: [1,1,...,1,1]
3281 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3282 and when necessary (e.g. add/mult case) let the caller know
3283 that it needs to adjust the result by init_val.
3284
3285 Option2: Initialize the vector as follows:
3286 add/bit or/xor: [init_val,0,0,...,0]
3287 mult/bit and: [init_val,1,1,...,1]
3288 min/max/cond_expr: [init_val,init_val,...,init_val]
3289 and no adjustments are needed.
3290
3291 For example, for the following code:
3292
3293 s = init_val;
3294 for (i=0;i<n;i++)
3295 s = s + a[i];
3296
3297 STMT is 's = s + a[i]', and the reduction variable is 's'.
3298 For a vector of 4 units, we want to return either [0,0,0,init_val],
3299 or [0,0,0,0] and let the caller know that it needs to adjust
3300 the result at the end by 'init_val'.
3301
3302 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3303 is not NULL, because this way the initialization vector is simpler (same
3304 element in all entries), and Option2 otherwise.
3305
3306 A cost model should help decide between these two schemes. */
3307
3308 tree
3309 get_initial_def_for_reduction (gimple stmt, tree init_val,
3310 tree *adjustment_def)
3311 {
3312 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3313 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3314 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3315 tree scalar_type = TREE_TYPE (init_val);
3316 tree vectype = get_vectype_for_scalar_type (scalar_type);
3317 int nunits;
3318 enum tree_code code = gimple_assign_rhs_code (stmt);
3319 tree def_for_init;
3320 tree init_def;
3321 tree *elts;
3322 int i;
3323 bool nested_in_vect_loop = false;
3324 tree init_value;
3325 REAL_VALUE_TYPE real_init_val = dconst0;
3326 int int_init_val = 0;
3327 gimple def_stmt = NULL;
3328
3329 gcc_assert (vectype);
3330 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3331
3332 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3333 || SCALAR_FLOAT_TYPE_P (scalar_type));
3334
3335 if (nested_in_vect_loop_p (loop, stmt))
3336 nested_in_vect_loop = true;
3337 else
3338 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3339
3340 /* In case of double reduction we only create a vector variable to be put
3341 in the reduction phi node. The actual statement creation is done in
3342 vect_create_epilog_for_reduction. */
3343 if (adjustment_def && nested_in_vect_loop
3344 && TREE_CODE (init_val) == SSA_NAME
3345 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3346 && gimple_code (def_stmt) == GIMPLE_PHI
3347 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3348 && vinfo_for_stmt (def_stmt)
3349 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3350 == vect_double_reduction_def)
3351 {
3352 *adjustment_def = NULL;
3353 return vect_create_destination_var (init_val, vectype);
3354 }
3355
3356 if (TREE_CONSTANT (init_val))
3357 {
3358 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3359 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3360 else
3361 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3362 }
3363 else
3364 init_value = init_val;
3365
3366 switch (code)
3367 {
3368 case WIDEN_SUM_EXPR:
3369 case DOT_PROD_EXPR:
3370 case PLUS_EXPR:
3371 case MINUS_EXPR:
3372 case BIT_IOR_EXPR:
3373 case BIT_XOR_EXPR:
3374 case MULT_EXPR:
3375 case BIT_AND_EXPR:
3376 /* ADJUSTMENT_DEF is NULL when called from
3377 vect_create_epilog_for_reduction to vectorize double reduction. */
3378 if (adjustment_def)
3379 {
3380 if (nested_in_vect_loop)
3381 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3382 NULL);
3383 else
3384 *adjustment_def = init_val;
3385 }
3386
3387 if (code == MULT_EXPR)
3388 {
3389 real_init_val = dconst1;
3390 int_init_val = 1;
3391 }
3392
3393 if (code == BIT_AND_EXPR)
3394 int_init_val = -1;
3395
3396 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3397 def_for_init = build_real (scalar_type, real_init_val);
3398 else
3399 def_for_init = build_int_cst (scalar_type, int_init_val);
3400
3401 /* Create a vector of '0' or '1' except the first element. */
3402 elts = XALLOCAVEC (tree, nunits);
3403 for (i = nunits - 2; i >= 0; --i)
3404 elts[i + 1] = def_for_init;
3405
3406 /* Option1: the first element is '0' or '1' as well. */
3407 if (adjustment_def)
3408 {
3409 elts[0] = def_for_init;
3410 init_def = build_vector (vectype, elts);
3411 break;
3412 }
3413
3414 /* Option2: the first element is INIT_VAL. */
3415 elts[0] = init_val;
3416 if (TREE_CONSTANT (init_val))
3417 init_def = build_vector (vectype, elts);
3418 else
3419 {
3420 VEC(constructor_elt,gc) *v;
3421 v = VEC_alloc (constructor_elt, gc, nunits);
3422 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3423 for (i = 1; i < nunits; ++i)
3424 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3425 init_def = build_constructor (vectype, v);
3426 }
3427
3428 break;
3429
3430 case MIN_EXPR:
3431 case MAX_EXPR:
3432 case COND_EXPR:
3433 if (adjustment_def)
3434 {
3435 *adjustment_def = NULL_TREE;
3436 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3437 break;
3438 }
3439
3440 init_def = build_vector_from_val (vectype, init_value);
3441 break;
3442
3443 default:
3444 gcc_unreachable ();
3445 }
3446
3447 return init_def;
3448 }
3449
3450
3451 /* Function vect_create_epilog_for_reduction
3452
3453 Create code at the loop-epilog to finalize the result of a reduction
3454 computation.
3455
3456 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3457 reduction statements.
3458 STMT is the scalar reduction stmt that is being vectorized.
3459 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3460 number of elements that we can fit in a vectype (nunits). In this case
3461 we have to generate more than one vector stmt - i.e. - we need to "unroll"
3462 the vector stmt by a factor VF/nunits. For more details see documentation
3463 in vectorizable_operation.
3464 REDUC_CODE is the tree-code for the epilog reduction.
3465 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3466 computation.
3467 REDUC_INDEX is the index of the operand in the right hand side of the
3468 statement that is defined by REDUCTION_PHI.
3469 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3470 SLP_NODE is an SLP node containing a group of reduction statements. The
3471 first one in this group is STMT.
3472
3473 This function:
3474 1. Creates the reduction def-use cycles: sets the arguments for
3475 REDUCTION_PHIS:
3476 The loop-entry argument is the vectorized initial-value of the reduction.
3477 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3478 sums.
3479 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3480 by applying the operation specified by REDUC_CODE if available, or by
3481 other means (whole-vector shifts or a scalar loop).
3482 The function also creates a new phi node at the loop exit to preserve
3483 loop-closed form, as illustrated below.
3484
3485 The flow at the entry to this function:
3486
3487 loop:
3488 vec_def = phi <null, null> # REDUCTION_PHI
3489 VECT_DEF = vector_stmt # vectorized form of STMT
3490 s_loop = scalar_stmt # (scalar) STMT
3491 loop_exit:
3492 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3493 use <s_out0>
3494 use <s_out0>
3495
3496 The above is transformed by this function into:
3497
3498 loop:
3499 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3500 VECT_DEF = vector_stmt # vectorized form of STMT
3501 s_loop = scalar_stmt # (scalar) STMT
3502 loop_exit:
3503 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3504 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3505 v_out2 = reduce <v_out1>
3506 s_out3 = extract_field <v_out2, 0>
3507 s_out4 = adjust_result <s_out3>
3508 use <s_out4>
3509 use <s_out4>
3510 */
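/* For example (an illustrative source-level view, not taken from the code
   below): a reduction such as

     int sum = init;
     for (i = 0; i < N; i++)
       sum += a[i];

   is vectorized using a vector accumulator of partial sums, and this
   function emits the epilog that combines the lanes of that accumulator
   into a single scalar result (and, when needed, adds back INIT).  */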
3511
3512 static void
3513 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3514 int ncopies, enum tree_code reduc_code,
3515 VEC (gimple, heap) *reduction_phis,
3516 int reduc_index, bool double_reduc,
3517 slp_tree slp_node)
3518 {
3519 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3520 stmt_vec_info prev_phi_info;
3521 tree vectype;
3522 enum machine_mode mode;
3523 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3524 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3525 basic_block exit_bb;
3526 tree scalar_dest;
3527 tree scalar_type;
3528 gimple new_phi = NULL, phi;
3529 gimple_stmt_iterator exit_gsi;
3530 tree vec_dest;
3531 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3532 gimple epilog_stmt = NULL;
3533 enum tree_code code = gimple_assign_rhs_code (stmt);
3534 gimple exit_phi;
3535 tree bitsize, bitpos;
3536 tree adjustment_def = NULL;
3537 tree vec_initial_def = NULL;
3538 tree reduction_op, expr, def;
3539 tree orig_name, scalar_result;
3540 imm_use_iterator imm_iter, phi_imm_iter;
3541 use_operand_p use_p, phi_use_p;
3542 bool extract_scalar_result = false;
3543 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3544 bool nested_in_vect_loop = false;
3545 VEC (gimple, heap) *new_phis = NULL;
3546 VEC (gimple, heap) *inner_phis = NULL;
3547 enum vect_def_type dt = vect_unknown_def_type;
3548 int j, i;
3549 VEC (tree, heap) *scalar_results = NULL;
3550 unsigned int group_size = 1, k, ratio;
3551 VEC (tree, heap) *vec_initial_defs = NULL;
3552 VEC (gimple, heap) *phis;
3553 bool slp_reduc = false;
3554 tree new_phi_result;
3555 gimple inner_phi = NULL;
3556
3557 if (slp_node)
3558 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3559
3560 if (nested_in_vect_loop_p (loop, stmt))
3561 {
3562 outer_loop = loop;
3563 loop = loop->inner;
3564 nested_in_vect_loop = true;
3565 gcc_assert (!slp_node);
3566 }
3567
3568 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3569 {
3570 case GIMPLE_SINGLE_RHS:
3571 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3572 == ternary_op);
3573 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3574 break;
3575 case GIMPLE_UNARY_RHS:
3576 reduction_op = gimple_assign_rhs1 (stmt);
3577 break;
3578 case GIMPLE_BINARY_RHS:
3579 reduction_op = reduc_index ?
3580 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3581 break;
3582 case GIMPLE_TERNARY_RHS:
3583 reduction_op = gimple_op (stmt, reduc_index + 1);
3584 break;
3585 default:
3586 gcc_unreachable ();
3587 }
3588
3589 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3590 gcc_assert (vectype);
3591 mode = TYPE_MODE (vectype);
3592
3593 /* 1. Create the reduction def-use cycle:
3594 Set the arguments of REDUCTION_PHIS, i.e., transform
3595
3596 loop:
3597 vec_def = phi <null, null> # REDUCTION_PHI
3598 VECT_DEF = vector_stmt # vectorized form of STMT
3599 ...
3600
3601 into:
3602
3603 loop:
3604 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3605 VECT_DEF = vector_stmt # vectorized form of STMT
3606 ...
3607
3608 (in case of SLP, do it for all the phis). */
3609
3610 /* Get the loop-entry arguments. */
3611 if (slp_node)
3612 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3613 NULL, slp_node, reduc_index);
3614 else
3615 {
3616 vec_initial_defs = VEC_alloc (tree, heap, 1);
3617 /* For the case of reduction, vect_get_vec_def_for_operand returns
3618 the scalar def before the loop, that defines the initial value
3619 of the reduction variable. */
3620 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3621 &adjustment_def);
3622 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3623 }
3624
3625 /* Set phi nodes arguments. */
3626 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3627 {
3628 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3629 tree def = VEC_index (tree, vect_defs, i);
3630 for (j = 0; j < ncopies; j++)
3631 {
3632 /* Set the loop-entry arg of the reduction-phi. */
3633 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3634 UNKNOWN_LOCATION);
3635
3636 /* Set the loop-latch arg for the reduction-phi. */
3637 if (j > 0)
3638 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3639
3640 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3641
3642 if (vect_print_dump_info (REPORT_DETAILS))
3643 {
3644 fprintf (vect_dump, "transform reduction: created def-use"
3645 " cycle: ");
3646 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3647 fprintf (vect_dump, "\n");
3648 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3649 TDF_SLIM);
3650 }
3651
3652 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3653 }
3654 }
3655
3656 VEC_free (tree, heap, vec_initial_defs);
3657
3658 /* 2. Create epilog code.
3659 The reduction epilog code operates across the elements of the vector
3660 of partial results computed by the vectorized loop.
3661 The reduction epilog code consists of:
3662
3663 step 1: compute the scalar result in a vector (v_out2)
3664 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3665 step 3: adjust the scalar result (s_out3) if needed.
3666
3667 Step 1 can be accomplished using one of the following three schemes:
3668 (scheme 1) using reduc_code, if available.
3669 (scheme 2) using whole-vector shifts, if available.
3670 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3671 combined.
3672
3673 The overall epilog code looks like this:
3674
3675 s_out0 = phi <s_loop> # original EXIT_PHI
3676 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3677 v_out2 = reduce <v_out1> # step 1
3678 s_out3 = extract_field <v_out2, 0> # step 2
3679 s_out4 = adjust_result <s_out3> # step 3
3680
3681 (step 3 is optional, and steps 1 and 2 may be combined).
3682 Lastly, the uses of s_out0 are replaced by s_out4. */
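/* A small worked example (illustrative values only): if the loop leaves
   v_out1 = {1, 2, 3, 4} and the reduction is a sum with initial value 5,
   then
     step 1:  v_out2 = reduce <{1,2,3,4}>     (lanes combined)
     step 2:  s_out3 = 10                     (the scalar element of v_out2)
     step 3:  s_out4 = 10 + 5 = 15            (adjust by the initial value)
   and every use of s_out0 is rewritten to use s_out4.  */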
3683
3684
3685 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3686 v_out1 = phi <VECT_DEF>
3687 Store them in NEW_PHIS. */
3688
3689 exit_bb = single_exit (loop)->dest;
3690 prev_phi_info = NULL;
3691 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3692 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3693 {
3694 for (j = 0; j < ncopies; j++)
3695 {
3696 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3697 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3698 if (j == 0)
3699 VEC_quick_push (gimple, new_phis, phi);
3700 else
3701 {
3702 def = vect_get_vec_def_for_stmt_copy (dt, def);
3703 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3704 }
3705
3706 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3707 prev_phi_info = vinfo_for_stmt (phi);
3708 }
3709 }
3710
3711 /* The epilogue is created for the outer-loop, i.e., for the loop being
3712 vectorized. Create exit phis for the outer loop. */
3713 if (double_reduc)
3714 {
3715 loop = outer_loop;
3716 exit_bb = single_exit (loop)->dest;
3717 inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3718 FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
3719 {
3720 gimple outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3721 exit_bb);
3722 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3723 PHI_RESULT (phi));
3724 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3725 loop_vinfo, NULL));
3726 VEC_quick_push (gimple, inner_phis, phi);
3727 VEC_replace (gimple, new_phis, i, outer_phi);
3728 prev_phi_info = vinfo_for_stmt (outer_phi);
3729 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3730 {
3731 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3732 outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3733 exit_bb);
3734 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3735 PHI_RESULT (phi));
3736 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3737 loop_vinfo, NULL));
3738 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3739 prev_phi_info = vinfo_for_stmt (outer_phi);
3740 }
3741 }
3742 }
3743
3744 exit_gsi = gsi_after_labels (exit_bb);
3745
3746 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3747 (i.e. when reduc_code is not available) and in the final adjustment
3748 code (if needed). Also get the original scalar reduction variable as
3749 defined in the loop. In case STMT is a "pattern-stmt" (i.e., it
3750 represents a reduction pattern), the tree-code and scalar-def are
3751 taken from the original stmt that the pattern-stmt (STMT) replaces.
3752 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3753 are taken from STMT. */
3754
3755 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3756 if (!orig_stmt)
3757 {
3758 /* Regular reduction */
3759 orig_stmt = stmt;
3760 }
3761 else
3762 {
3763 /* Reduction pattern */
3764 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3765 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3766 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3767 }
3768
3769 code = gimple_assign_rhs_code (orig_stmt);
3770 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3771 partial results are added and not subtracted. */
3772 if (code == MINUS_EXPR)
3773 code = PLUS_EXPR;
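/* For example (illustrative only), for

     s = init;
     for (i = 0; i < N; i++)
       s -= a[i];

   each vector lane accumulates a partial value of the form -a[i]-a[j]-...,
   starting from the initial vector [init,0,...,0]; combining the lanes in
   the epilog therefore has to ADD them, hence the switch to PLUS_EXPR.  */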
3774
3775 scalar_dest = gimple_assign_lhs (orig_stmt);
3776 scalar_type = TREE_TYPE (scalar_dest);
3777 scalar_results = VEC_alloc (tree, heap, group_size);
3778 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3779 bitsize = TYPE_SIZE (scalar_type);
3780
3781 /* In case this is a reduction in an inner-loop while vectorizing an outer
3782 loop - we don't need to extract a single scalar result at the end of the
3783 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
3784 outside the outer-loop). The final vector of partial results will be used
3785 in the vectorized outer-loop, or reduced to a scalar result at the end of
3786 the outer-loop. */
3787 if (nested_in_vect_loop && !double_reduc)
3788 goto vect_finalize_reduction;
3789
3790 /* SLP reduction without reduction chain, e.g.,
3791 # a1 = phi <a2, a0>
3792 # b1 = phi <b2, b0>
3793 a2 = operation (a1)
3794 b2 = operation (b1) */
3795 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
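/* Such an SLP reduction arises, for instance (illustrative source form),
   from two independent accumulators in the same loop:

     for (i = 0; i < N; i++)
       {
         suma += a[i];     <- a2 = operation (a1)
         sumb += b[i];     <- b2 = operation (b1)
       }
 */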
3796
3797 /* In case of reduction chain, e.g.,
3798 # a1 = phi <a3, a0>
3799 a2 = operation (a1)
3800 a3 = operation (a2),
3801
3802 we may end up with more than one vector result. Here we reduce them to
3803 one vector. */
3804 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3805 {
3806 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3807 tree tmp;
3808 gimple new_vec_stmt = NULL;
3809
3810 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3811 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3812 {
3813 gimple next_phi = VEC_index (gimple, new_phis, k);
3814 tree second_vect = PHI_RESULT (next_phi);
3815
3816 tmp = build2 (code, vectype, first_vect, second_vect);
3817 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3818 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3819 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3820 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3821 }
3822
3823 new_phi_result = first_vect;
3824 if (new_vec_stmt)
3825 {
3826 VEC_truncate (gimple, new_phis, 0);
3827 VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3828 }
3829 }
3830 else
3831 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3832
3833 /* 2.3 Create the reduction code, using one of the three schemes described
3834 above. In SLP we simply need to extract all the elements from the
3835 vector (without reducing them), so we use scalar shifts. */
3836 if (reduc_code != ERROR_MARK && !slp_reduc)
3837 {
3838 tree tmp;
3839
3840 /*** Case 1: Create:
3841 v_out2 = reduc_expr <v_out1> */
3842
3843 if (vect_print_dump_info (REPORT_DETAILS))
3844 fprintf (vect_dump, "Reduce using direct vector reduction.");
3845
3846 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3847 tmp = build1 (reduc_code, vectype, new_phi_result);
3848 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3849 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3850 gimple_assign_set_lhs (epilog_stmt, new_temp);
3851 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3852
3853 extract_scalar_result = true;
3854 }
3855 else
3856 {
3857 enum tree_code shift_code = ERROR_MARK;
3858 bool have_whole_vector_shift = true;
3859 int bit_offset;
3860 int element_bitsize = tree_low_cst (bitsize, 1);
3861 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3862 tree vec_temp;
3863
3864 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3865 shift_code = VEC_RSHIFT_EXPR;
3866 else
3867 have_whole_vector_shift = false;
3868
3869 /* Regardless of whether we have a whole vector shift, if we're
3870 emulating the operation via tree-vect-generic, we don't want
3871 to use it. Only the first round of the reduction is likely
3872 to still be profitable via emulation. */
3873 /* ??? It might be better to emit a reduction tree code here, so that
3874 tree-vect-generic can expand the first round via bit tricks. */
3875 if (!VECTOR_MODE_P (mode))
3876 have_whole_vector_shift = false;
3877 else
3878 {
3879 optab optab = optab_for_tree_code (code, vectype, optab_default);
3880 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3881 have_whole_vector_shift = false;
3882 }
3883
3884 if (have_whole_vector_shift && !slp_reduc)
3885 {
3886 /*** Case 2: Create:
3887 for (offset = VS/2; offset >= element_size; offset/=2)
3888 {
3889 Create: va' = vec_shift <va, offset>
3890 Create: va = vop <va, va'>
3891 } */
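/* Concretely (illustrative values only), for a 4-element vector of
   partial sums {1, 2, 3, 4}:

     shift by 2 elements:  {3, 4, x, x}  ->  vop:  {4, 6, x, x}
     shift by 1 element :  {6, x, x, x}  ->  vop:  {10, x, x, x}

   so after log2(4) = 2 rounds one element holds the full sum (10) and the
   remaining lanes (x) hold don't-care values; the extraction code below
   picks the correct element position for the target's endianness.  */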
3892
3893 if (vect_print_dump_info (REPORT_DETAILS))
3894 fprintf (vect_dump, "Reduce using vector shifts");
3895
3896 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3897 new_temp = new_phi_result;
3898 for (bit_offset = vec_size_in_bits/2;
3899 bit_offset >= element_bitsize;
3900 bit_offset /= 2)
3901 {
3902 tree bitpos = size_int (bit_offset);
3903
3904 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3905 vec_dest, new_temp, bitpos);
3906 new_name = make_ssa_name (vec_dest, epilog_stmt);
3907 gimple_assign_set_lhs (epilog_stmt, new_name);
3908 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3909
3910 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3911 new_name, new_temp);
3912 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3913 gimple_assign_set_lhs (epilog_stmt, new_temp);
3914 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3915 }
3916
3917 extract_scalar_result = true;
3918 }
3919 else
3920 {
3921 tree rhs;
3922
3923 /*** Case 3: Create:
3924 s = extract_field <v_out2, 0>
3925 for (offset = element_size;
3926 offset < vector_size;
3927 offset += element_size;)
3928 {
3929 Create: s' = extract_field <v_out2, offset>
3930 Create: s = op <s, s'> // For non-SLP cases
3931 } */
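/* Concretely (illustrative values only), for a 4-element vector of
   partial sums {1, 2, 3, 4} this emits

     s  = v[0];
     s' = v[1];  s = s + s';
     s' = v[2];  s = s + s';
     s' = v[3];  s = s + s';

   i.e. three scalar additions; in the SLP case the extracted values are
   kept separately in SCALAR_RESULTS instead of being combined.  */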
3932
3933 if (vect_print_dump_info (REPORT_DETAILS))
3934 fprintf (vect_dump, "Reduce using scalar code. ");
3935
3936 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3937 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3938 {
3939 if (gimple_code (new_phi) == GIMPLE_PHI)
3940 vec_temp = PHI_RESULT (new_phi);
3941 else
3942 vec_temp = gimple_assign_lhs (new_phi);
3943 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3944 bitsize_zero_node);
3945 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3946 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3947 gimple_assign_set_lhs (epilog_stmt, new_temp);
3948 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3949
3950 /* In SLP we don't need to apply reduction operation, so we just
3951 collect s' values in SCALAR_RESULTS. */
3952 if (slp_reduc)
3953 VEC_safe_push (tree, heap, scalar_results, new_temp);
3954
3955 for (bit_offset = element_bitsize;
3956 bit_offset < vec_size_in_bits;
3957 bit_offset += element_bitsize)
3958 {
3959 tree bitpos = bitsize_int (bit_offset);
3960 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3961 bitsize, bitpos);
3962
3963 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3964 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3965 gimple_assign_set_lhs (epilog_stmt, new_name);
3966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3967
3968 if (slp_reduc)
3969 {
3970 /* In SLP we don't need to apply reduction operation, so
3971 we just collect s' values in SCALAR_RESULTS. */
3972 new_temp = new_name;
3973 VEC_safe_push (tree, heap, scalar_results, new_name);
3974 }
3975 else
3976 {
3977 epilog_stmt = gimple_build_assign_with_ops (code,
3978 new_scalar_dest, new_name, new_temp);
3979 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3980 gimple_assign_set_lhs (epilog_stmt, new_temp);
3981 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3982 }
3983 }
3984 }
3985
3986 /* The only case where we need to reduce scalar results in SLP is
3987 unrolling. If the size of SCALAR_RESULTS is greater than
3988 GROUP_SIZE, we reduce them by combining elements modulo
3989 GROUP_SIZE. */
3990 if (slp_reduc)
3991 {
3992 tree res, first_res, new_res;
3993 gimple new_stmt;
3994
3995 /* Reduce multiple scalar results in case of SLP unrolling. */
3996 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3997 j++)
3998 {
3999 first_res = VEC_index (tree, scalar_results, j % group_size);
4000 new_stmt = gimple_build_assign_with_ops (code,
4001 new_scalar_dest, first_res, res);
4002 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4003 gimple_assign_set_lhs (new_stmt, new_res);
4004 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4005 VEC_replace (tree, scalar_results, j % group_size, new_res);
4006 }
4007 }
4008 else
4009 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4010 VEC_safe_push (tree, heap, scalar_results, new_temp);
4011
4012 extract_scalar_result = false;
4013 }
4014 }
4015
4016 /* 2.4 Extract the final scalar result. Create:
4017 s_out3 = extract_field <v_out2, bitpos> */
4018
4019 if (extract_scalar_result)
4020 {
4021 tree rhs;
4022
4023 if (vect_print_dump_info (REPORT_DETAILS))
4024 fprintf (vect_dump, "extract scalar result");
4025
4026 if (BYTES_BIG_ENDIAN)
4027 bitpos = size_binop (MULT_EXPR,
4028 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4029 TYPE_SIZE (scalar_type));
4030 else
4031 bitpos = bitsize_zero_node;
4032
4033 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4034 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4035 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4036 gimple_assign_set_lhs (epilog_stmt, new_temp);
4037 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4038 VEC_safe_push (tree, heap, scalar_results, new_temp);
4039 }
4040
4041 vect_finalize_reduction:
4042
4043 if (double_reduc)
4044 loop = loop->inner;
4045
4046 /* 2.5 Adjust the final result by the initial value of the reduction
4047 variable. (When such adjustment is not needed, then
4048 'adjustment_def' is zero). For example, if code is PLUS we create:
4049 new_temp = loop_exit_def + adjustment_def */
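/* For example (illustrative values): for a sum reduction with initial
   value 5 whose vector accumulator was started at {0,0,0,0} (see
   get_initial_def_for_reduction), if the combined lanes yield 10, the
   adjustment produces new_temp = 10 + 5.  */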
4050
4051 if (adjustment_def)
4052 {
4053 gcc_assert (!slp_reduc);
4054 if (nested_in_vect_loop)
4055 {
4056 new_phi = VEC_index (gimple, new_phis, 0);
4057 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4058 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4059 new_dest = vect_create_destination_var (scalar_dest, vectype);
4060 }
4061 else
4062 {
4063 new_temp = VEC_index (tree, scalar_results, 0);
4064 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4065 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4066 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4067 }
4068
4069 epilog_stmt = gimple_build_assign (new_dest, expr);
4070 new_temp = make_ssa_name (new_dest, epilog_stmt);
4071 gimple_assign_set_lhs (epilog_stmt, new_temp);
4072 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4073 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4074 if (nested_in_vect_loop)
4075 {
4076 set_vinfo_for_stmt (epilog_stmt,
4077 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4078 NULL));
4079 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4080 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4081
4082 if (!double_reduc)
4083 VEC_quick_push (tree, scalar_results, new_temp);
4084 else
4085 VEC_replace (tree, scalar_results, 0, new_temp);
4086 }
4087 else
4088 VEC_replace (tree, scalar_results, 0, new_temp);
4089
4090 VEC_replace (gimple, new_phis, 0, epilog_stmt);
4091 }
4092
4093 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4094 phis with new adjusted scalar results, i.e., replace use <s_out0>
4095 with use <s_out4>.
4096
4097 Transform:
4098 loop_exit:
4099 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4100 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4101 v_out2 = reduce <v_out1>
4102 s_out3 = extract_field <v_out2, 0>
4103 s_out4 = adjust_result <s_out3>
4104 use <s_out0>
4105 use <s_out0>
4106
4107 into:
4108
4109 loop_exit:
4110 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4111 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4112 v_out2 = reduce <v_out1>
4113 s_out3 = extract_field <v_out2, 0>
4114 s_out4 = adjust_result <s_out3>
4115 use <s_out4>
4116 use <s_out4> */
4117
4118
4119 /* In an SLP reduction chain we reduce vector results into one vector if
4120 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4121 the last stmt in the reduction chain, since we are looking for the loop
4122 exit phi node. */
4123 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4124 {
4125 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4126 SLP_TREE_SCALAR_STMTS (slp_node),
4127 group_size - 1));
4128 group_size = 1;
4129 }
4130
4131 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4132 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4133 need to match SCALAR_RESULTS with corresponding statements. The first
4134 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4135 the first vector stmt, etc.
4136 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
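/* For example (illustrative numbers): with GROUP_SIZE == 4 and two vector
   statements in NEW_PHIS, RATIO == 2, so scalar results 0 and 1 map to the
   first vector statement and scalar results 2 and 3 map to the second.  */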
4137 if (group_size > VEC_length (gimple, new_phis))
4138 {
4139 ratio = group_size / VEC_length (gimple, new_phis);
4140 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4141 }
4142 else
4143 ratio = 1;
4144
4145 for (k = 0; k < group_size; k++)
4146 {
4147 if (k % ratio == 0)
4148 {
4149 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4150 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4151 if (double_reduc)
4152 inner_phi = VEC_index (gimple, inner_phis, k / ratio);
4153 }
4154
4155 if (slp_reduc)
4156 {
4157 gimple current_stmt = VEC_index (gimple,
4158 SLP_TREE_SCALAR_STMTS (slp_node), k);
4159
4160 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4161 /* SLP statements can't participate in patterns. */
4162 gcc_assert (!orig_stmt);
4163 scalar_dest = gimple_assign_lhs (current_stmt);
4164 }
4165
4166 phis = VEC_alloc (gimple, heap, 3);
4167 /* Find the loop-closed-use at the loop exit of the original scalar
4168 result. (The reduction result is expected to have two immediate uses -
4169 one at the latch block, and one at the loop exit). */
4170 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4171 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4172 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4173
4174 /* We expect to have found an exit_phi because of loop-closed-ssa
4175 form. */
4176 gcc_assert (!VEC_empty (gimple, phis));
4177
4178 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4179 {
4180 if (outer_loop)
4181 {
4182 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4183 gimple vect_phi;
4184
4185 /* FORNOW. Currently not supporting the case that an inner-loop
4186 reduction is not used in the outer-loop (but only outside the
4187 outer-loop), unless it is a double reduction. */
4188 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4189 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4190 || double_reduc);
4191
4192 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4193 if (!double_reduc
4194 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4195 != vect_double_reduction_def)
4196 continue;
4197
4198 /* Handle double reduction:
4199
4200 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4201 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4202 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4203 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4204
4205 At that point the regular reduction (stmt2 and stmt3) is
4206 already vectorized, as well as the exit phi node, stmt4.
4207 Here we vectorize the phi node of double reduction, stmt1, and
4208 update all relevant statements. */
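/* Such a double reduction corresponds (roughly, as an illustrative source
   form) to a loop nest like

     for (i = 0; i < N; i++)        <- outer loop (the one being vectorized)
       for (j = 0; j < M; j++)      <- inner loop
         s += a[i][j];

   where s is a reduction of the inner loop whose result is itself
   accumulated across the outer loop.  */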
4209
4210 /* Go through all the uses of s2 to find double reduction phi
4211 node, i.e., stmt1 above. */
4212 orig_name = PHI_RESULT (exit_phi);
4213 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4214 {
4215 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4216 stmt_vec_info new_phi_vinfo;
4217 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4218 basic_block bb = gimple_bb (use_stmt);
4219 gimple use;
4220
4221 /* Check that USE_STMT is really a double reduction phi
4222 node. */
4223 if (gimple_code (use_stmt) != GIMPLE_PHI
4224 || gimple_phi_num_args (use_stmt) != 2
4225 || !use_stmt_vinfo
4226 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4227 != vect_double_reduction_def
4228 || bb->loop_father != outer_loop)
4229 continue;
4230
4231 /* Create vector phi node for double reduction:
4232 vs1 = phi <vs0, vs2>
4233 vs1 was created previously in this function by a call to
4234 vect_get_vec_def_for_operand and is stored in
4235 vec_initial_def;
4236 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4237 vs0 is created here. */
4238
4239 /* Create vector phi node. */
4240 vect_phi = create_phi_node (vec_initial_def, bb);
4241 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4242 loop_vec_info_for_loop (outer_loop), NULL);
4243 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4244
4245 /* Create vs0 - initial def of the double reduction phi. */
4246 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4247 loop_preheader_edge (outer_loop));
4248 init_def = get_initial_def_for_reduction (stmt,
4249 preheader_arg, NULL);
4250 vect_phi_init = vect_init_vector (use_stmt, init_def,
4251 vectype, NULL);
4252
4253 /* Update phi node arguments with vs0 and vs2. */
4254 add_phi_arg (vect_phi, vect_phi_init,
4255 loop_preheader_edge (outer_loop),
4256 UNKNOWN_LOCATION);
4257 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4258 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4259 if (vect_print_dump_info (REPORT_DETAILS))
4260 {
4261 fprintf (vect_dump, "created double reduction phi "
4262 "node: ");
4263 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4264 }
4265
4266 vect_phi_res = PHI_RESULT (vect_phi);
4267
4268 /* Replace the use, i.e., set the correct vs1 in the regular
4269 reduction phi node. FORNOW, NCOPIES is always 1, so the
4270 loop is redundant. */
4271 use = reduction_phi;
4272 for (j = 0; j < ncopies; j++)
4273 {
4274 edge pr_edge = loop_preheader_edge (loop);
4275 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4276 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4277 }
4278 }
4279 }
4280 }
4281
4282 VEC_free (gimple, heap, phis);
4283 if (nested_in_vect_loop)
4284 {
4285 if (double_reduc)
4286 loop = outer_loop;
4287 else
4288 continue;
4289 }
4290
4291 phis = VEC_alloc (gimple, heap, 3);
4292 /* Find the loop-closed-use at the loop exit of the original scalar
4293 result. (The reduction result is expected to have two immediate uses,
4294 one at the latch block, and one at the loop exit). For double
4295 reductions we are looking for exit phis of the outer loop. */
4296 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4297 {
4298 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4299 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4300 else
4301 {
4302 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4303 {
4304 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4305
4306 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4307 {
4308 if (!flow_bb_inside_loop_p (loop,
4309 gimple_bb (USE_STMT (phi_use_p))))
4310 VEC_safe_push (gimple, heap, phis,
4311 USE_STMT (phi_use_p));
4312 }
4313 }
4314 }
4315 }
4316
4317 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4318 {
4319 /* Replace the uses: */
4320 orig_name = PHI_RESULT (exit_phi);
4321 scalar_result = VEC_index (tree, scalar_results, k);
4322 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4323 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4324 SET_USE (use_p, scalar_result);
4325 }
4326
4327 VEC_free (gimple, heap, phis);
4328 }
4329
4330 VEC_free (tree, heap, scalar_results);
4331 VEC_free (gimple, heap, new_phis);
4332 }
4333
4334
4335 /* Function vectorizable_reduction.
4336
4337 Check if STMT performs a reduction operation that can be vectorized.
4338 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4339 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4340 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4341
4342 This function also handles reduction idioms (patterns) that have been
4343 recognized in advance during vect_pattern_recog. In this case, STMT may be
4344 of this form:
4345 X = pattern_expr (arg0, arg1, ..., X)
4346 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4347 sequence that had been detected and replaced by the pattern-stmt (STMT).
4348
4349 In some cases of reduction patterns, the type of the reduction variable X is
4350 different than the type of the other arguments of STMT.
4351 In such cases, the vectype that is used when transforming STMT into a vector
4352 stmt is different than the vectype that is used to determine the
4353 vectorization factor, because it consists of a different number of elements
4354 than the actual number of elements that are being operated upon in parallel.
4355
4356 For example, consider an accumulation of shorts into an int accumulator.
4357 On some targets it's possible to vectorize this pattern operating on 8
4358 shorts at a time (hence, the vectype for purposes of determining the
4359 vectorization factor should be V8HI); on the other hand, the vectype that
4360 is used to create the vector form is actually V4SI (the type of the result).
4361
4362 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4363 indicates what is the actual level of parallelism (V8HI in the example), so
4364 that the right vectorization factor would be derived. This vectype
4365 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4366 be used to create the vectorized stmt. The right vectype for the vectorized
4367 stmt is obtained from the type of the result X:
4368 get_vectype_for_scalar_type (TREE_TYPE (X))
4369
4370 This means that, contrary to "regular" reductions (or "regular" stmts in
4371 general), the following equation:
4372 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4373 does *NOT* necessarily hold for reduction patterns. */
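/* For instance (illustrative, assuming 128-bit vectors): for

     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];            <- recognized as a widen-sum pattern

   the vectype recorded for computing the vectorization factor is V8HI
   (eight shorts per iteration), while the vectorized statement itself
   produces V4SI values, obtained from the type of the result sum.  */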
4374
4375 bool
4376 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4377 gimple *vec_stmt, slp_tree slp_node)
4378 {
4379 tree vec_dest;
4380 tree scalar_dest;
4381 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4382 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4383 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4384 tree vectype_in = NULL_TREE;
4385 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4386 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4387 enum tree_code code, orig_code, epilog_reduc_code;
4388 enum machine_mode vec_mode;
4389 int op_type;
4390 optab optab, reduc_optab;
4391 tree new_temp = NULL_TREE;
4392 tree def;
4393 gimple def_stmt;
4394 enum vect_def_type dt;
4395 gimple new_phi = NULL;
4396 tree scalar_type;
4397 bool is_simple_use;
4398 gimple orig_stmt;
4399 stmt_vec_info orig_stmt_info;
4400 tree expr = NULL_TREE;
4401 int i;
4402 int ncopies;
4403 int epilog_copies;
4404 stmt_vec_info prev_stmt_info, prev_phi_info;
4405 bool single_defuse_cycle = false;
4406 tree reduc_def = NULL_TREE;
4407 gimple new_stmt = NULL;
4408 int j;
4409 tree ops[3];
4410 bool nested_cycle = false, found_nested_cycle_def = false;
4411 gimple reduc_def_stmt = NULL;
4412 /* The default is that the reduction variable is the last operand in the statement. */
4413 int reduc_index = 2;
4414 bool double_reduc = false, dummy;
4415 basic_block def_bb;
4416 struct loop * def_stmt_loop, *outer_loop = NULL;
4417 tree def_arg;
4418 gimple def_arg_stmt;
4419 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4420 VEC (gimple, heap) *phis = NULL;
4421 int vec_num;
4422 tree def0, def1, tem, op0, op1 = NULL_TREE;
4423
4424 /* In case of a reduction chain we switch to the first stmt in the chain, but
4425 we don't update STMT_INFO, since only the last stmt is marked as reduction
4426 and has reduction properties. */
4427 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4428 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4429
4430 if (nested_in_vect_loop_p (loop, stmt))
4431 {
4432 outer_loop = loop;
4433 loop = loop->inner;
4434 nested_cycle = true;
4435 }
4436
4437 /* 1. Is vectorizable reduction? */
4438 /* Not supportable if the reduction variable is used in the loop, unless
4439 it's a reduction chain. */
4440 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4441 && !GROUP_FIRST_ELEMENT (stmt_info))
4442 return false;
4443
4444 /* Reductions that are not used even in an enclosing outer-loop
4445 are expected to be "live" (used out of the loop). */
4446 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4447 && !STMT_VINFO_LIVE_P (stmt_info))
4448 return false;
4449
4450 /* Make sure it was already recognized as a reduction computation. */
4451 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4452 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4453 return false;
4454
4455 /* 2. Has this been recognized as a reduction pattern?
4456
4457 Check if STMT represents a pattern that has been recognized
4458 in earlier analysis stages. For stmts that represent a pattern,
4459 the STMT_VINFO_RELATED_STMT field records the last stmt in
4460 the original sequence that constitutes the pattern. */
4461
4462 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4463 if (orig_stmt)
4464 {
4465 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4466 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4467 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4468 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4469 }
4470
4471 /* 3. Check the operands of the operation. The first operands are defined
4472 inside the loop body. The last operand is the reduction variable,
4473 which is defined by the loop-header-phi. */
4474
4475 gcc_assert (is_gimple_assign (stmt));
4476
4477 /* Flatten RHS. */
4478 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4479 {
4480 case GIMPLE_SINGLE_RHS:
4481 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4482 if (op_type == ternary_op)
4483 {
4484 tree rhs = gimple_assign_rhs1 (stmt);
4485 ops[0] = TREE_OPERAND (rhs, 0);
4486 ops[1] = TREE_OPERAND (rhs, 1);
4487 ops[2] = TREE_OPERAND (rhs, 2);
4488 code = TREE_CODE (rhs);
4489 }
4490 else
4491 return false;
4492 break;
4493
4494 case GIMPLE_BINARY_RHS:
4495 code = gimple_assign_rhs_code (stmt);
4496 op_type = TREE_CODE_LENGTH (code);
4497 gcc_assert (op_type == binary_op);
4498 ops[0] = gimple_assign_rhs1 (stmt);
4499 ops[1] = gimple_assign_rhs2 (stmt);
4500 break;
4501
4502 case GIMPLE_TERNARY_RHS:
4503 code = gimple_assign_rhs_code (stmt);
4504 op_type = TREE_CODE_LENGTH (code);
4505 gcc_assert (op_type == ternary_op);
4506 ops[0] = gimple_assign_rhs1 (stmt);
4507 ops[1] = gimple_assign_rhs2 (stmt);
4508 ops[2] = gimple_assign_rhs3 (stmt);
4509 break;
4510
4511 case GIMPLE_UNARY_RHS:
4512 return false;
4513
4514 default:
4515 gcc_unreachable ();
4516 }
4517
4518 if (code == COND_EXPR && slp_node)
4519 return false;
4520
4521 scalar_dest = gimple_assign_lhs (stmt);
4522 scalar_type = TREE_TYPE (scalar_dest);
4523 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4524 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4525 return false;
4526
4527 /* Do not try to vectorize bit-precision reductions. */
4528 if ((TYPE_PRECISION (scalar_type)
4529 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4530 return false;
4531
4532 /* All uses but the last are expected to be defined in the loop.
4533 The last use is the reduction variable. In case of nested cycle this
4534 assumption is not true: we use reduc_index to record the index of the
4535 reduction variable. */
4536 for (i = 0; i < op_type-1; i++)
4537 {
4538 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4539 if (i == 0 && code == COND_EXPR)
4540 continue;
4541
4542 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4543 &def_stmt, &def, &dt, &tem);
4544 if (!vectype_in)
4545 vectype_in = tem;
4546 gcc_assert (is_simple_use);
4547
4548 if (dt != vect_internal_def
4549 && dt != vect_external_def
4550 && dt != vect_constant_def
4551 && dt != vect_induction_def
4552 && !(dt == vect_nested_cycle && nested_cycle))
4553 return false;
4554
4555 if (dt == vect_nested_cycle)
4556 {
4557 found_nested_cycle_def = true;
4558 reduc_def_stmt = def_stmt;
4559 reduc_index = i;
4560 }
4561 }
4562
4563 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4564 &def_stmt, &def, &dt, &tem);
4565 if (!vectype_in)
4566 vectype_in = tem;
4567 gcc_assert (is_simple_use);
4568 gcc_assert (dt == vect_reduction_def
4569 || dt == vect_nested_cycle
4570 || ((dt == vect_internal_def || dt == vect_external_def
4571 || dt == vect_constant_def || dt == vect_induction_def)
4572 && nested_cycle && found_nested_cycle_def));
4573 if (!found_nested_cycle_def)
4574 reduc_def_stmt = def_stmt;
4575
4576 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4577 if (orig_stmt)
4578 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4579 reduc_def_stmt,
4580 !nested_cycle,
4581 &dummy));
4582 else
4583 {
4584 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4585 !nested_cycle, &dummy);
4586 /* We changed STMT to be the first stmt in reduction chain, hence we
4587 check that in this case the first element in the chain is STMT. */
4588 gcc_assert (stmt == tmp
4589 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4590 }
4591
4592 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4593 return false;
4594
4595 if (slp_node || PURE_SLP_STMT (stmt_info))
4596 ncopies = 1;
4597 else
4598 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4599 / TYPE_VECTOR_SUBPARTS (vectype_in));
4600
4601 gcc_assert (ncopies >= 1);
4602
4603 vec_mode = TYPE_MODE (vectype_in);
4604
4605 if (code == COND_EXPR)
4606 {
4607 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4608 {
4609 if (vect_print_dump_info (REPORT_DETAILS))
4610 fprintf (vect_dump, "unsupported condition in reduction");
4611
4612 return false;
4613 }
4614 }
4615 else
4616 {
4617 /* 4. Supportable by target? */
4618
4619 /* 4.1. check support for the operation in the loop */
4620 optab = optab_for_tree_code (code, vectype_in, optab_default);
4621 if (!optab)
4622 {
4623 if (vect_print_dump_info (REPORT_DETAILS))
4624 fprintf (vect_dump, "no optab.");
4625
4626 return false;
4627 }
4628
4629 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4630 {
4631 if (vect_print_dump_info (REPORT_DETAILS))
4632 fprintf (vect_dump, "op not supported by target.");
4633
4634 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4635 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4636 < vect_min_worthwhile_factor (code))
4637 return false;
4638
4639 if (vect_print_dump_info (REPORT_DETAILS))
4640 fprintf (vect_dump, "proceeding using word mode.");
4641 }
4642
4643 /* Worthwhile without SIMD support? */
4644 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4645 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4646 < vect_min_worthwhile_factor (code))
4647 {
4648 if (vect_print_dump_info (REPORT_DETAILS))
4649 fprintf (vect_dump, "not worthwhile without SIMD support.");
4650
4651 return false;
4652 }
4653 }
4654
4655 /* 4.2. Check support for the epilog operation.
4656
4657 If STMT represents a reduction pattern, then the type of the
4658 reduction variable may be different than the type of the rest
4659 of the arguments. For example, consider the case of accumulation
4660 of shorts into an int accumulator. The original code:
4661 S1: int_a = (int) short_a;
4662 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4663
4664 was replaced with:
4665 STMT: int_acc = widen_sum <short_a, int_acc>
4666
4667 This means that:
4668 1. The tree-code that is used to create the vector operation in the
4669 epilog code (that reduces the partial results) is not the
4670 tree-code of STMT, but is rather the tree-code of the original
4671 stmt from the pattern that STMT is replacing. I.e, in the example
4672 above we want to use 'widen_sum' in the loop, but 'plus' in the
4673 epilog.
4674 2. The type (mode) we use to check available target support
4675 for the vector operation to be created in the *epilog*, is
4676 determined by the type of the reduction variable (in the example
4677 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4678 However the type (mode) we use to check available target support
4679 for the vector operation to be created *inside the loop*, is
4680 determined by the type of the other arguments to STMT (in the
4681 example we'd check this: optab_handler (widen_sum_optab,
4682 vect_short_mode)).
4683
4684 This is contrary to "regular" reductions, in which the types of all
4685 the arguments are the same as the type of the reduction variable.
4686 For "regular" reductions we can therefore use the same vector type
4687 (and also the same tree-code) when generating the epilog code and
4688 when generating the code inside the loop. */
4689
4690 if (orig_stmt)
4691 {
4692 /* This is a reduction pattern: get the vectype from the type of the
4693 reduction variable, and get the tree-code from orig_stmt. */
4694 orig_code = gimple_assign_rhs_code (orig_stmt);
4695 gcc_assert (vectype_out);
4696 vec_mode = TYPE_MODE (vectype_out);
4697 }
4698 else
4699 {
4700 /* Regular reduction: the same vectype and tree-code that are used for
4701 the vector code inside the loop can be used for the epilog code. */
4702 orig_code = code;
4703 }
4704
4705 if (nested_cycle)
4706 {
4707 def_bb = gimple_bb (reduc_def_stmt);
4708 def_stmt_loop = def_bb->loop_father;
4709 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4710 loop_preheader_edge (def_stmt_loop));
4711 if (TREE_CODE (def_arg) == SSA_NAME
4712 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4713 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4714 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4715 && vinfo_for_stmt (def_arg_stmt)
4716 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4717 == vect_double_reduction_def)
4718 double_reduc = true;
4719 }
4720
4721 epilog_reduc_code = ERROR_MARK;
4722 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4723 {
4724 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4725 optab_default);
4726 if (!reduc_optab)
4727 {
4728 if (vect_print_dump_info (REPORT_DETAILS))
4729 fprintf (vect_dump, "no optab for reduction.");
4730
4731 epilog_reduc_code = ERROR_MARK;
4732 }
4733
4734 if (reduc_optab
4735 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4736 {
4737 if (vect_print_dump_info (REPORT_DETAILS))
4738 fprintf (vect_dump, "reduc op not supported by target.");
4739
4740 epilog_reduc_code = ERROR_MARK;
4741 }
4742 }
4743 else
4744 {
4745 if (!nested_cycle || double_reduc)
4746 {
4747 if (vect_print_dump_info (REPORT_DETAILS))
4748 fprintf (vect_dump, "no reduc code for scalar code.");
4749
4750 return false;
4751 }
4752 }
4753
4754 if (double_reduc && ncopies > 1)
4755 {
4756 if (vect_print_dump_info (REPORT_DETAILS))
4757 fprintf (vect_dump, "multiple types in double reduction");
4758
4759 return false;
4760 }
4761
4762 /* In case of a widening multiplication by a constant, we update the type
4763 of the constant to be the type of the other operand. We check that the
4764 constant fits the type in the pattern recognition pass. */
4765 if (code == DOT_PROD_EXPR
4766 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4767 {
4768 if (TREE_CODE (ops[0]) == INTEGER_CST)
4769 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4770 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4771 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4772 else
4773 {
4774 if (vect_print_dump_info (REPORT_DETAILS))
4775 fprintf (vect_dump, "invalid types in dot-prod");
4776
4777 return false;
4778 }
4779 }
4780
4781 if (!vec_stmt) /* transformation not required. */
4782 {
4783 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4784 return false;
4785 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4786 return true;
4787 }
4788
4789 /** Transform. **/
4790
4791 if (vect_print_dump_info (REPORT_DETAILS))
4792 fprintf (vect_dump, "transform reduction.");
4793
4794 /* FORNOW: Multiple types are not supported for condition. */
4795 if (code == COND_EXPR)
4796 gcc_assert (ncopies == 1);
4797
4798 /* Create the destination vector */
4799 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4800
4801 /* In case the vectorization factor (VF) is bigger than the number
4802 of elements that we can fit in a vectype (nunits), we have to generate
4803 more than one vector stmt - i.e., we need to "unroll" the
4804 vector stmt by a factor VF/nunits. For more details see documentation
4805 in vectorizable_operation. */
4806
4807 /* If the reduction is used in an outer loop we need to generate
4808 VF intermediate results, like so (e.g. for ncopies=2):
4809 r0 = phi (init, r0)
4810 r1 = phi (init, r1)
4811 r0 = x0 + r0;
4812 r1 = x1 + r1;
4813 (i.e. we generate VF results in 2 registers).
4814 In this case we have a separate def-use cycle for each copy, and therefore
4815 for each copy we get the vector def for the reduction variable from the
4816 respective phi node created for this copy.
4817
4818 Otherwise (the reduction is unused in the loop nest), we can combine
4819 together intermediate results, like so (e.g. for ncopies=2):
4820 r = phi (init, r)
4821 r = x0 + r;
4822 r = x1 + r;
4823 (i.e. we generate VF/2 results in a single register).
4824 In this case for each copy we get the vector def for the reduction variable
4825 from the vectorized reduction operation generated in the previous iteration.
4826 */
4827
4828 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4829 {
4830 single_defuse_cycle = true;
4831 epilog_copies = 1;
4832 }
4833 else
4834 epilog_copies = ncopies;
4835
4836 prev_stmt_info = NULL;
4837 prev_phi_info = NULL;
4838 if (slp_node)
4839 {
4840 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4841 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4842 == TYPE_VECTOR_SUBPARTS (vectype_in));
4843 }
4844 else
4845 {
4846 vec_num = 1;
4847 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4848 if (op_type == ternary_op)
4849 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4850 }
4851
4852 phis = VEC_alloc (gimple, heap, vec_num);
4853 vect_defs = VEC_alloc (tree, heap, vec_num);
4854 if (!slp_node)
4855 VEC_quick_push (tree, vect_defs, NULL_TREE);
4856
4857 for (j = 0; j < ncopies; j++)
4858 {
4859 if (j == 0 || !single_defuse_cycle)
4860 {
4861 for (i = 0; i < vec_num; i++)
4862 {
4863 /* Create the reduction-phi that defines the reduction
4864 operand. */
4865 new_phi = create_phi_node (vec_dest, loop->header);
4866 set_vinfo_for_stmt (new_phi,
4867 new_stmt_vec_info (new_phi, loop_vinfo,
4868 NULL));
4869 if (j == 0 || slp_node)
4870 VEC_quick_push (gimple, phis, new_phi);
4871 }
4872 }
4873
4874 if (code == COND_EXPR)
4875 {
4876 gcc_assert (!slp_node);
4877 vectorizable_condition (stmt, gsi, vec_stmt,
4878 PHI_RESULT (VEC_index (gimple, phis, 0)),
4879 reduc_index, NULL);
4880 /* Multiple types are not supported for condition. */
4881 break;
4882 }
4883
4884 /* Handle uses. */
4885 if (j == 0)
4886 {
4887 op0 = ops[!reduc_index];
4888 if (op_type == ternary_op)
4889 {
4890 if (reduc_index == 0)
4891 op1 = ops[2];
4892 else
4893 op1 = ops[1];
4894 }
4895
4896 if (slp_node)
4897 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4898 slp_node, -1);
4899 else
4900 {
4901 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4902 stmt, NULL);
4903 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4904 if (op_type == ternary_op)
4905 {
4906 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4907 NULL);
4908 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4909 }
4910 }
4911 }
4912 else
4913 {
4914 if (!slp_node)
4915 {
4916 enum vect_def_type dt;
4917 gimple dummy_stmt;
4918 tree dummy;
4919
4920 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
4921 &dummy_stmt, &dummy, &dt);
4922 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
4923 loop_vec_def0);
4924 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4925 if (op_type == ternary_op)
4926 {
4927 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
4928 &dummy, &dt);
4929 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4930 loop_vec_def1);
4931 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4932 }
4933 }
4934
4935 if (single_defuse_cycle)
4936 reduc_def = gimple_assign_lhs (new_stmt);
4937
4938 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4939 }
4940
4941 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4942 {
4943 if (slp_node)
4944 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4945 else
4946 {
4947 if (!single_defuse_cycle || j == 0)
4948 reduc_def = PHI_RESULT (new_phi);
4949 }
4950
4951 def1 = ((op_type == ternary_op)
4952 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4953 if (op_type == binary_op)
4954 {
4955 if (reduc_index == 0)
4956 expr = build2 (code, vectype_out, reduc_def, def0);
4957 else
4958 expr = build2 (code, vectype_out, def0, reduc_def);
4959 }
4960 else
4961 {
4962 if (reduc_index == 0)
4963 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4964 else
4965 {
4966 if (reduc_index == 1)
4967 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4968 else
4969 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4970 }
4971 }
4972
4973 new_stmt = gimple_build_assign (vec_dest, expr);
4974 new_temp = make_ssa_name (vec_dest, new_stmt);
4975 gimple_assign_set_lhs (new_stmt, new_temp);
4976 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4977
4978 if (slp_node)
4979 {
4980 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4981 VEC_quick_push (tree, vect_defs, new_temp);
4982 }
4983 else
4984 VEC_replace (tree, vect_defs, 0, new_temp);
4985 }
4986
4987 if (slp_node)
4988 continue;
4989
4990 if (j == 0)
4991 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4992 else
4993 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4994
4995 prev_stmt_info = vinfo_for_stmt (new_stmt);
4996 prev_phi_info = vinfo_for_stmt (new_phi);
4997 }
4998
4999 /* Finalize the reduction-phi (set its arguments) and create the
5000 epilog reduction code. */
5001 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5002 {
5003 new_temp = gimple_assign_lhs (*vec_stmt);
5004 VEC_replace (tree, vect_defs, 0, new_temp);
5005 }
5006
5007 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5008 epilog_reduc_code, phis, reduc_index,
5009 double_reduc, slp_node);
5010
5011 VEC_free (gimple, heap, phis);
5012 VEC_free (tree, heap, vec_oprnds0);
5013 if (vec_oprnds1)
5014 VEC_free (tree, heap, vec_oprnds1);
5015
5016 return true;
5017 }
5018
5019 /* Function vect_min_worthwhile_factor.
5020
5021 For a loop where we could vectorize the operation indicated by CODE,
5022 return the minimum vectorization factor that makes it worthwhile
5023 to use generic vectors. */
5024 int
5025 vect_min_worthwhile_factor (enum tree_code code)
5026 {
5027 switch (code)
5028 {
5029 case PLUS_EXPR:
5030 case MINUS_EXPR:
5031 case NEGATE_EXPR:
5032 return 4;
5033
5034 case BIT_AND_EXPR:
5035 case BIT_IOR_EXPR:
5036 case BIT_XOR_EXPR:
5037 case BIT_NOT_EXPR:
5038 return 2;
5039
5040 default:
5041 return INT_MAX;
5042 }
5043 }
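/* For example, the callers above compare the loop's vectorization factor
   against this value: an emulated (non-SIMD) PLUS_EXPR reduction with a
   vectorization factor of 2 is rejected as not worthwhile, while a factor
   of 4 or more is allowed to proceed using generic vectors.  */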
5044
5045
5046 /* Function vectorizable_induction
5047
5048 Check if PHI performs an induction computation that can be vectorized.
5049 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5050 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5051 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
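/* For example (illustrative), the induction

     for (i = 0; i < N; i++)
       a[i] = i;

   is vectorized, for a vectorization factor of 4, by a vector phi whose
   initial value is roughly {0, 1, 2, 3} and whose latch value adds the
   step vector {4, 4, 4, 4} on every iteration (the details are handled by
   get_initial_def_for_induction).  */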
5052
5053 bool
5054 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5055 gimple *vec_stmt)
5056 {
5057 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5058 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5059 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5060 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5061 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5062 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5063 tree vec_def;
5064
5065 gcc_assert (ncopies >= 1);
5066 /* FORNOW. These restrictions should be relaxed. */
5067 if (nested_in_vect_loop_p (loop, phi))
5068 {
5069 imm_use_iterator imm_iter;
5070 use_operand_p use_p;
5071 gimple exit_phi;
5072 edge latch_e;
5073 tree loop_arg;
5074
5075 if (ncopies > 1)
5076 {
5077 if (vect_print_dump_info (REPORT_DETAILS))
5078 fprintf (vect_dump, "multiple types in nested loop.");
5079 return false;
5080 }
5081
5082 exit_phi = NULL;
5083 latch_e = loop_latch_edge (loop->inner);
5084 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5085 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5086 {
5087 if (!flow_bb_inside_loop_p (loop->inner,
5088 gimple_bb (USE_STMT (use_p))))
5089 {
5090 exit_phi = USE_STMT (use_p);
5091 break;
5092 }
5093 }
5094 if (exit_phi)
5095 {
5096 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5097 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5098 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5099 {
5100 if (vect_print_dump_info (REPORT_DETAILS))
5101 fprintf (vect_dump, "inner-loop induction only used outside "
5102 "of the outer vectorized loop.");
5103 return false;
5104 }
5105 }
5106 }
5107
5108 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5109 return false;
5110
5111 /* FORNOW: SLP not supported. */
5112 if (STMT_SLP_TYPE (stmt_info))
5113 return false;
5114
5115 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5116
5117 if (gimple_code (phi) != GIMPLE_PHI)
5118 return false;
5119
5120 if (!vec_stmt) /* transformation not required. */
5121 {
5122 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5123 if (vect_print_dump_info (REPORT_DETAILS))
5124 fprintf (vect_dump, "=== vectorizable_induction ===");
5125 vect_model_induction_cost (stmt_info, ncopies);
5126 return true;
5127 }
5128
5129 /** Transform. **/
5130
5131 if (vect_print_dump_info (REPORT_DETAILS))
5132 fprintf (vect_dump, "transform induction phi.");
5133
5134 vec_def = get_initial_def_for_induction (phi);
5135 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5136 return true;
5137 }
5138
5139 /* Function vectorizable_live_operation.
5140
5141 STMT computes a value that is used outside the loop. Check if
5142 it can be supported. */
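
/* For example (a sketch), in

     x = ...; y = ...;         (defined before the loop)
     for (i = 0; i < N; i++)
       {
         t = x + y;
         a[i] = t * i;
       }
     ... = t;                  (used after the loop)

   the statement computing T is "live": its value is needed after the
   loop.  Because both of its operands are loop invariant, the scalar
   statement can stay in place, unvectorized, and its final value can be
   used directly; that is the only case handled below.  */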
5143
5144 bool
5145 vectorizable_live_operation (gimple stmt,
5146 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5147 gimple *vec_stmt ATTRIBUTE_UNUSED)
5148 {
5149 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5151 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5152 int i;
5153 int op_type;
5154 tree op;
5155 tree def;
5156 gimple def_stmt;
5157 enum vect_def_type dt;
5158 enum tree_code code;
5159 enum gimple_rhs_class rhs_class;
5160
5161 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5162
5163 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5164 return false;
5165
5166 if (!is_gimple_assign (stmt))
5167 return false;
5168
5169 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5170 return false;
5171
5172 /* FORNOW. CHECKME. */
5173 if (nested_in_vect_loop_p (loop, stmt))
5174 return false;
5175
5176 code = gimple_assign_rhs_code (stmt);
5177 op_type = TREE_CODE_LENGTH (code);
5178 rhs_class = get_gimple_rhs_class (code);
5179 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5180 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5181
5182 /* FORNOW: support only if all uses are invariant. This means
5183 that the scalar operations can remain in place, unvectorized.
5184 The original last scalar value that they compute will be used. */
5185
5186 for (i = 0; i < op_type; i++)
5187 {
5188 if (rhs_class == GIMPLE_SINGLE_RHS)
5189 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5190 else
5191 op = gimple_op (stmt, i + 1);
5192 if (op
5193 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5194 &dt))
5195 {
5196 if (vect_print_dump_info (REPORT_DETAILS))
5197 fprintf (vect_dump, "use not simple.");
5198 return false;
5199 }
5200
5201 if (dt != vect_external_def && dt != vect_constant_def)
5202 return false;
5203 }
5204
5205 /* No transformation is required for the cases we currently support. */
5206 return true;
5207 }
5208
5209 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
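
/* For example (a sketch), if the scalar loop defined LAST_3 and a block
   after the loop contained the debug bind

     # DEBUG last => last_3

   then, once the scalar definition is replaced by vectorized code, that
   bind is reset (it appears in dumps roughly as "# DEBUG last => NULL"),
   so the debugger reports the variable as optimized out rather than
   showing a stale value.  */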
5210
5211 static void
5212 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5213 {
5214 ssa_op_iter op_iter;
5215 imm_use_iterator imm_iter;
5216 def_operand_p def_p;
5217 gimple ustmt;
5218
5219 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5220 {
5221 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5222 {
5223 basic_block bb;
5224
5225 if (!is_gimple_debug (ustmt))
5226 continue;
5227
5228 bb = gimple_bb (ustmt);
5229
5230 if (!flow_bb_inside_loop_p (loop, bb))
5231 {
5232 if (gimple_debug_bind_p (ustmt))
5233 {
5234 if (vect_print_dump_info (REPORT_DETAILS))
5235 fprintf (vect_dump, "killing debug use");
5236
5237 gimple_debug_bind_reset_value (ustmt);
5238 update_stmt (ustmt);
5239 }
5240 else
5241 gcc_unreachable ();
5242 }
5243 }
5244 }
5245 }
5246
5247 /* Function vect_transform_loop.
5248
5249 The analysis phase has determined that the loop is vectorizable.
5250 Vectorize the loop - create vectorized stmts to replace the scalar
5251 stmts in the loop, and update the loop exit condition. */
5252
5253 void
5254 vect_transform_loop (loop_vec_info loop_vinfo)
5255 {
5256 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5257 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5258 int nbbs = loop->num_nodes;
5259 gimple_stmt_iterator si;
5260 int i;
5261 tree ratio = NULL;
5262 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5263 bool grouped_store;
5264 bool slp_scheduled = false;
5265 unsigned int nunits;
5266 gimple stmt, pattern_stmt;
5267 gimple_seq pattern_def_seq = NULL;
5268 gimple_stmt_iterator pattern_def_si = gsi_none ();
5269 bool transform_pattern_stmt = false;
5270 bool check_profitability = false;
5271 int th;
5272
5273 if (vect_print_dump_info (REPORT_DETAILS))
5274 fprintf (vect_dump, "=== vec_transform_loop ===");
5275
5276 /* Use the more conservative vectorization threshold. If the number
5277 of iterations is constant, assume the cost check has been performed
5278 by our caller. If the threshold makes all loops profitable that
5279 run at least the vectorization factor number of times, checking
5280 is pointless, too. */
5281 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5282 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5283 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
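/* Worked example (parameter values assumed for illustration): with
   PARAM_MIN_VECT_LOOP_BOUND equal to 1 and a vectorization factor of 4,
   TH starts at 1 * 4 - 1 = 3; if the cost model requires at least 10
   iterations to be profitable, TH becomes 10.  Since 10 >= 4 - 1 and the
   iteration count is not a compile-time constant, CHECK_PROFITABILITY is
   set below and a runtime check against TH is emitted by the peeling and
   versioning code that follows.  */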
5284 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5285 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5286 {
5287 if (vect_print_dump_info (REPORT_COST))
5288 fprintf (vect_dump,
5289 "Profitability threshold is %d loop iterations.", th);
5290 check_profitability = true;
5291 }
5292
5293 /* Peel the loop if there are data refs with unknown alignment.
5294 Peeling can force the alignment of at most one such data ref. */
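/* For illustration (numbers assumed): if the chosen data ref accesses
   4-byte elements, the target wants 16-byte alignment, and the first
   accessed element lies 8 bytes past a 16-byte boundary, peeling 2
   scalar iterations (8 + 2 * 4 = 16) lets the vectorized loop start at
   an aligned address.  */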
5295
5296 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5297 {
5298 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5299 check_profitability = false;
5300 }
5301
5302 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5303 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5304 {
5305 vect_loop_versioning (loop_vinfo, th, check_profitability);
5306 check_profitability = false;
5307 }
5308
5309 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5310 compile time constant), or it is a constant that is not a multiple of the
5311 vectorization factor, then an epilog loop needs to be created.
5312 We therefore duplicate the loop: the original loop will be vectorized,
5313 executing n/VF vector iterations that compute the first (n/VF)*VF scalar
5314 iterations. The second copy of the loop will remain scalar and will compute
5315 the remaining (n%VF) iterations. (VF is the vectorization factor). */
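/* For illustration (values assumed): with n = 103 and VF = 8, the
   vectorized loop executes 103 / 8 = 12 vector iterations, covering the
   first 96 scalar iterations; the scalar epilog loop then executes the
   remaining 103 % 8 = 7 iterations.  */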
5316
5317 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5318 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5319 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5320 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5321 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5322 th, check_profitability);
5323 else
5324 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5325 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5326
5327 /* 1) Make sure the loop header has exactly two entries
5328 2) Make sure we have a preheader basic block. */
5329
5330 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5331
5332 split_edge (loop_preheader_edge (loop));
5333
5334 /* FORNOW: the vectorizer supports only loops whose body consists
5335 of one basic block (header + empty latch). When the vectorizer
5336 supports more involved loop forms, the order in which the BBs are
5337 traversed will need to be reconsidered. */
5338
5339 for (i = 0; i < nbbs; i++)
5340 {
5341 basic_block bb = bbs[i];
5342 stmt_vec_info stmt_info;
5343 gimple phi;
5344
5345 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5346 {
5347 phi = gsi_stmt (si);
5348 if (vect_print_dump_info (REPORT_DETAILS))
5349 {
5350 fprintf (vect_dump, "------>vectorizing phi: ");
5351 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5352 }
5353 stmt_info = vinfo_for_stmt (phi);
5354 if (!stmt_info)
5355 continue;
5356
5357 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5358 vect_loop_kill_debug_uses (loop, phi);
5359
5360 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5361 && !STMT_VINFO_LIVE_P (stmt_info))
5362 continue;
5363
5364 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5365 != (unsigned HOST_WIDE_INT) vectorization_factor)
5366 && vect_print_dump_info (REPORT_DETAILS))
5367 fprintf (vect_dump, "multiple-types.");
5368
5369 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5370 {
5371 if (vect_print_dump_info (REPORT_DETAILS))
5372 fprintf (vect_dump, "transform phi.");
5373 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5374 }
5375 }
5376
5377 pattern_stmt = NULL;
5378 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5379 {
5380 bool is_store;
5381
5382 if (transform_pattern_stmt)
5383 stmt = pattern_stmt;
5384 else
5385 stmt = gsi_stmt (si);
5386
5387 if (vect_print_dump_info (REPORT_DETAILS))
5388 {
5389 fprintf (vect_dump, "------>vectorizing statement: ");
5390 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5391 }
5392
5393 stmt_info = vinfo_for_stmt (stmt);
5394
5395 /* vector stmts created in the outer-loop during vectorization of
5396 stmts in an inner-loop may not have a stmt_info, and do not
5397 need to be vectorized. */
5398 if (!stmt_info)
5399 {
5400 gsi_next (&si);
5401 continue;
5402 }
5403
5404 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5405 vect_loop_kill_debug_uses (loop, stmt);
5406
5407 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5408 && !STMT_VINFO_LIVE_P (stmt_info))
5409 {
5410 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5411 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5412 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5413 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5414 {
5415 stmt = pattern_stmt;
5416 stmt_info = vinfo_for_stmt (stmt);
5417 }
5418 else
5419 {
5420 gsi_next (&si);
5421 continue;
5422 }
5423 }
5424 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5425 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5426 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5427 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5428 transform_pattern_stmt = true;
5429
5430 /* If pattern statement has def stmts, vectorize them too. */
5431 if (is_pattern_stmt_p (stmt_info))
5432 {
5433 if (pattern_def_seq == NULL)
5434 {
5435 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5436 pattern_def_si = gsi_start (pattern_def_seq);
5437 }
5438 else if (!gsi_end_p (pattern_def_si))
5439 gsi_next (&pattern_def_si);
5440 if (pattern_def_seq != NULL)
5441 {
5442 gimple pattern_def_stmt = NULL;
5443 stmt_vec_info pattern_def_stmt_info = NULL;
5444
5445 while (!gsi_end_p (pattern_def_si))
5446 {
5447 pattern_def_stmt = gsi_stmt (pattern_def_si);
5448 pattern_def_stmt_info
5449 = vinfo_for_stmt (pattern_def_stmt);
5450 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5451 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5452 break;
5453 gsi_next (&pattern_def_si);
5454 }
5455
5456 if (!gsi_end_p (pattern_def_si))
5457 {
5458 if (vect_print_dump_info (REPORT_DETAILS))
5459 {
5460 fprintf (vect_dump, "==> vectorizing pattern def"
5461 " stmt: ");
5462 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
5463 TDF_SLIM);
5464 }
5465
5466 stmt = pattern_def_stmt;
5467 stmt_info = pattern_def_stmt_info;
5468 }
5469 else
5470 {
5471 pattern_def_si = gsi_none ();
5472 transform_pattern_stmt = false;
5473 }
5474 }
5475 else
5476 transform_pattern_stmt = false;
5477 }
5478
5479 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5480 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5481 STMT_VINFO_VECTYPE (stmt_info));
5482 if (!STMT_SLP_TYPE (stmt_info)
5483 && nunits != (unsigned int) vectorization_factor
5484 && vect_print_dump_info (REPORT_DETAILS))
5485 /* For SLP, VF is set according to the unrolling factor, and not to
5486 the vector size; hence for SLP this print is not valid. */
5487 fprintf (vect_dump, "multiple-types.");
5488
5489 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5490 reached. */
5491 if (STMT_SLP_TYPE (stmt_info))
5492 {
5493 if (!slp_scheduled)
5494 {
5495 slp_scheduled = true;
5496
5497 if (vect_print_dump_info (REPORT_DETAILS))
5498 fprintf (vect_dump, "=== scheduling SLP instances ===");
5499
5500 vect_schedule_slp (loop_vinfo, NULL);
5501 }
5502
5503 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5504 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5505 {
5506 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5507 {
5508 pattern_def_seq = NULL;
5509 gsi_next (&si);
5510 }
5511 continue;
5512 }
5513 }
5514
5515 /* -------- vectorize statement ------------ */
5516 if (vect_print_dump_info (REPORT_DETAILS))
5517 fprintf (vect_dump, "transform statement.");
5518
5519 grouped_store = false;
5520 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5521 if (is_store)
5522 {
5523 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5524 {
5525 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5526 interleaving chain was completed - free all the stores in
5527 the chain. */
5528 gsi_next (&si);
5529 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5530 continue;
5531 }
5532 else
5533 {
5534 /* Free the attached stmt_vec_info and remove the stmt. */
5535 gimple store = gsi_stmt (si);
5536 free_stmt_vec_info (store);
5537 unlink_stmt_vdef (store);
5538 gsi_remove (&si, true);
5539 release_defs (store);
5540 continue;
5541 }
5542 }
5543
5544 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5545 {
5546 pattern_def_seq = NULL;
5547 gsi_next (&si);
5548 }
5549 } /* stmts in BB */
5550 } /* BBs in loop */
5551
5552 slpeel_make_loop_iterate_ntimes (loop, ratio);
5553
5554 /* The memory tags and pointers in vectorized statements need to
5555 have their SSA forms updated. FIXME, why can't this be delayed
5556 until all the loops have been transformed? */
5557 update_ssa (TODO_update_ssa);
5558
5559 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5560 fprintf (vect_dump, "LOOP VECTORIZED.");
5561 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5562 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
5563 }