1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50
51 /* Loop Vectorization Pass.
52
53 This pass tries to vectorize loops.
54
55 For example, the vectorizer transforms the following simple loop:
56
57 short a[N]; short b[N]; short c[N]; int i;
58
59 for (i=0; i<N; i++){
60 a[i] = b[i] + c[i];
61 }
62
63 as if it were manually vectorized by rewriting the source code into:
64
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
68 v8hi va, vb, vc;
69
70 for (i=0; i<N/8; i++){
71 vb = pb[i];
72 vc = pc[i];
73 va = vb + vc;
74 pa[i] = va;
75 }
76
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
84 during analysis and transformation. The types of data-refs that the
85 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
86 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
87 accesses are required to have a simple (consecutive) access pattern.
88
89 Analysis phase:
90 ===============
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
94
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
99
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
110
111 For example, say stmt S1 was vectorized into stmt VS1:
112
113 VS1: vb = px[i];
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
115 S2: a = b;
116
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 VS2: va = vb;
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
126
127 Operands that are not SSA_NAMEs, are data-refs that appear in
128 load/store operations (like 'x[i]' in S1), and are handled differently.
129
130 Target modeling:
131 =================
132 Currently the only target specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support different sizes of vectors will, for now, need
135 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
136 flexibility will be added in the future.
137
138 Since we only vectorize operations whose vector form can be
139 expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
144
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
147 */
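/* Illustrative sketch (an editorial addition, not part of the pass itself):
   the target support check described above amounts to

     if (optab_handler (add_optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       ... not vectorizable: no target instruction for this vector add ...

   where optab_handler and CODE_FOR_nothing belong to the existing optab
   query interface mentioned above, and "vectype" stands for the vector type
   chosen for the statement under analysis.  */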
148
149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
150
151 /* Function vect_determine_vectorization_factor
152
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop. For example, when vectorizing a loop that operates on 4-byte elements,
156 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
157 elements can fit in a single vector register.
158
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
162 in the loop.
163
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 original loop:
166 for (i=0; i<N; i++){
167 a[i] = b[i] + c[i];
168 }
169
170 vectorized loop:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
173 }
174 */
175
176 static bool
177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
178 {
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
181 unsigned nbbs = loop->num_nodes;
182 unsigned int vectorization_factor = 0;
183 tree scalar_type;
184 gphi *phi;
185 tree vectype;
186 unsigned int nunits;
187 stmt_vec_info stmt_info;
188 unsigned i;
189 HOST_WIDE_INT dummy;
190 gimple *stmt, *pattern_stmt = NULL;
191 gimple_seq pattern_def_seq = NULL;
192 gimple_stmt_iterator pattern_def_si = gsi_none ();
193 bool analyze_pattern_stmt = false;
194 bool bool_result;
195 auto_vec<stmt_vec_info> mask_producers;
196
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_NOTE, vect_location,
199 "=== vect_determine_vectorization_factor ===\n");
200
201 for (i = 0; i < nbbs; i++)
202 {
203 basic_block bb = bbs[i];
204
205 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
206 gsi_next (&si))
207 {
208 phi = si.phi ();
209 stmt_info = vinfo_for_stmt (phi);
210 if (dump_enabled_p ())
211 {
212 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
213 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
214 dump_printf (MSG_NOTE, "\n");
215 }
216
217 gcc_assert (stmt_info);
218
219 if (STMT_VINFO_RELEVANT_P (stmt_info))
220 {
221 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
222 scalar_type = TREE_TYPE (PHI_RESULT (phi));
223
224 if (dump_enabled_p ())
225 {
226 dump_printf_loc (MSG_NOTE, vect_location,
227 "get vectype for scalar type: ");
228 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
229 dump_printf (MSG_NOTE, "\n");
230 }
231
232 vectype = get_vectype_for_scalar_type (scalar_type);
233 if (!vectype)
234 {
235 if (dump_enabled_p ())
236 {
237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
238 "not vectorized: unsupported "
239 "data-type ");
240 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
241 scalar_type);
242 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
243 }
244 return false;
245 }
246 STMT_VINFO_VECTYPE (stmt_info) = vectype;
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
251 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
252 dump_printf (MSG_NOTE, "\n");
253 }
254
255 nunits = TYPE_VECTOR_SUBPARTS (vectype);
256 if (dump_enabled_p ())
257 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
258 nunits);
259
260 if (!vectorization_factor
261 || (nunits > vectorization_factor))
262 vectorization_factor = nunits;
263 }
264 }
265
266 for (gimple_stmt_iterator si = gsi_start_bb (bb);
267 !gsi_end_p (si) || analyze_pattern_stmt;)
268 {
269 tree vf_vectype;
270
271 if (analyze_pattern_stmt)
272 stmt = pattern_stmt;
273 else
274 stmt = gsi_stmt (si);
275
276 stmt_info = vinfo_for_stmt (stmt);
277
278 if (dump_enabled_p ())
279 {
280 dump_printf_loc (MSG_NOTE, vect_location,
281 "==> examining statement: ");
282 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
283 dump_printf (MSG_NOTE, "\n");
284 }
285
286 gcc_assert (stmt_info);
287
288 /* Skip stmts which do not need to be vectorized. */
289 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
290 && !STMT_VINFO_LIVE_P (stmt_info))
291 || gimple_clobber_p (stmt))
292 {
293 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
297 {
298 stmt = pattern_stmt;
299 stmt_info = vinfo_for_stmt (pattern_stmt);
300 if (dump_enabled_p ())
301 {
302 dump_printf_loc (MSG_NOTE, vect_location,
303 "==> examining pattern statement: ");
304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
305 dump_printf (MSG_NOTE, "\n");
306 }
307 }
308 else
309 {
310 if (dump_enabled_p ())
311 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
312 gsi_next (&si);
313 continue;
314 }
315 }
316 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
317 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
318 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
319 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
320 analyze_pattern_stmt = true;
321
322 /* If a pattern statement has def stmts, analyze them too. */
323 if (is_pattern_stmt_p (stmt_info))
324 {
325 if (pattern_def_seq == NULL)
326 {
327 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
328 pattern_def_si = gsi_start (pattern_def_seq);
329 }
330 else if (!gsi_end_p (pattern_def_si))
331 gsi_next (&pattern_def_si);
332 if (pattern_def_seq != NULL)
333 {
334 gimple *pattern_def_stmt = NULL;
335 stmt_vec_info pattern_def_stmt_info = NULL;
336
337 while (!gsi_end_p (pattern_def_si))
338 {
339 pattern_def_stmt = gsi_stmt (pattern_def_si);
340 pattern_def_stmt_info
341 = vinfo_for_stmt (pattern_def_stmt);
342 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
343 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
344 break;
345 gsi_next (&pattern_def_si);
346 }
347
348 if (!gsi_end_p (pattern_def_si))
349 {
350 if (dump_enabled_p ())
351 {
352 dump_printf_loc (MSG_NOTE, vect_location,
353 "==> examining pattern def stmt: ");
354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
355 pattern_def_stmt, 0);
356 dump_printf (MSG_NOTE, "\n");
357 }
358
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
361 }
362 else
363 {
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
366 }
367 }
368 else
369 analyze_pattern_stmt = false;
370 }
371
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
377 {
378 if (is_gimple_call (stmt))
379 {
380 /* Ignore calls with no lhs. These must be calls to
381 #pragma omp simd functions, and the vectorization factor
382 they really need can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
385 {
386 pattern_def_seq = NULL;
387 gsi_next (&si);
388 }
389 continue;
390 }
391 if (dump_enabled_p ())
392 {
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
396 0);
397 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
398 }
399 return false;
400 }
401
402 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
403 {
404 if (dump_enabled_p ())
405 {
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: vector stmt in loop:");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
410 }
411 return false;
412 }
413
414 bool_result = false;
415
416 if (STMT_VINFO_VECTYPE (stmt_info))
417 {
418 /* The only case when a vectype has already been set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
426 }
427 else
428 {
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (is_gimple_call (stmt)
431 && gimple_call_internal_p (stmt)
432 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
433 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
434 else
435 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436
437 /* Bool ops don't participate in vectorization factor
438 computation. For comparisons, use the compared types to
439 compute a factor. */
440 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
441 {
442 mask_producers.safe_push (stmt_info);
443 bool_result = true;
444
445 if (gimple_code (stmt) == GIMPLE_ASSIGN
446 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
447 == tcc_comparison
448 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
449 != BOOLEAN_TYPE)
450 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
451 else
452 {
453 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 {
455 pattern_def_seq = NULL;
456 gsi_next (&si);
457 }
458 continue;
459 }
460 }
461
462 if (dump_enabled_p ())
463 {
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "get vectype for scalar type: ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
467 dump_printf (MSG_NOTE, "\n");
468 }
469 vectype = get_vectype_for_scalar_type (scalar_type);
470 if (!vectype)
471 {
472 if (dump_enabled_p ())
473 {
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized: unsupported "
476 "data-type ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
478 scalar_type);
479 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 }
481 return false;
482 }
483
484 if (!bool_result)
485 STMT_VINFO_VECTYPE (stmt_info) = vectype;
486
487 if (dump_enabled_p ())
488 {
489 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
490 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
491 dump_printf (MSG_NOTE, "\n");
492 }
493 }
494
495 /* Don't try to compute VF from scalar types if the stmt
496 produces a boolean vector. Use the result vectype instead. */
497 if (VECTOR_BOOLEAN_TYPE_P (vectype))
498 vf_vectype = vectype;
499 else
500 {
501 /* The vectorization factor is determined by the smallest
502 scalar type (or the largest vector size, but we only
503 support one vector size per loop). */
504 if (!bool_result)
505 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
506 &dummy);
507 if (dump_enabled_p ())
508 {
509 dump_printf_loc (MSG_NOTE, vect_location,
510 "get vectype for scalar type: ");
511 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
512 dump_printf (MSG_NOTE, "\n");
513 }
514 vf_vectype = get_vectype_for_scalar_type (scalar_type);
515 }
516 if (!vf_vectype)
517 {
518 if (dump_enabled_p ())
519 {
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 }
526 return false;
527 }
528
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 {
532 if (dump_enabled_p ())
533 {
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 }
544 return false;
545 }
546
547 if (dump_enabled_p ())
548 {
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
552 }
553
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
560
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 {
563 pattern_def_seq = NULL;
564 gsi_next (&si);
565 }
566 }
567 }
568
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
574 {
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
579 }
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581
582 for (i = 0; i < mask_producers.length (); i++)
583 {
584 tree mask_type = NULL;
585
586 stmt = STMT_VINFO_STMT (mask_producers[i]);
587
588 if (gimple_code (stmt) == GIMPLE_ASSIGN
589 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
590 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
591 {
592 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
593 mask_type = get_mask_type_for_scalar_type (scalar_type);
594
595 if (!mask_type)
596 {
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
599 "not vectorized: unsupported mask\n");
600 return false;
601 }
602 }
603 else
604 {
605 tree rhs;
606 ssa_op_iter iter;
607 gimple *def_stmt;
608 enum vect_def_type dt;
609
610 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
611 {
612 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
613 &def_stmt, &dt, &vectype))
614 {
615 if (dump_enabled_p ())
616 {
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "not vectorized: can't compute mask type "
619 "for statement, ");
620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
621 0);
622 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
623 }
624 return false;
625 }
626
627 /* No vectype probably means external definition.
628 Allow it in case there is another operand which
629 allows us to determine the mask type. */
630 if (!vectype)
631 continue;
632
633 if (!mask_type)
634 mask_type = vectype;
635 else if (TYPE_VECTOR_SUBPARTS (mask_type)
636 != TYPE_VECTOR_SUBPARTS (vectype))
637 {
638 if (dump_enabled_p ())
639 {
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "not vectorized: different sized masks "
642 "types in statement, ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
644 mask_type);
645 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
647 vectype);
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 }
650 return false;
651 }
652 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
653 != VECTOR_BOOLEAN_TYPE_P (vectype))
654 {
655 if (dump_enabled_p ())
656 {
657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
658 "not vectorized: mixed mask and "
659 "nonmask vector types in statement, ");
660 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
661 mask_type);
662 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
663 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
664 vectype);
665 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
666 }
667 return false;
668 }
669 }
670
671 /* We may compare a boolean value loaded as a vector of integers.
672 Fix mask_type in such a case. */
673 if (mask_type
674 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
675 && gimple_code (stmt) == GIMPLE_ASSIGN
676 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
677 mask_type = build_same_sized_truth_vector_type (mask_type);
678 }
679
680 /* A missing mask_type should mean a loop-invariant predicate.
681 This is probably a subject for optimization in
682 if-conversion. */
683 if (!mask_type)
684 {
685 if (dump_enabled_p ())
686 {
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "not vectorized: can't compute mask type "
689 "for statement, ");
690 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
691 0);
692 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
693 }
694 return false;
695 }
696
697 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
698 }
699
700 return true;
701 }
702
703
704 /* Function vect_is_simple_iv_evolution.
705
706 FORNOW: A simple evolution of an induction variable in the loop is
707 considered a polynomial evolution. */
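/* Illustration (an editorial sketch; the chrec notation below is the one
   used by the scalar evolution analyzer, and the loop numbers are made up):

     for (i = 0; i < N; i++)          i has access function {0, +, 1}_1

   Its evolution part is the step 1, which is not itself a chrec, so the
   evolution is "simple".  A second-order induction such as

     for (i = 0, j = 0; i < N; i++)
       j += i;                        j has access function {0, +, {0, +, 1}_1}_1

   has a chrec as its evolution part and is rejected by the degree >= 2
   check below.  */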
708
709 static bool
710 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
711 tree * step)
712 {
713 tree init_expr;
714 tree step_expr;
715 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
716 basic_block bb;
717
718 /* When there is no evolution in this loop, the evolution function
719 is not "simple". */
720 if (evolution_part == NULL_TREE)
721 return false;
722
723 /* When the evolution is a polynomial of degree >= 2
724 the evolution function is not "simple". */
725 if (tree_is_chrec (evolution_part))
726 return false;
727
728 step_expr = evolution_part;
729 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
730
731 if (dump_enabled_p ())
732 {
733 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
734 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
735 dump_printf (MSG_NOTE, ", init: ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
737 dump_printf (MSG_NOTE, "\n");
738 }
739
740 *init = init_expr;
741 *step = step_expr;
742
743 if (TREE_CODE (step_expr) != INTEGER_CST
744 && (TREE_CODE (step_expr) != SSA_NAME
745 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
746 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
747 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
748 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
749 || !flag_associative_math)))
750 && (TREE_CODE (step_expr) != REAL_CST
751 || !flag_associative_math))
752 {
753 if (dump_enabled_p ())
754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
755 "step unknown.\n");
756 return false;
757 }
758
759 return true;
760 }
761
762 /* Function vect_analyze_scalar_cycles_1.
763
764 Examine the cross iteration def-use cycles of scalar variables
765 in LOOP. LOOP_VINFO represents the loop that is now being
766 considered for vectorization (can be LOOP, or an outer-loop
767 enclosing LOOP). */
768
769 static void
770 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
771 {
772 basic_block bb = loop->header;
773 tree init, step;
774 auto_vec<gimple *, 64> worklist;
775 gphi_iterator gsi;
776 bool double_reduc;
777
778 if (dump_enabled_p ())
779 dump_printf_loc (MSG_NOTE, vect_location,
780 "=== vect_analyze_scalar_cycles ===\n");
781
782 /* First - identify all inductions. Reduction detection assumes that all the
783 inductions have been identified; therefore, this order must not be
784 changed. */
785 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
786 {
787 gphi *phi = gsi.phi ();
788 tree access_fn = NULL;
789 tree def = PHI_RESULT (phi);
790 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
791
792 if (dump_enabled_p ())
793 {
794 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
795 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
796 dump_printf (MSG_NOTE, "\n");
797 }
798
799 /* Skip virtual phi's. The data dependences that are associated with
800 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
801 if (virtual_operand_p (def))
802 continue;
803
804 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
805
806 /* Analyze the evolution function. */
807 access_fn = analyze_scalar_evolution (loop, def);
808 if (access_fn)
809 {
810 STRIP_NOPS (access_fn);
811 if (dump_enabled_p ())
812 {
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "Access function of PHI: ");
815 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
816 dump_printf (MSG_NOTE, "\n");
817 }
818 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
819 = initial_condition_in_loop_num (access_fn, loop->num);
820 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
821 = evolution_part_in_loop_num (access_fn, loop->num);
822 }
823
824 if (!access_fn
825 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
826 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
827 && TREE_CODE (step) != INTEGER_CST))
828 {
829 worklist.safe_push (phi);
830 continue;
831 }
832
833 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
834 != NULL_TREE);
835 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
836
837 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
839 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
840 }
841
842
843 /* Second - identify all reductions and nested cycles. */
844 while (worklist.length () > 0)
845 {
846 gimple *phi = worklist.pop ();
847 tree def = PHI_RESULT (phi);
848 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
849 gimple *reduc_stmt;
850 bool nested_cycle;
851
852 if (dump_enabled_p ())
853 {
854 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
855 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
856 dump_printf (MSG_NOTE, "\n");
857 }
858
859 gcc_assert (!virtual_operand_p (def)
860 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
861
862 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
863 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
864 &double_reduc, false);
865 if (reduc_stmt)
866 {
867 if (double_reduc)
868 {
869 if (dump_enabled_p ())
870 dump_printf_loc (MSG_NOTE, vect_location,
871 "Detected double reduction.\n");
872
873 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
874 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
875 vect_double_reduction_def;
876 }
877 else
878 {
879 if (nested_cycle)
880 {
881 if (dump_enabled_p ())
882 dump_printf_loc (MSG_NOTE, vect_location,
883 "Detected vectorizable nested cycle.\n");
884
885 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
886 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
887 vect_nested_cycle;
888 }
889 else
890 {
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location,
893 "Detected reduction.\n");
894
895 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
896 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
897 vect_reduction_def;
898 /* Store the reduction cycles for possible vectorization in
899 loop-aware SLP. */
900 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
901 }
902 }
903 }
904 else
905 if (dump_enabled_p ())
906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
907 "Unknown def-use cycle pattern.\n");
908 }
909 }
910
911
912 /* Function vect_analyze_scalar_cycles.
913
914 Examine the cross iteration def-use cycles of scalar variables, by
915 analyzing the loop-header PHIs of scalar variables. Classify each
916 cycle as one of the following: invariant, induction, reduction, unknown.
917 We do that for the loop represented by LOOP_VINFO, and also for its
918 inner-loop, if it exists.
919 Examples for scalar cycles:
920
921 Example1: reduction:
922
923 loop1:
924 for (i=0; i<N; i++)
925 sum += a[i];
926
927 Example2: induction:
928
929 loop2:
930 for (i=0; i<N; i++)
931 a[i] = i; */
932
933 static void
934 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
935 {
936 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
937
938 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
939
940 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
941 Reductions in such an inner-loop therefore have different properties than
942 the reductions in the nest that gets vectorized:
943 1. When vectorized, they are executed in the same order as in the original
944 scalar loop, so we can't change the order of computation when
945 vectorizing them.
946 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
947 current checks are too strict. */
948
949 if (loop->inner)
950 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
951 }
952
953 /* Transfer group and reduction information from STMT to its pattern stmt. */
954
955 static void
956 vect_fixup_reduc_chain (gimple *stmt)
957 {
958 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
959 gimple *stmtp;
960 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
961 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
962 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
963 do
964 {
965 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
966 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
967 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
968 if (stmt)
969 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
970 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 }
972 while (stmt);
973 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
974 }
975
976 /* Fixup scalar cycles that now have their stmts detected as patterns. */
977
978 static void
979 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
980 {
981 gimple *first;
982 unsigned i;
983
984 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
985 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
986 {
987 vect_fixup_reduc_chain (first);
988 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
989 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
990 }
991 }
992
993 /* Function vect_get_loop_niters.
994
995 Determine the number of iterations the loop executes and place it
996 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
997 in NUMBER_OF_ITERATIONSM1.
998
999 Return the loop exit condition. */
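/* Worked example (editorial addition): for a loop that runs exactly N > 0
   times, e.g. for (i = 0; i < N; i++), the latch is executed N - 1 times,
   so NUMBER_OF_ITERATIONSM1 is N - 1 and NUMBER_OF_ITERATIONS is
   (N - 1) + 1 == N, the number of header executions.  */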
1000
1001
1002 static gcond *
1003 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1004 tree *number_of_iterationsm1)
1005 {
1006 tree niters;
1007
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_NOTE, vect_location,
1010 "=== get_loop_niters ===\n");
1011
1012 niters = number_of_latch_executions (loop);
1013 *number_of_iterationsm1 = niters;
1014
1015 /* We want the number of loop header executions which is the number
1016 of latch executions plus one.
1017 ??? For UINT_MAX latch executions this number overflows to zero
1018 for loops like do { n++; } while (n != 0); */
1019 if (niters && !chrec_contains_undetermined (niters))
1020 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1021 build_int_cst (TREE_TYPE (niters), 1));
1022 *number_of_iterations = niters;
1023
1024 return get_loop_exit_condition (loop);
1025 }
1026
1027
1028 /* Function bb_in_loop_p
1029
1030 Used as predicate for dfs order traversal of the loop bbs. */
1031
1032 static bool
1033 bb_in_loop_p (const_basic_block bb, const void *data)
1034 {
1035 const struct loop *const loop = (const struct loop *)data;
1036 if (flow_bb_inside_loop_p (loop, bb))
1037 return true;
1038 return false;
1039 }
1040
1041
1042 /* Function new_loop_vec_info.
1043
1044 Create and initialize a new loop_vec_info struct for LOOP, as well as
1045 stmt_vec_info structs for all the stmts in LOOP. */
1046
1047 static loop_vec_info
1048 new_loop_vec_info (struct loop *loop)
1049 {
1050 loop_vec_info res;
1051 basic_block *bbs;
1052 gimple_stmt_iterator si;
1053 unsigned int i, nbbs;
1054
1055 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1056 res->kind = vec_info::loop;
1057 LOOP_VINFO_LOOP (res) = loop;
1058
1059 bbs = get_loop_body (loop);
1060
1061 /* Create/Update stmt_info for all stmts in the loop. */
1062 for (i = 0; i < loop->num_nodes; i++)
1063 {
1064 basic_block bb = bbs[i];
1065
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1067 {
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1071 }
1072
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1074 {
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1078 }
1079 }
1080
1081 /* CHECKME: We want to visit all BBs before their successors (except for
1082 latch blocks, for which this assertion wouldn't hold). In the simple
1083 case of the loop forms we allow, a dfs order of the BBs would be the same
1084 as reversed postorder traversal, so we are safe. */
1085
1086 free (bbs);
1087 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1088 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1089 bbs, loop->num_nodes, loop);
1090 gcc_assert (nbbs == loop->num_nodes);
1091
1092 LOOP_VINFO_BBS (res) = bbs;
1093 LOOP_VINFO_NITERSM1 (res) = NULL;
1094 LOOP_VINFO_NITERS (res) = NULL;
1095 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1096 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1097 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1098 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1099 LOOP_VINFO_VECT_FACTOR (res) = 0;
1100 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1101 LOOP_VINFO_DATAREFS (res) = vNULL;
1102 LOOP_VINFO_DDRS (res) = vNULL;
1103 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1104 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1105 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1106 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1107 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1108 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1109 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1110 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1111 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1112 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1113 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1114 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1115
1116 return res;
1117 }
1118
1119
1120 /* Function destroy_loop_vec_info.
1121
1122 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1123 stmts in the loop. */
1124
1125 void
1126 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1127 {
1128 struct loop *loop;
1129 basic_block *bbs;
1130 int nbbs;
1131 gimple_stmt_iterator si;
1132 int j;
1133 vec<slp_instance> slp_instances;
1134 slp_instance instance;
1135 bool swapped;
1136
1137 if (!loop_vinfo)
1138 return;
1139
1140 loop = LOOP_VINFO_LOOP (loop_vinfo);
1141
1142 bbs = LOOP_VINFO_BBS (loop_vinfo);
1143 nbbs = clean_stmts ? loop->num_nodes : 0;
1144 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1145
1146 for (j = 0; j < nbbs; j++)
1147 {
1148 basic_block bb = bbs[j];
1149 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1150 free_stmt_vec_info (gsi_stmt (si));
1151
1152 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1153 {
1154 gimple *stmt = gsi_stmt (si);
1155
1156 /* We may have broken canonical form by moving a constant
1157 into RHS1 of a commutative op. Fix such occurrences. */
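/* Editorial illustration: canonical GIMPLE keeps the constant operand
   last, so an assignment the vectorizer left as x_1 = 3 + y_2 is
   restored to x_1 = y_2 + 3 by the swap below.  */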
1158 if (swapped && is_gimple_assign (stmt))
1159 {
1160 enum tree_code code = gimple_assign_rhs_code (stmt);
1161
1162 if ((code == PLUS_EXPR
1163 || code == POINTER_PLUS_EXPR
1164 || code == MULT_EXPR)
1165 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1166 swap_ssa_operands (stmt,
1167 gimple_assign_rhs1_ptr (stmt),
1168 gimple_assign_rhs2_ptr (stmt));
1169 }
1170
1171 /* Free stmt_vec_info. */
1172 free_stmt_vec_info (stmt);
1173 gsi_next (&si);
1174 }
1175 }
1176
1177 free (LOOP_VINFO_BBS (loop_vinfo));
1178 vect_destroy_datarefs (loop_vinfo);
1179 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1180 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1181 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1182 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1183 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1184 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1185 vect_free_slp_instance (instance);
1186
1187 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1188 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1189 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1190 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1191
1192 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1193 loop_vinfo->scalar_cost_vec.release ();
1194
1195 free (loop_vinfo);
1196 loop->aux = NULL;
1197 }
1198
1199
1200 /* Calculate the cost of one scalar iteration of the loop. */
1201 static void
1202 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1203 {
1204 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1205 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1206 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1207 int innerloop_iters, i;
1208
1209 /* Count statements in scalar loop. Using this as scalar cost for a single
1210 iteration for now.
1211
1212 TODO: Add outer loop support.
1213
1214 TODO: Consider assigning different costs to different scalar
1215 statements. */
1216
1217 /* FORNOW. */
1218 innerloop_iters = 1;
1219 if (loop->inner)
1220 innerloop_iters = 50; /* FIXME */
1221
1222 for (i = 0; i < nbbs; i++)
1223 {
1224 gimple_stmt_iterator si;
1225 basic_block bb = bbs[i];
1226
1227 if (bb->loop_father == loop->inner)
1228 factor = innerloop_iters;
1229 else
1230 factor = 1;
1231
1232 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1233 {
1234 gimple *stmt = gsi_stmt (si);
1235 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1236
1237 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1238 continue;
1239
1240 /* Skip stmts that are not vectorized inside the loop. */
1241 if (stmt_info
1242 && !STMT_VINFO_RELEVANT_P (stmt_info)
1243 && (!STMT_VINFO_LIVE_P (stmt_info)
1244 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1245 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1246 continue;
1247
1248 vect_cost_for_stmt kind;
1249 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1250 {
1251 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1252 kind = scalar_load;
1253 else
1254 kind = scalar_store;
1255 }
1256 else
1257 kind = scalar_stmt;
1258
1259 scalar_single_iter_cost
1260 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1261 factor, kind, NULL, 0, vect_prologue);
1262 }
1263 }
1264 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1265 = scalar_single_iter_cost;
1266 }
1267
1268
1269 /* Function vect_analyze_loop_form_1.
1270
1271 Verify that certain CFG restrictions hold, including:
1272 - the loop has a pre-header
1273 - the loop has a single entry and exit
1274 - the loop exit condition is simple enough, and the number of iterations
1275 can be analyzed (a countable loop). */
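/* Editorial illustration of the last restriction: a countable loop such as

     while (p != q) { ...; p++; }

   has an iteration count the niter analysis can compute (q - p elements),
   whereas

     while (*p) p++;

   does not, and loops of the latter kind are rejected below.  */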
1276
1277 bool
1278 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1279 tree *number_of_iterationsm1,
1280 tree *number_of_iterations, gcond **inner_loop_cond)
1281 {
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_NOTE, vect_location,
1284 "=== vect_analyze_loop_form ===\n");
1285
1286 /* Different restrictions apply when we are considering an inner-most loop,
1287 vs. an outer (nested) loop.
1288 (FORNOW. May want to relax some of these restrictions in the future). */
1289
1290 if (!loop->inner)
1291 {
1292 /* Inner-most loop. We currently require that the number of BBs is
1293 exactly 2 (the header and latch). Vectorizable inner-most loops
1294 look like this:
1295
1296 (pre-header)
1297 |
1298 header <--------+
1299 | | |
1300 | +--> latch --+
1301 |
1302 (exit-bb) */
1303
1304 if (loop->num_nodes != 2)
1305 {
1306 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308 "not vectorized: control flow in loop.\n");
1309 return false;
1310 }
1311
1312 if (empty_block_p (loop->header))
1313 {
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1316 "not vectorized: empty loop.\n");
1317 return false;
1318 }
1319 }
1320 else
1321 {
1322 struct loop *innerloop = loop->inner;
1323 edge entryedge;
1324
1325 /* Nested loop. We currently require that the loop is doubly-nested,
1326 contains a single inner loop, and the number of BBs is exactly 5.
1327 Vectorizable outer-loops look like this:
1328
1329 (pre-header)
1330 |
1331 header <---+
1332 | |
1333 inner-loop |
1334 | |
1335 tail ------+
1336 |
1337 (exit-bb)
1338
1339 The inner-loop has the properties expected of inner-most loops
1340 as described above. */
1341
1342 if ((loop->inner)->inner || (loop->inner)->next)
1343 {
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "not vectorized: multiple nested loops.\n");
1347 return false;
1348 }
1349
1350 if (loop->num_nodes != 5)
1351 {
1352 if (dump_enabled_p ())
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "not vectorized: control flow in loop.\n");
1355 return false;
1356 }
1357
1358 entryedge = loop_preheader_edge (innerloop);
1359 if (entryedge->src != loop->header
1360 || !single_exit (innerloop)
1361 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1362 {
1363 if (dump_enabled_p ())
1364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365 "not vectorized: unsupported outerloop form.\n");
1366 return false;
1367 }
1368
1369 /* Analyze the inner-loop. */
1370 tree inner_niterm1, inner_niter;
1371 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1372 &inner_niterm1, &inner_niter, NULL))
1373 {
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376 "not vectorized: Bad inner loop.\n");
1377 return false;
1378 }
1379
1380 if (!expr_invariant_in_loop_p (loop, inner_niter))
1381 {
1382 if (dump_enabled_p ())
1383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1384 "not vectorized: inner-loop count not"
1385 " invariant.\n");
1386 return false;
1387 }
1388
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_NOTE, vect_location,
1391 "Considering outer-loop vectorization.\n");
1392 }
1393
1394 if (!single_exit (loop)
1395 || EDGE_COUNT (loop->header->preds) != 2)
1396 {
1397 if (dump_enabled_p ())
1398 {
1399 if (!single_exit (loop))
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "not vectorized: multiple exits.\n");
1402 else if (EDGE_COUNT (loop->header->preds) != 2)
1403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1404 "not vectorized: too many incoming edges.\n");
1405 }
1406 return false;
1407 }
1408
1409 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1410 that the loop is represented as a do-while (with a proper if-guard
1411 before the loop if needed), where the loop header contains all the
1412 executable statements, and the latch is empty. */
1413 if (!empty_block_p (loop->latch)
1414 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1415 {
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1418 "not vectorized: latch block not empty.\n");
1419 return false;
1420 }
1421
1422 /* Make sure there exists a single-predecessor exit bb: */
1423 if (!single_pred_p (single_exit (loop)->dest))
1424 {
1425 edge e = single_exit (loop);
1426 if (!(e->flags & EDGE_ABNORMAL))
1427 {
1428 split_loop_exit_edge (e);
1429 if (dump_enabled_p ())
1430 dump_printf (MSG_NOTE, "split exit edge.\n");
1431 }
1432 else
1433 {
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "not vectorized: abnormal loop exit edge.\n");
1437 return false;
1438 }
1439 }
1440
1441 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1442 number_of_iterationsm1);
1443 if (!*loop_cond)
1444 {
1445 if (dump_enabled_p ())
1446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1447 "not vectorized: complicated exit condition.\n");
1448 return false;
1449 }
1450
1451 if (!*number_of_iterations
1452 || chrec_contains_undetermined (*number_of_iterations))
1453 {
1454 if (dump_enabled_p ())
1455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1456 "not vectorized: number of iterations cannot be "
1457 "computed.\n");
1458 return false;
1459 }
1460
1461 if (integer_zerop (*number_of_iterations))
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "not vectorized: number of iterations = 0.\n");
1466 return false;
1467 }
1468
1469 return true;
1470 }
1471
1472 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1473
1474 loop_vec_info
1475 vect_analyze_loop_form (struct loop *loop)
1476 {
1477 tree number_of_iterations, number_of_iterationsm1;
1478 gcond *loop_cond, *inner_loop_cond = NULL;
1479
1480 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1481 &number_of_iterations, &inner_loop_cond))
1482 return NULL;
1483
1484 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1485 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1486 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1487 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1488
1489 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1490 {
1491 if (dump_enabled_p ())
1492 {
1493 dump_printf_loc (MSG_NOTE, vect_location,
1494 "Symbolic number of iterations is ");
1495 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1496 dump_printf (MSG_NOTE, "\n");
1497 }
1498 }
1499
1500 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1501 if (inner_loop_cond)
1502 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1503 = loop_exit_ctrl_vec_info_type;
1504
1505 gcc_assert (!loop->aux);
1506 loop->aux = loop_vinfo;
1507 return loop_vinfo;
1508 }
1509
1510
1511
1512 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1513 statements, update the vectorization factor. */
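/* Worked example (editorial addition): if the loop-based analysis chose
   VF = 4 and the SLP instances need an unrolling factor of 2, the hybrid
   case below sets VF = least_common_multiple (4, 2) = 4; if they need 8,
   VF becomes 8.  With pure SLP the SLP unrolling factor is used directly.  */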
1514
1515 static void
1516 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1517 {
1518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1519 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1520 int nbbs = loop->num_nodes;
1521 unsigned int vectorization_factor;
1522 int i;
1523
1524 if (dump_enabled_p ())
1525 dump_printf_loc (MSG_NOTE, vect_location,
1526 "=== vect_update_vf_for_slp ===\n");
1527
1528 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1529 gcc_assert (vectorization_factor != 0);
1530
1531 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1532 vectorization factor of the loop is the unrolling factor required by
1533 the SLP instances. If that unrolling factor is 1, we say that we
1534 perform pure SLP on the loop; cross-iteration parallelism is not
1535 exploited. */
1536 bool only_slp_in_loop = true;
1537 for (i = 0; i < nbbs; i++)
1538 {
1539 basic_block bb = bbs[i];
1540 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1541 gsi_next (&si))
1542 {
1543 gimple *stmt = gsi_stmt (si);
1544 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1545 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1546 && STMT_VINFO_RELATED_STMT (stmt_info))
1547 {
1548 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1549 stmt_info = vinfo_for_stmt (stmt);
1550 }
1551 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1552 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1553 && !PURE_SLP_STMT (stmt_info))
1554 /* STMT needs both SLP and loop-based vectorization. */
1555 only_slp_in_loop = false;
1556 }
1557 }
1558
1559 if (only_slp_in_loop)
1560 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1561 else
1562 vectorization_factor
1563 = least_common_multiple (vectorization_factor,
1564 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1565
1566 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_NOTE, vect_location,
1569 "Updating vectorization factor to %d\n",
1570 vectorization_factor);
1571 }
1572
1573 /* Function vect_analyze_loop_operations.
1574
1575 Scan the loop stmts and make sure they are all vectorizable. */
1576
1577 static bool
1578 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1579 {
1580 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1581 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1582 int nbbs = loop->num_nodes;
1583 int i;
1584 stmt_vec_info stmt_info;
1585 bool need_to_vectorize = false;
1586 bool ok;
1587
1588 if (dump_enabled_p ())
1589 dump_printf_loc (MSG_NOTE, vect_location,
1590 "=== vect_analyze_loop_operations ===\n");
1591
1592 for (i = 0; i < nbbs; i++)
1593 {
1594 basic_block bb = bbs[i];
1595
1596 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1597 gsi_next (&si))
1598 {
1599 gphi *phi = si.phi ();
1600 ok = true;
1601
1602 stmt_info = vinfo_for_stmt (phi);
1603 if (dump_enabled_p ())
1604 {
1605 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1606 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1607 dump_printf (MSG_NOTE, "\n");
1608 }
1609 if (virtual_operand_p (gimple_phi_result (phi)))
1610 continue;
1611
1612 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1613 (i.e., a phi in the tail of the outer-loop). */
1614 if (! is_loop_header_bb_p (bb))
1615 {
1616 /* FORNOW: we currently don't support the case that these phis
1617 are not used in the outer loop (unless it is a double reduction,
1618 i.e., this phi is vect_reduction_def), because this case
1619 would require us to actually do something here. */
1620 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1621 || STMT_VINFO_LIVE_P (stmt_info))
1622 && STMT_VINFO_DEF_TYPE (stmt_info)
1623 != vect_double_reduction_def)
1624 {
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1627 "Unsupported loop-closed phi in "
1628 "outer-loop.\n");
1629 return false;
1630 }
1631
1632 /* If PHI is used in the outer loop, we check that its operand
1633 is defined in the inner loop. */
1634 if (STMT_VINFO_RELEVANT_P (stmt_info))
1635 {
1636 tree phi_op;
1637 gimple *op_def_stmt;
1638
1639 if (gimple_phi_num_args (phi) != 1)
1640 return false;
1641
1642 phi_op = PHI_ARG_DEF (phi, 0);
1643 if (TREE_CODE (phi_op) != SSA_NAME)
1644 return false;
1645
1646 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1647 if (gimple_nop_p (op_def_stmt)
1648 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1649 || !vinfo_for_stmt (op_def_stmt))
1650 return false;
1651
1652 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1653 != vect_used_in_outer
1654 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1655 != vect_used_in_outer_by_reduction)
1656 return false;
1657 }
1658
1659 continue;
1660 }
1661
1662 gcc_assert (stmt_info);
1663
1664 if (STMT_VINFO_LIVE_P (stmt_info))
1665 {
1666 /* FORNOW: not yet supported. */
1667 if (dump_enabled_p ())
1668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1669 "not vectorized: value used after loop.\n");
1670 return false;
1671 }
1672
1673 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1674 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1675 {
1676 /* A scalar-dependence cycle that we don't support. */
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "not vectorized: scalar dependence cycle.\n");
1680 return false;
1681 }
1682
1683 if (STMT_VINFO_RELEVANT_P (stmt_info))
1684 {
1685 need_to_vectorize = true;
1686 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1687 ok = vectorizable_induction (phi, NULL, NULL);
1688 }
1689
1690 if (!ok)
1691 {
1692 if (dump_enabled_p ())
1693 {
1694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1695 "not vectorized: relevant phi not "
1696 "supported: ");
1697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1698 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1699 }
1700 return false;
1701 }
1702 }
1703
1704 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1705 gsi_next (&si))
1706 {
1707 gimple *stmt = gsi_stmt (si);
1708 if (!gimple_clobber_p (stmt)
1709 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1710 return false;
1711 }
1712 } /* bbs */
1713
1714 /* All operations in the loop are either irrelevant (deal with loop
1715 control, or dead), or only used outside the loop and can be moved
1716 out of the loop (e.g. invariants, inductions). The loop can be
1717 optimized away by scalar optimizations. We're better off not
1718 touching this loop. */
1719 if (!need_to_vectorize)
1720 {
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "All the computation can be taken out of the loop.\n");
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1726 "not vectorized: redundant loop. no profit to "
1727 "vectorize.\n");
1728 return false;
1729 }
1730
1731 return true;
1732 }
1733
1734
1735 /* Function vect_analyze_loop_2.
1736
1737 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1738 for it. The different analyses will record information in the
1739 loop_vec_info struct. */
1740 static bool
1741 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1742 {
1743 bool ok;
1744 int max_vf = MAX_VECTORIZATION_FACTOR;
1745 int min_vf = 2;
1746 unsigned int n_stmts = 0;
1747
1748 /* The first group of checks is independent of the vector size. */
1749 fatal = true;
1750
1751 /* Find all data references in the loop (which correspond to vdefs/vuses)
1752 and analyze their evolution in the loop. */
1753
1754 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1755
1756 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1757 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1758 {
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "not vectorized: loop contains function calls"
1762 " or data references that cannot be analyzed\n");
1763 return false;
1764 }
1765
1766 for (unsigned i = 0; i < loop->num_nodes; i++)
1767 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1768 !gsi_end_p (gsi); gsi_next (&gsi))
1769 {
1770 gimple *stmt = gsi_stmt (gsi);
1771 if (is_gimple_debug (stmt))
1772 continue;
1773 ++n_stmts;
1774 if (!find_data_references_in_stmt (loop, stmt,
1775 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1776 {
1777 if (is_gimple_call (stmt) && loop->safelen)
1778 {
1779 tree fndecl = gimple_call_fndecl (stmt), op;
1780 if (fndecl != NULL_TREE)
1781 {
1782 cgraph_node *node = cgraph_node::get (fndecl);
1783 if (node != NULL && node->simd_clones != NULL)
1784 {
1785 unsigned int j, n = gimple_call_num_args (stmt);
1786 for (j = 0; j < n; j++)
1787 {
1788 op = gimple_call_arg (stmt, j);
1789 if (DECL_P (op)
1790 || (REFERENCE_CLASS_P (op)
1791 && get_base_address (op)))
1792 break;
1793 }
1794 op = gimple_call_lhs (stmt);
1795 /* Ignore #pragma omp declare simd functions
1796 if they don't have data references in the
1797 call stmt itself. */
1798 if (j == n
1799 && !(op
1800 && (DECL_P (op)
1801 || (REFERENCE_CLASS_P (op)
1802 && get_base_address (op)))))
1803 continue;
1804 }
1805 }
1806 }
1807 if (dump_enabled_p ())
1808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1809 "not vectorized: loop contains function "
1810 "calls or data references that cannot "
1811 "be analyzed\n");
1812 return false;
1813 }
1814 }
1815
1816 /* Analyze the data references and also adjust the minimal
1817 vectorization factor according to the loads and stores. */
1818
1819 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1820 if (!ok)
1821 {
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "bad data references.\n");
1825 return false;
1826 }
1827
1828 /* Classify all cross-iteration scalar data-flow cycles.
1829 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1830 vect_analyze_scalar_cycles (loop_vinfo);
1831
1832 vect_pattern_recog (loop_vinfo);
1833
1834 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1835
1836 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1837 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1838
1839 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1840 if (!ok)
1841 {
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data access.\n");
1845 return false;
1846 }
1847
1848 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1849
1850 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1851 if (!ok)
1852 {
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855 "unexpected pattern.\n");
1856 return false;
1857 }
1858
1859 /* The rest of the analysis below depends on the vector size in some way;
failures from here on are not fatal, since a different vector size may succeed. */
1860 fatal = false;
1861
1862 /* Analyze data dependences between the data-refs in the loop
1863 and adjust the maximum vectorization factor according to
1864 the dependences.
1865 FORNOW: fail at the first data dependence that we encounter. */
1866
1867 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1868 if (!ok
1869 || max_vf < min_vf)
1870 {
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1873 "bad data dependence.\n");
1874 return false;
1875 }
1876
1877 ok = vect_determine_vectorization_factor (loop_vinfo);
1878 if (!ok)
1879 {
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1882 "can't determine vectorization factor.\n");
1883 return false;
1884 }
1885 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1886 {
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1889 "bad data dependence.\n");
1890 return false;
1891 }
1892
1893 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1894 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1895 if (!ok)
1896 return false;
1897
1898 /* If there are any SLP instances mark them as pure_slp. */
1899 bool slp = vect_make_slp_decision (loop_vinfo);
1900 if (slp)
1901 {
1902 /* Find stmts that need to be both vectorized and SLPed. */
1903 vect_detect_hybrid_slp (loop_vinfo);
1904
1905 /* Update the vectorization factor based on the SLP decision. */
1906 vect_update_vf_for_slp (loop_vinfo);
1907 }
1908
1909 /* Now the vectorization factor is final. */
1910 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1911 gcc_assert (vectorization_factor != 0);
1912
1913 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "vectorization_factor = %d, niters = "
1916 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1917 LOOP_VINFO_INT_NITERS (loop_vinfo));
1918
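/* For example, with a vectorization factor of 8, a loop known (or bounded)
   to run at most 5 iterations can never complete a single vector iteration,
   so there is nothing to vectorize. */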
1919 HOST_WIDE_INT max_niter
1920 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1921 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1922 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1923 || (max_niter != -1
1924 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1925 {
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928 "not vectorized: iteration count too small.\n");
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "not vectorized: iteration count smaller than "
1932 "vectorization factor.\n");
1933 return false;
1934 }
1935
1936 /* Analyze the alignment of the data-refs in the loop.
1937 Fail if a data reference is found that cannot be vectorized. */
1938
1939 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1940 if (!ok)
1941 {
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944 "bad data alignment.\n");
1945 return false;
1946 }
1947
1948 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1949 It is important to call pruning after vect_analyze_data_ref_accesses,
1950 since we use grouping information gathered by interleaving analysis. */
1951 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1952 if (!ok)
1953 {
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956 "number of versioning for alias "
1957 "run-time tests exceeds %d "
1958 "(--param vect-max-version-for-alias-checks)\n",
1959 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1960 return false;
1961 }
1962
1963 /* Compute the scalar iteration cost. */
1964 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1965
1966 /* This pass will decide on using loop versioning and/or loop peeling in
1967 order to enhance the alignment of data references in the loop. */
1968
1969 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1970 if (!ok)
1971 {
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad data alignment.\n");
1975 return false;
1976 }
1977
1978 if (slp)
1979 {
1980 /* Analyze operations in the SLP instances. Note this may
1981 remove unsupported SLP instances which makes the above
1982 SLP kind detection invalid. */
1983 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1984 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1985 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1986 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1987 return false;
1988 }
1989
1990 /* Scan all the remaining operations in the loop that are not subject
1991 to SLP and make sure they are vectorizable. */
1992 ok = vect_analyze_loop_operations (loop_vinfo);
1993 if (!ok)
1994 {
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1997 "bad operation or unsupported loop bound.\n");
1998 return false;
1999 }
2000
2001 /* Analyze cost. Decide if worth while to vectorize. */
2002 int min_profitable_estimate, min_profitable_iters;
2003 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2004 &min_profitable_estimate);
2005
2006 if (min_profitable_iters < 0)
2007 {
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2010 "not vectorized: vectorization not profitable.\n");
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "not vectorized: vector version will never be "
2014 "profitable.\n");
2015 return false;
2016 }
2017
2018 int min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2019 * vectorization_factor) - 1);
2020
2021 /* Use the cost model only if it is more conservative than user specified
2022 threshold. */
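/* For instance, assuming --param min-vect-loop-bound=2 and a vectorization
   factor of 4, min_scalar_loop_bound is 2*4 - 1 = 7; if the cost model asks
   for at least 10 profitable iterations the threshold becomes 10, while a
   cost-model answer of 5 would leave it at 7. */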
2023 unsigned th = (unsigned) min_scalar_loop_bound;
2024 if (min_profitable_iters
2025 && (!min_scalar_loop_bound
2026 || min_profitable_iters > min_scalar_loop_bound))
2027 th = (unsigned) min_profitable_iters;
2028
2029 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2030
2031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2032 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2033 {
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "not vectorized: vectorization not profitable.\n");
2037 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "not vectorized: iteration count smaller than user "
2040 "specified loop bound parameter or minimum profitable "
2041 "iterations (whichever is more conservative).\n");
2042 return false;
2043 }
2044
2045 HOST_WIDE_INT estimated_niter
2046 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2047 if (estimated_niter != -1
2048 && ((unsigned HOST_WIDE_INT) estimated_niter
2049 <= MAX (th, (unsigned)min_profitable_estimate)))
2050 {
2051 if (dump_enabled_p ())
2052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2053 "not vectorized: estimated iteration count too "
2054 "small.\n");
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "not vectorized: estimated iteration count smaller "
2058 "than specified loop bound parameter or minimum "
2059 "profitable iterations (whichever is more "
2060 "conservative).\n");
2061 return false;
2062 }
2063
2064 /* Decide whether we need to create an epilogue loop to handle
2065 remaining scalar iterations. */
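/* For example, with a vectorization factor of 4, 10 known iterations and no
   peeling for alignment, two scalar iterations are left over after the two
   vector iterations, so an epilogue loop is needed; with 12 iterations it
   would not be. */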
2066 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2067 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2068 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2069
2070 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2071 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2072 {
2073 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2074 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2075 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2076 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2077 }
2078 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2079 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2080 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2081 /* In case of versioning, check if the maximum number of
2082 iterations is greater than th. If they are identical,
2083 the epilogue is unnecessary. */
2084 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2085 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2086 || (unsigned HOST_WIDE_INT) max_niter > th)))
2087 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2088
2089 /* If an epilogue loop is required make sure we can create one. */
2090 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2091 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2092 {
2093 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2095 if (!vect_can_advance_ivs_p (loop_vinfo)
2096 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2097 single_exit (LOOP_VINFO_LOOP
2098 (loop_vinfo))))
2099 {
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102 "not vectorized: can't create required "
2103 "epilog loop\n");
2104 return false;
2105 }
2106 }
2107
2108 gcc_assert (vectorization_factor
2109 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2110
2111 return true;
2112 }
2113
2114 /* Function vect_analyze_loop.
2115
2116 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2117 for it. The different analyses will record information in the
2118 loop_vec_info struct. */
2119 loop_vec_info
2120 vect_analyze_loop (struct loop *loop)
2121 {
2122 loop_vec_info loop_vinfo;
2123 unsigned int vector_sizes;
2124
2125 /* Autodetect first vector size we try. */
2126 current_vector_size = 0;
2127 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2128
2129 if (dump_enabled_p ())
2130 dump_printf_loc (MSG_NOTE, vect_location,
2131 "===== analyze_loop_nest =====\n");
2132
2133 if (loop_outer (loop)
2134 && loop_vec_info_for_loop (loop_outer (loop))
2135 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2136 {
2137 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "outer-loop already vectorized.\n");
2140 return NULL;
2141 }
2142
2143 while (1)
2144 {
2145 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2146 loop_vinfo = vect_analyze_loop_form (loop);
2147 if (!loop_vinfo)
2148 {
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "bad loop form.\n");
2152 return NULL;
2153 }
2154
2155 bool fatal = false;
2156 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2157 {
2158 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2159
2160 return loop_vinfo;
2161 }
2162
2163 destroy_loop_vec_info (loop_vinfo, true);
2164
2165 vector_sizes &= ~current_vector_size;
2166 if (fatal
2167 || vector_sizes == 0
2168 || current_vector_size == 0)
2169 return NULL;
2170
2171 /* Try the largest remaining vector size. */
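/* For example, if the target advertises 32-byte and 16-byte vectors and the
   32-byte attempt failed, the next attempt uses 16-byte vectors. */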
2172 current_vector_size = 1 << floor_log2 (vector_sizes);
2173 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_NOTE, vect_location,
2175 "***** Re-trying analysis with "
2176 "vector size %d\n", current_vector_size);
2177 }
2178 }
2179
2180
2181 /* Function reduction_code_for_scalar_code
2182
2183 Input:
2184 CODE - tree_code of a reduction operations.
2185
2186 Output:
2187 REDUC_CODE - the corresponding tree-code to be used to reduce the
2188 vector of partial results into a single scalar result, or ERROR_MARK
2189 if the operation is a supported reduction operation, but does not have
2190 such a tree-code.
2191
2192 Return FALSE if CODE currently cannot be vectorized as reduction. */
2193
2194 static bool
2195 reduction_code_for_scalar_code (enum tree_code code,
2196 enum tree_code *reduc_code)
2197 {
2198 switch (code)
2199 {
2200 case MAX_EXPR:
2201 *reduc_code = REDUC_MAX_EXPR;
2202 return true;
2203
2204 case MIN_EXPR:
2205 *reduc_code = REDUC_MIN_EXPR;
2206 return true;
2207
2208 case PLUS_EXPR:
2209 *reduc_code = REDUC_PLUS_EXPR;
2210 return true;
2211
2212 case MULT_EXPR:
2213 case MINUS_EXPR:
2214 case BIT_IOR_EXPR:
2215 case BIT_XOR_EXPR:
2216 case BIT_AND_EXPR:
2217 *reduc_code = ERROR_MARK;
2218 return true;
2219
2220 default:
2221 return false;
2222 }
2223 }
2224
2225
2226 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2227 STMT is printed with a message MSG. */
2228
2229 static void
2230 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2231 {
2232 dump_printf_loc (msg_type, vect_location, "%s", msg);
2233 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2234 dump_printf (msg_type, "\n");
2235 }
2236
2237
2238 /* Detect SLP reduction of the form:
2239
2240 #a1 = phi <a5, a0>
2241 a2 = operation (a1)
2242 a3 = operation (a2)
2243 a4 = operation (a3)
2244 a5 = operation (a4)
2245
2246 #a = phi <a5>
2247
2248 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2249 FIRST_STMT is the first reduction stmt in the chain
2250 (a2 = operation (a1)).
2251
2252 Return TRUE if a reduction chain was detected. */
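/* Such a chain typically comes from a manually unrolled reduction, e.g.

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   where gimplification turns the four additions feeding S into a chain of
   the form shown above. */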
2253
2254 static bool
2255 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2256 gimple *first_stmt)
2257 {
2258 struct loop *loop = (gimple_bb (phi))->loop_father;
2259 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2260 enum tree_code code;
2261 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2262 stmt_vec_info use_stmt_info, current_stmt_info;
2263 tree lhs;
2264 imm_use_iterator imm_iter;
2265 use_operand_p use_p;
2266 int nloop_uses, size = 0, n_out_of_loop_uses;
2267 bool found = false;
2268
2269 if (loop != vect_loop)
2270 return false;
2271
2272 lhs = PHI_RESULT (phi);
2273 code = gimple_assign_rhs_code (first_stmt);
2274 while (1)
2275 {
2276 nloop_uses = 0;
2277 n_out_of_loop_uses = 0;
2278 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2279 {
2280 gimple *use_stmt = USE_STMT (use_p);
2281 if (is_gimple_debug (use_stmt))
2282 continue;
2283
2284 /* Check if we got back to the reduction phi. */
2285 if (use_stmt == phi)
2286 {
2287 loop_use_stmt = use_stmt;
2288 found = true;
2289 break;
2290 }
2291
2292 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2293 {
2294 loop_use_stmt = use_stmt;
2295 nloop_uses++;
2296 }
2297 else
2298 n_out_of_loop_uses++;
2299
2300 /* There can be either a single use in the loop or two uses in
2301 phi nodes. */
2302 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2303 return false;
2304 }
2305
2306 if (found)
2307 break;
2308
2309 /* We reached a statement with no loop uses. */
2310 if (nloop_uses == 0)
2311 return false;
2312
2313 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2314 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2315 return false;
2316
2317 if (!is_gimple_assign (loop_use_stmt)
2318 || code != gimple_assign_rhs_code (loop_use_stmt)
2319 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2320 return false;
2321
2322 /* Insert USE_STMT into reduction chain. */
2323 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2324 if (current_stmt)
2325 {
2326 current_stmt_info = vinfo_for_stmt (current_stmt);
2327 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2328 GROUP_FIRST_ELEMENT (use_stmt_info)
2329 = GROUP_FIRST_ELEMENT (current_stmt_info);
2330 }
2331 else
2332 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2333
2334 lhs = gimple_assign_lhs (loop_use_stmt);
2335 current_stmt = loop_use_stmt;
2336 size++;
2337 }
2338
2339 if (!found || loop_use_stmt != phi || size < 2)
2340 return false;
2341
2342 /* Swap the operands, if needed, to make the reduction operand be the second
2343 operand. */
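/* E.g., a chain statement of the form a3 = a2 + b, where a2 carries the
   reduction value, is rewritten as a3 = b + a2. */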
2344 lhs = PHI_RESULT (phi);
2345 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2346 while (next_stmt)
2347 {
2348 if (gimple_assign_rhs2 (next_stmt) == lhs)
2349 {
2350 tree op = gimple_assign_rhs1 (next_stmt);
2351 gimple *def_stmt = NULL;
2352
2353 if (TREE_CODE (op) == SSA_NAME)
2354 def_stmt = SSA_NAME_DEF_STMT (op);
2355
2356 /* Check that the other def is either defined in the loop
2357 ("vect_internal_def"), or it's an induction (defined by a
2358 loop-header phi-node). */
2359 if (def_stmt
2360 && gimple_bb (def_stmt)
2361 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2362 && (is_gimple_assign (def_stmt)
2363 || is_gimple_call (def_stmt)
2364 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2365 == vect_induction_def
2366 || (gimple_code (def_stmt) == GIMPLE_PHI
2367 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2368 == vect_internal_def
2369 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2370 {
2371 lhs = gimple_assign_lhs (next_stmt);
2372 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2373 continue;
2374 }
2375
2376 return false;
2377 }
2378 else
2379 {
2380 tree op = gimple_assign_rhs2 (next_stmt);
2381 gimple *def_stmt = NULL;
2382
2383 if (TREE_CODE (op) == SSA_NAME)
2384 def_stmt = SSA_NAME_DEF_STMT (op);
2385
2386 /* Check that the other def is either defined in the loop
2387 ("vect_internal_def"), or it's an induction (defined by a
2388 loop-header phi-node). */
2389 if (def_stmt
2390 && gimple_bb (def_stmt)
2391 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2392 && (is_gimple_assign (def_stmt)
2393 || is_gimple_call (def_stmt)
2394 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2395 == vect_induction_def
2396 || (gimple_code (def_stmt) == GIMPLE_PHI
2397 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2398 == vect_internal_def
2399 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2400 {
2401 if (dump_enabled_p ())
2402 {
2403 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2404 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2405 dump_printf (MSG_NOTE, "\n");
2406 }
2407
2408 swap_ssa_operands (next_stmt,
2409 gimple_assign_rhs1_ptr (next_stmt),
2410 gimple_assign_rhs2_ptr (next_stmt));
2411 update_stmt (next_stmt);
2412
2413 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2414 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2415 }
2416 else
2417 return false;
2418 }
2419
2420 lhs = gimple_assign_lhs (next_stmt);
2421 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2422 }
2423
2424 /* Save the chain for further analysis in SLP detection. */
2425 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2426 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2427 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2428
2429 return true;
2430 }
2431
2432
2433 /* Function vect_is_simple_reduction
2434
2435 (1) Detect a cross-iteration def-use cycle that represents a simple
2436 reduction computation. We look for the following pattern:
2437
2438 loop_header:
2439 a1 = phi < a0, a2 >
2440 a3 = ...
2441 a2 = operation (a3, a1)
2442
2443 or
2444
2445 a3 = ...
2446 loop_header:
2447 a1 = phi < a0, a2 >
2448 a2 = operation (a3, a1)
2449
2450 such that:
2451 1. operation is commutative and associative and it is safe to
2452 change the order of the computation (if CHECK_REDUCTION is true)
2453 2. no uses for a2 in the loop (a2 is used out of the loop)
2454 3. no uses of a1 in the loop besides the reduction operation
2455 4. no uses of a1 outside the loop.
2456
2457 Conditions 1,4 are tested here.
2458 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2459
2460 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2461 nested cycles, if CHECK_REDUCTION is false.
2462
2463 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2464 reductions:
2465
2466 a1 = phi < a0, a2 >
2467 inner loop (def of a3)
2468 a2 = phi < a3 >
2469
2470 (4) Detect condition expressions, i.e.:
2471 for (int i = 0; i < N; i++)
2472 if (a[i] < val)
2473 ret_val = a[i];
2474
2475 */
2476
2477 static gimple *
2478 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2479 bool check_reduction, bool *double_reduc,
2480 bool need_wrapping_integral_overflow,
2481 enum vect_reduction_type *v_reduc_type)
2482 {
2483 struct loop *loop = (gimple_bb (phi))->loop_father;
2484 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2485 edge latch_e = loop_latch_edge (loop);
2486 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2487 gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2488 enum tree_code orig_code, code;
2489 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2490 tree type;
2491 int nloop_uses;
2492 tree name;
2493 imm_use_iterator imm_iter;
2494 use_operand_p use_p;
2495 bool phi_def;
2496
2497 *double_reduc = false;
2498 *v_reduc_type = TREE_CODE_REDUCTION;
2499
2500 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2501 otherwise, we assume outer loop vectorization. */
2502 gcc_assert ((check_reduction && loop == vect_loop)
2503 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2504
2505 name = PHI_RESULT (phi);
2506 /* ??? If there are no uses of the PHI result the inner loop reduction
2507 won't be detected as possibly double-reduction by vectorizable_reduction
2508 because that tries to walk the PHI arg from the preheader edge which
2509 can be constant. See PR60382. */
2510 if (has_zero_uses (name))
2511 return NULL;
2512 nloop_uses = 0;
2513 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2514 {
2515 gimple *use_stmt = USE_STMT (use_p);
2516 if (is_gimple_debug (use_stmt))
2517 continue;
2518
2519 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2520 {
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "intermediate value used outside loop.\n");
2524
2525 return NULL;
2526 }
2527
2528 nloop_uses++;
2529 if (nloop_uses > 1)
2530 {
2531 if (dump_enabled_p ())
2532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2533 "reduction used in loop.\n");
2534 return NULL;
2535 }
2536 }
2537
2538 if (TREE_CODE (loop_arg) != SSA_NAME)
2539 {
2540 if (dump_enabled_p ())
2541 {
2542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2543 "reduction: not ssa_name: ");
2544 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2546 }
2547 return NULL;
2548 }
2549
2550 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2551 if (!def_stmt)
2552 {
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "reduction: no def_stmt.\n");
2556 return NULL;
2557 }
2558
2559 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2560 {
2561 if (dump_enabled_p ())
2562 {
2563 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2564 dump_printf (MSG_NOTE, "\n");
2565 }
2566 return NULL;
2567 }
2568
2569 if (is_gimple_assign (def_stmt))
2570 {
2571 name = gimple_assign_lhs (def_stmt);
2572 phi_def = false;
2573 }
2574 else
2575 {
2576 name = PHI_RESULT (def_stmt);
2577 phi_def = true;
2578 }
2579
2580 nloop_uses = 0;
2581 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2582 {
2583 gimple *use_stmt = USE_STMT (use_p);
2584 if (is_gimple_debug (use_stmt))
2585 continue;
2586 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2587 nloop_uses++;
2588 if (nloop_uses > 1)
2589 {
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2592 "reduction used in loop.\n");
2593 return NULL;
2594 }
2595 }
2596
2597 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2598 defined in the inner loop. */
2599 if (phi_def)
2600 {
2601 op1 = PHI_ARG_DEF (def_stmt, 0);
2602
2603 if (gimple_phi_num_args (def_stmt) != 1
2604 || TREE_CODE (op1) != SSA_NAME)
2605 {
2606 if (dump_enabled_p ())
2607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2608 "unsupported phi node definition.\n");
2609
2610 return NULL;
2611 }
2612
2613 def1 = SSA_NAME_DEF_STMT (op1);
2614 if (gimple_bb (def1)
2615 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2616 && loop->inner
2617 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2618 && is_gimple_assign (def1))
2619 {
2620 if (dump_enabled_p ())
2621 report_vect_op (MSG_NOTE, def_stmt,
2622 "detected double reduction: ");
2623
2624 *double_reduc = true;
2625 return def_stmt;
2626 }
2627
2628 return NULL;
2629 }
2630
2631 code = orig_code = gimple_assign_rhs_code (def_stmt);
2632
2633 /* We can handle "res -= x[i]", which is non-associative by
2634 simply rewriting this into "res += -x[i]". Avoid changing
2635 gimple instruction for the first simple tests and only do this
2636 if we're allowed to change code at all. */
2637 if (code == MINUS_EXPR
2638 && (op1 = gimple_assign_rhs1 (def_stmt))
2639 && TREE_CODE (op1) == SSA_NAME
2640 && SSA_NAME_DEF_STMT (op1) == phi)
2641 code = PLUS_EXPR;
2642
2643 if (check_reduction)
2644 {
2645 if (code == COND_EXPR)
2646 *v_reduc_type = COND_REDUCTION;
2647 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2648 {
2649 if (dump_enabled_p ())
2650 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2651 "reduction: not commutative/associative: ");
2652 return NULL;
2653 }
2654 }
2655
2656 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2657 {
2658 if (code != COND_EXPR)
2659 {
2660 if (dump_enabled_p ())
2661 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2662 "reduction: not binary operation: ");
2663
2664 return NULL;
2665 }
2666
2667 op3 = gimple_assign_rhs1 (def_stmt);
2668 if (COMPARISON_CLASS_P (op3))
2669 {
2670 op4 = TREE_OPERAND (op3, 1);
2671 op3 = TREE_OPERAND (op3, 0);
2672 }
2673
2674 op1 = gimple_assign_rhs2 (def_stmt);
2675 op2 = gimple_assign_rhs3 (def_stmt);
2676
2677 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2678 {
2679 if (dump_enabled_p ())
2680 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2681 "reduction: uses not ssa_names: ");
2682
2683 return NULL;
2684 }
2685 }
2686 else
2687 {
2688 op1 = gimple_assign_rhs1 (def_stmt);
2689 op2 = gimple_assign_rhs2 (def_stmt);
2690
2691 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2692 {
2693 if (dump_enabled_p ())
2694 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2695 "reduction: uses not ssa_names: ");
2696
2697 return NULL;
2698 }
2699 }
2700
2701 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2702 if ((TREE_CODE (op1) == SSA_NAME
2703 && !types_compatible_p (type, TREE_TYPE (op1)))
2704 || (TREE_CODE (op2) == SSA_NAME
2705 && !types_compatible_p (type, TREE_TYPE (op2)))
2706 || (op3 && TREE_CODE (op3) == SSA_NAME
2707 && !types_compatible_p (type, TREE_TYPE (op3)))
2708 || (op4 && TREE_CODE (op4) == SSA_NAME
2709 && !types_compatible_p (type, TREE_TYPE (op4))))
2710 {
2711 if (dump_enabled_p ())
2712 {
2713 dump_printf_loc (MSG_NOTE, vect_location,
2714 "reduction: multiple types: operation type: ");
2715 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2716 dump_printf (MSG_NOTE, ", operands types: ");
2717 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2718 TREE_TYPE (op1));
2719 dump_printf (MSG_NOTE, ",");
2720 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2721 TREE_TYPE (op2));
2722 if (op3)
2723 {
2724 dump_printf (MSG_NOTE, ",");
2725 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2726 TREE_TYPE (op3));
2727 }
2728
2729 if (op4)
2730 {
2731 dump_printf (MSG_NOTE, ",");
2732 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2733 TREE_TYPE (op4));
2734 }
2735 dump_printf (MSG_NOTE, "\n");
2736 }
2737
2738 return NULL;
2739 }
2740
2741 /* Check that it's ok to change the order of the computation.
2742 Generally, when vectorizing a reduction we change the order of the
2743 computation. This may change the behavior of the program in some
2744 cases, so we need to check that this is ok. One exception is when
2745 vectorizing an outer-loop: the inner-loop is executed sequentially,
2746 and therefore vectorizing reductions in the inner-loop during
2747 outer-loop vectorization is safe. */
2748
2749 if (*v_reduc_type != COND_REDUCTION)
2750 {
2751 /* CHECKME: check for !flag_finite_math_only too? */
2752 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2753 && check_reduction)
2754 {
2755 /* Changing the order of operations changes the semantics. */
2756 if (dump_enabled_p ())
2757 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2758 "reduction: unsafe fp math optimization: ");
2759 return NULL;
2760 }
2761 else if (INTEGRAL_TYPE_P (type) && check_reduction)
2762 {
2763 if (!operation_no_trapping_overflow (type, code))
2764 {
2765 /* Changing the order of operations changes the semantics. */
2766 if (dump_enabled_p ())
2767 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2768 "reduction: unsafe int math optimization"
2769 " (overflow traps): ");
2770 return NULL;
2771 }
2772 if (need_wrapping_integral_overflow
2773 && !TYPE_OVERFLOW_WRAPS (type)
2774 && operation_can_overflow (code))
2775 {
2776 /* Changing the order of operations changes the semantics. */
2777 if (dump_enabled_p ())
2778 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2779 "reduction: unsafe int math optimization"
2780 " (overflow doesn't wrap): ");
2781 return NULL;
2782 }
2783 }
2784 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2785 {
2786 /* Changing the order of operations changes the semantics. */
2787 if (dump_enabled_p ())
2788 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2789 "reduction: unsafe fixed-point math optimization: ");
2790 return NULL;
2791 }
2792 }
2793
2794 /* Reduction is safe. We're dealing with one of the following:
2795 1) integer arithmetic and no trapv
2796 2) floating point arithmetic, and special flags permit this optimization
2797 3) nested cycle (i.e., outer loop vectorization). */
2798 if (TREE_CODE (op1) == SSA_NAME)
2799 def1 = SSA_NAME_DEF_STMT (op1);
2800
2801 if (TREE_CODE (op2) == SSA_NAME)
2802 def2 = SSA_NAME_DEF_STMT (op2);
2803
2804 if (code != COND_EXPR
2805 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2806 {
2807 if (dump_enabled_p ())
2808 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2809 return NULL;
2810 }
2811
2812 /* Check that one def is the reduction def, defined by PHI,
2813 the other def is either defined in the loop ("vect_internal_def"),
2814 or it's an induction (defined by a loop-header phi-node). */
2815
2816 if (def2 && def2 == phi
2817 && (code == COND_EXPR
2818 || !def1 || gimple_nop_p (def1)
2819 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2820 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2821 && (is_gimple_assign (def1)
2822 || is_gimple_call (def1)
2823 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2824 == vect_induction_def
2825 || (gimple_code (def1) == GIMPLE_PHI
2826 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2827 == vect_internal_def
2828 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2829 {
2830 if (dump_enabled_p ())
2831 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2832 return def_stmt;
2833 }
2834
2835 if (def1 && def1 == phi
2836 && (code == COND_EXPR
2837 || !def2 || gimple_nop_p (def2)
2838 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2839 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2840 && (is_gimple_assign (def2)
2841 || is_gimple_call (def2)
2842 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2843 == vect_induction_def
2844 || (gimple_code (def2) == GIMPLE_PHI
2845 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2846 == vect_internal_def
2847 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2848 {
2849 if (check_reduction
2850 && orig_code != MINUS_EXPR)
2851 {
2852 if (code == COND_EXPR)
2853 {
2854 /* No current known use where this case would be useful. */
2855 if (dump_enabled_p ())
2856 report_vect_op (MSG_NOTE, def_stmt,
2857 "detected reduction: cannot currently swap "
2858 "operands for cond_expr");
2859 return NULL;
2860 }
2861
2862 /* Swap operands (just for simplicity - so that the rest of the code
2863 can assume that the reduction variable is always the last (second)
2864 argument). */
2865 if (dump_enabled_p ())
2866 report_vect_op (MSG_NOTE, def_stmt,
2867 "detected reduction: need to swap operands: ");
2868
2869 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2870 gimple_assign_rhs2_ptr (def_stmt));
2871
2872 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2873 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2874 }
2875 else
2876 {
2877 if (dump_enabled_p ())
2878 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2879 }
2880
2881 return def_stmt;
2882 }
2883
2884 /* Try to find SLP reduction chain. */
2885 if (check_reduction && code != COND_EXPR
2886 && vect_is_slp_reduction (loop_info, phi, def_stmt))
2887 {
2888 if (dump_enabled_p ())
2889 report_vect_op (MSG_NOTE, def_stmt,
2890 "reduction: detected reduction chain: ");
2891
2892 return def_stmt;
2893 }
2894
2895 if (dump_enabled_p ())
2896 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2897 "reduction: unknown pattern: ");
2898
2899 return NULL;
2900 }
2901
2902 /* Wrapper around vect_is_simple_reduction, which will modify code
2903 in-place if it enables detection of more reductions. Arguments
2904 as there. */
2905
2906 gimple *
2907 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2908 bool check_reduction, bool *double_reduc,
2909 bool need_wrapping_integral_overflow)
2910 {
2911 enum vect_reduction_type v_reduc_type;
2912 return vect_is_simple_reduction (loop_info, phi, check_reduction,
2913 double_reduc,
2914 need_wrapping_integral_overflow,
2915 &v_reduc_type);
2916 }
2917
2918 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
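/* For instance, with 23 known scalar iterations, PEEL_ITERS_PROLOGUE == 2
   and a vectorization factor of 4, the epilogue gets (23 - 2) % 4 == 1
   iteration (bumped to 4 if peeling for gaps is required); when the
   iteration count is unknown, the epilogue count is guessed as VF/2. */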
2919 int
2920 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2921 int *peel_iters_epilogue,
2922 stmt_vector_for_cost *scalar_cost_vec,
2923 stmt_vector_for_cost *prologue_cost_vec,
2924 stmt_vector_for_cost *epilogue_cost_vec)
2925 {
2926 int retval = 0;
2927 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2928
2929 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2930 {
2931 *peel_iters_epilogue = vf/2;
2932 if (dump_enabled_p ())
2933 dump_printf_loc (MSG_NOTE, vect_location,
2934 "cost model: epilogue peel iters set to vf/2 "
2935 "because loop iterations are unknown .\n");
2936
2937 /* If peeled iterations are known but number of scalar loop
2938 iterations are unknown, count a taken branch per peeled loop. */
2939 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2940 NULL, 0, vect_prologue);
2941 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2942 NULL, 0, vect_epilogue);
2943 }
2944 else
2945 {
2946 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2947 peel_iters_prologue = niters < peel_iters_prologue ?
2948 niters : peel_iters_prologue;
2949 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2950 /* If we need to peel for gaps, but no peeling is required, we have to
2951 peel VF iterations. */
2952 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2953 *peel_iters_epilogue = vf;
2954 }
2955
2956 stmt_info_for_cost *si;
2957 int j;
2958 if (peel_iters_prologue)
2959 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2960 retval += record_stmt_cost (prologue_cost_vec,
2961 si->count * peel_iters_prologue,
2962 si->kind, NULL, si->misalign,
2963 vect_prologue);
2964 if (*peel_iters_epilogue)
2965 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2966 retval += record_stmt_cost (epilogue_cost_vec,
2967 si->count * *peel_iters_epilogue,
2968 si->kind, NULL, si->misalign,
2969 vect_epilogue);
2970
2971 return retval;
2972 }
2973
2974 /* Function vect_estimate_min_profitable_iters
2975
2976 Return the number of iterations required for the vector version of the
2977 loop to be profitable relative to the cost of the scalar version of the
2978 loop. */
2979
2980 static void
2981 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2982 int *ret_min_profitable_niters,
2983 int *ret_min_profitable_estimate)
2984 {
2985 int min_profitable_iters;
2986 int min_profitable_estimate;
2987 int peel_iters_prologue;
2988 int peel_iters_epilogue;
2989 unsigned vec_inside_cost = 0;
2990 int vec_outside_cost = 0;
2991 unsigned vec_prologue_cost = 0;
2992 unsigned vec_epilogue_cost = 0;
2993 int scalar_single_iter_cost = 0;
2994 int scalar_outside_cost = 0;
2995 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2996 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2997 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2998
2999 /* Cost model disabled. */
3000 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3001 {
3002 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3003 *ret_min_profitable_niters = 0;
3004 *ret_min_profitable_estimate = 0;
3005 return;
3006 }
3007
3008 /* Requires loop versioning tests to handle misalignment. */
3009 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3010 {
3011 /* FIXME: Make cost depend on complexity of individual check. */
3012 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3013 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3014 vect_prologue);
3015 dump_printf (MSG_NOTE,
3016 "cost model: Adding cost of checks for loop "
3017 "versioning to treat misalignment.\n");
3018 }
3019
3020 /* Requires loop versioning with alias checks. */
3021 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3022 {
3023 /* FIXME: Make cost depend on complexity of individual check. */
3024 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3025 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3026 vect_prologue);
3027 dump_printf (MSG_NOTE,
3028 "cost model: Adding cost of checks for loop "
3029 "versioning aliasing.\n");
3030 }
3031
3032 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3033 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3034 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3035 vect_prologue);
3036
3037 /* Count statements in scalar loop. Using this as scalar cost for a single
3038 iteration for now.
3039
3040 TODO: Add outer loop support.
3041
3042 TODO: Consider assigning different costs to different scalar
3043 statements. */
3044
3045 scalar_single_iter_cost
3046 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3047
3048 /* Add additional cost for the peeled instructions in prologue and epilogue
3049 loop.
3050
3051 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3052 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3053
3054 TODO: Build an expression that represents peel_iters for prologue and
3055 epilogue to be used in a run-time test. */
3056
3057 if (npeel < 0)
3058 {
3059 peel_iters_prologue = vf/2;
3060 dump_printf (MSG_NOTE, "cost model: "
3061 "prologue peel iters set to vf/2.\n");
3062
3063 /* If peeling for alignment is unknown, loop bound of main loop becomes
3064 unknown. */
3065 peel_iters_epilogue = vf/2;
3066 dump_printf (MSG_NOTE, "cost model: "
3067 "epilogue peel iters set to vf/2 because "
3068 "peeling for alignment is unknown.\n");
3069
3070 /* If peeled iterations are unknown, count a taken branch and a not taken
3071 branch per peeled loop. Even if scalar loop iterations are known,
3072 vector iterations are not known since peeled prologue iterations are
3073 not known. Hence guards remain the same. */
3074 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3075 NULL, 0, vect_prologue);
3076 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3077 NULL, 0, vect_prologue);
3078 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3079 NULL, 0, vect_epilogue);
3080 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3081 NULL, 0, vect_epilogue);
3082 stmt_info_for_cost *si;
3083 int j;
3084 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3085 {
3086 struct _stmt_vec_info *stmt_info
3087 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3088 (void) add_stmt_cost (target_cost_data,
3089 si->count * peel_iters_prologue,
3090 si->kind, stmt_info, si->misalign,
3091 vect_prologue);
3092 (void) add_stmt_cost (target_cost_data,
3093 si->count * peel_iters_epilogue,
3094 si->kind, stmt_info, si->misalign,
3095 vect_epilogue);
3096 }
3097 }
3098 else
3099 {
3100 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3101 stmt_info_for_cost *si;
3102 int j;
3103 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3104
3105 prologue_cost_vec.create (2);
3106 epilogue_cost_vec.create (2);
3107 peel_iters_prologue = npeel;
3108
3109 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3110 &peel_iters_epilogue,
3111 &LOOP_VINFO_SCALAR_ITERATION_COST
3112 (loop_vinfo),
3113 &prologue_cost_vec,
3114 &epilogue_cost_vec);
3115
3116 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3117 {
3118 struct _stmt_vec_info *stmt_info
3119 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3120 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3121 si->misalign, vect_prologue);
3122 }
3123
3124 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3125 {
3126 struct _stmt_vec_info *stmt_info
3127 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3128 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3129 si->misalign, vect_epilogue);
3130 }
3131
3132 prologue_cost_vec.release ();
3133 epilogue_cost_vec.release ();
3134 }
3135
3136 /* FORNOW: The scalar outside cost is incremented in one of the
3137 following ways:
3138
3139 1. The vectorizer checks for alignment and aliasing and generates
3140 a condition that allows dynamic vectorization. A cost model
3141 check is ANDED with the versioning condition. Hence scalar code
3142 path now has the added cost of the versioning check.
3143
3144 if (cost > th & versioning_check)
3145 jmp to vector code
3146
3147 Hence run-time scalar is incremented by not-taken branch cost.
3148
3149 2. The vectorizer then checks if a prologue is required. If the
3150 cost model check was not done before during versioning, it has to
3151 be done before the prologue check.
3152
3153 if (cost <= th)
3154 prologue = scalar_iters
3155 if (prologue == 0)
3156 jmp to vector code
3157 else
3158 execute prologue
3159 if (prologue == num_iters)
3160 go to exit
3161
3162 Hence the run-time scalar cost is incremented by a taken branch,
3163 plus a not-taken branch, plus a taken branch cost.
3164
3165 3. The vectorizer then checks if an epilogue is required. If the
3166 cost model check was not done before during prologue check, it
3167 has to be done with the epilogue check.
3168
3169 if (prologue == 0)
3170 jmp to vector code
3171 else
3172 execute prologue
3173 if (prologue == num_iters)
3174 go to exit
3175 vector code:
3176 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3177 jmp to epilogue
3178
3179 Hence the run-time scalar cost should be incremented by 2 taken
3180 branches.
3181
3182 TODO: The back end may reorder the BBS's differently and reverse
3183 conditions/branch directions. Change the estimates below to
3184 something more reasonable. */
3185
3186 /* If the number of iterations is known and we do not do versioning, we can
3187 decide whether to vectorize at compile time. Hence the scalar version
3188 does not carry cost model guard costs. */
3189 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3190 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3191 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3192 {
3193 /* Cost model check occurs at versioning. */
3194 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3195 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3196 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3197 else
3198 {
3199 /* Cost model check occurs at prologue generation. */
3200 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3201 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3202 + vect_get_stmt_cost (cond_branch_not_taken);
3203 /* Cost model check occurs at epilogue generation. */
3204 else
3205 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3206 }
3207 }
3208
3209 /* Complete the target-specific cost calculations. */
3210 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3211 &vec_inside_cost, &vec_epilogue_cost);
3212
3213 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3214
3215 if (dump_enabled_p ())
3216 {
3217 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3218 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3219 vec_inside_cost);
3220 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3221 vec_prologue_cost);
3222 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3223 vec_epilogue_cost);
3224 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3225 scalar_single_iter_cost);
3226 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3227 scalar_outside_cost);
3228 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3229 vec_outside_cost);
3230 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3231 peel_iters_prologue);
3232 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3233 peel_iters_epilogue);
3234 }
3235
3236 /* Calculate number of iterations required to make the vector version
3237 profitable, relative to the loop bodies only. The following condition
3238 must hold true:
3239 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3240 where
3241 SIC = scalar iteration cost, VIC = vector iteration cost,
3242 VOC = vector outside cost, VF = vectorization factor,
3243 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3244 SOC = scalar outside cost for run time cost model check. */
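/* For instance, assuming SIC == 4, VIC == 6, VF == 4, VOC == 20 and no
   peeling or scalar outside cost, the inequality 4*n > 6*n/4 + 20 first
   holds at n == 9, so roughly 9 iterations are needed before the vector
   loop pays off. */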
3245
3246 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3247 {
3248 if (vec_outside_cost <= 0)
3249 min_profitable_iters = 1;
3250 else
3251 {
3252 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3253 - vec_inside_cost * peel_iters_prologue
3254 - vec_inside_cost * peel_iters_epilogue)
3255 / ((scalar_single_iter_cost * vf)
3256 - vec_inside_cost);
3257
3258 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3259 <= (((int) vec_inside_cost * min_profitable_iters)
3260 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3261 min_profitable_iters++;
3262 }
3263 }
3264 /* vector version will never be profitable. */
3265 else
3266 {
3267 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3268 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3269 "did not happen for a simd loop");
3270
3271 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3273 "cost model: the vector iteration cost = %d "
3274 "divided by the scalar iteration cost = %d "
3275 "is greater or equal to the vectorization factor = %d"
3276 ".\n",
3277 vec_inside_cost, scalar_single_iter_cost, vf);
3278 *ret_min_profitable_niters = -1;
3279 *ret_min_profitable_estimate = -1;
3280 return;
3281 }
3282
3283 dump_printf (MSG_NOTE,
3284 " Calculated minimum iters for profitability: %d\n",
3285 min_profitable_iters);
3286
3287 min_profitable_iters =
3288 min_profitable_iters < vf ? vf : min_profitable_iters;
3289
3290 /* Because the condition we create is:
3291 if (niters <= min_profitable_iters)
3292 then skip the vectorized loop. */
3293 min_profitable_iters--;
3294
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 " Runtime profitability threshold = %d\n",
3298 min_profitable_iters);
3299
3300 *ret_min_profitable_niters = min_profitable_iters;
3301
3302 /* Calculate number of iterations required to make the vector version
3303 profitable, relative to the loop bodies only.
3304
3305 Non-vectorized variant is SIC * niters and it must win over vector
3306 variant on the expected loop trip count. The following condition must hold true:
3307 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3308
3309 if (vec_outside_cost <= 0)
3310 min_profitable_estimate = 1;
3311 else
3312 {
3313 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3314 - vec_inside_cost * peel_iters_prologue
3315 - vec_inside_cost * peel_iters_epilogue)
3316 / ((scalar_single_iter_cost * vf)
3317 - vec_inside_cost);
3318 }
3319 min_profitable_estimate--;
3320 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3321 if (dump_enabled_p ())
3322 dump_printf_loc (MSG_NOTE, vect_location,
3323 " Static estimate profitability threshold = %d\n",
3324 min_profitable_estimate);
3325
3326 *ret_min_profitable_estimate = min_profitable_estimate;
3327 }
3328
3329 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3330 vector elements (not bits) for a vector of mode MODE. */
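/* E.g., for a 4-element vector and OFFSET == 2 this writes the selector
   { 2, 3, 4, 5 }: the two high elements of the first input followed by the
   two low elements of the second input. */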
3331 static void
3332 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3333 unsigned char *sel)
3334 {
3335 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3336
3337 for (i = 0; i < nelt; i++)
3338 sel[i] = (i + offset) & (2*nelt - 1);
3339 }
3340
3341 /* Checks whether the target supports whole-vector shifts for vectors of mode
3342 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3343 it supports vec_perm_const with masks for all necessary shift amounts. */
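/* E.g., for an 8-element vector this checks element shifts by 4, 2 and 1,
   which matches the shifts used by the log2-step reduction epilogue. */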
3344 static bool
3345 have_whole_vector_shift (enum machine_mode mode)
3346 {
3347 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3348 return true;
3349
3350 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3351 return false;
3352
3353 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3354 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3355
3356 for (i = nelt/2; i >= 1; i/=2)
3357 {
3358 calc_vec_perm_mask_for_shift (mode, i, sel);
3359 if (!can_vec_perm_p (mode, false, sel))
3360 return false;
3361 }
3362 return true;
3363 }
3364
3365 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3366
3367 static tree
3368 get_reduction_op (gimple *stmt, int reduc_index)
3369 {
3370 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3371 {
3372 case GIMPLE_SINGLE_RHS:
3373 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3374 == ternary_op);
3375 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3376 case GIMPLE_UNARY_RHS:
3377 return gimple_assign_rhs1 (stmt);
3378 case GIMPLE_BINARY_RHS:
3379 return (reduc_index
3380 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3381 case GIMPLE_TERNARY_RHS:
3382 return gimple_op (stmt, reduc_index + 1);
3383 default:
3384 gcc_unreachable ();
3385 }
3386 }
3387
3388 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3389 functions. Design better to avoid maintenance issues. */
3390
3391 /* Function vect_model_reduction_cost.
3392
3393 Models cost for a reduction operation, including the vector ops
3394 generated within the strip-mine loop, the initial definition before
3395 the loop, and the epilogue code that must be generated. */
3396
3397 static bool
3398 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3399 int ncopies, int reduc_index)
3400 {
3401 int prologue_cost = 0, epilogue_cost = 0;
3402 enum tree_code code;
3403 optab optab;
3404 tree vectype;
3405 gimple *stmt, *orig_stmt;
3406 tree reduction_op;
3407 machine_mode mode;
3408 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3409 struct loop *loop = NULL;
3410 void *target_cost_data;
3411
3412 if (loop_vinfo)
3413 {
3414 loop = LOOP_VINFO_LOOP (loop_vinfo);
3415 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3416 }
3417 else
3418 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3419
3420 /* Condition reductions generate two reductions in the loop. */
3421 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3422 ncopies *= 2;
3423
3424 /* Cost of reduction op inside loop. */
3425 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3426 stmt_info, 0, vect_body);
3427 stmt = STMT_VINFO_STMT (stmt_info);
3428
3429 reduction_op = get_reduction_op (stmt, reduc_index);
3430
3431 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3432 if (!vectype)
3433 {
3434 if (dump_enabled_p ())
3435 {
3436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3437 "unsupported data-type ");
3438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3439 TREE_TYPE (reduction_op));
3440 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3441 }
3442 return false;
3443 }
3444
3445 mode = TYPE_MODE (vectype);
3446 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3447
3448 if (!orig_stmt)
3449 orig_stmt = STMT_VINFO_STMT (stmt_info);
3450
3451 code = gimple_assign_rhs_code (orig_stmt);
3452
3453 /* Add in cost for initial definition.
3454 For cond reduction we have four vectors: initial index, step, initial
3455 result of the data reduction, initial value of the index reduction. */
3456 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3457 == COND_REDUCTION ? 4 : 1;
3458 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3459 scalar_to_vec, stmt_info, 0,
3460 vect_prologue);
3461
3462 /* Determine cost of epilogue code.
3463
3464 We have a reduction operator that will reduce the vector in one statement.
3465 Also requires scalar extract. */
3466
3467 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3468 {
3469 if (reduc_code != ERROR_MARK)
3470 {
3471 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3472 {
3473 /* An EQ stmt and an COND_EXPR stmt. */
3474 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3475 vector_stmt, stmt_info, 0,
3476 vect_epilogue);
3477 /* Reduction of the max index and a reduction of the found
3478 values. */
3479 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3480 vec_to_scalar, stmt_info, 0,
3481 vect_epilogue);
3482 /* A broadcast of the max value. */
3483 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3484 scalar_to_vec, stmt_info, 0,
3485 vect_epilogue);
3486 }
3487 else
3488 {
3489 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3490 stmt_info, 0, vect_epilogue);
3491 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3492 vec_to_scalar, stmt_info, 0,
3493 vect_epilogue);
3494 }
3495 }
3496 else
3497 {
3498 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3499 tree bitsize =
3500 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3501 int element_bitsize = tree_to_uhwi (bitsize);
3502 int nelements = vec_size_in_bits / element_bitsize;
3503
3504 optab = optab_for_tree_code (code, vectype, optab_default);
3505
3506 /* We have a whole vector shift available. */
3507 if (VECTOR_MODE_P (mode)
3508 && optab_handler (optab, mode) != CODE_FOR_nothing
3509 && have_whole_vector_shift (mode))
3510 {
3511 /* Final reduction via vector shifts and the reduction operator.
3512 Also requires scalar extract. */
3513 epilogue_cost += add_stmt_cost (target_cost_data,
3514 exact_log2 (nelements) * 2,
3515 vector_stmt, stmt_info, 0,
3516 vect_epilogue);
3517 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3518 vec_to_scalar, stmt_info, 0,
3519 vect_epilogue);
3520 }
3521 else
3522 /* Use extracts and reduction op for final reduction. For N
3523 elements, we have N extracts and N-1 reduction ops. */
3524 epilogue_cost += add_stmt_cost (target_cost_data,
3525 nelements + nelements - 1,
3526 vector_stmt, stmt_info, 0,
3527 vect_epilogue);
3528 }
3529 }
3530
3531 if (dump_enabled_p ())
3532 dump_printf (MSG_NOTE,
3533 "vect_model_reduction_cost: inside_cost = %d, "
3534 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3535 prologue_cost, epilogue_cost);
3536
3537 return true;
3538 }
3539
3540
3541 /* Function vect_model_induction_cost.
3542
3543 Models cost for induction operations. */
3544
3545 static void
3546 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3547 {
3548 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3549 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3550 unsigned inside_cost, prologue_cost;
3551
3552 /* loop cost for vec_loop. */
3553 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3554 stmt_info, 0, vect_body);
3555
3556 /* prologue cost for vec_init and vec_step. */
3557 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3558 stmt_info, 0, vect_prologue);
3559
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "vect_model_induction_cost: inside_cost = %d, "
3563 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3564 }
3565
3566
3567 /* Function get_initial_def_for_induction
3568
3569 Input:
3570 IV_PHI - the phi node of an induction variable in the loop that is being
3571 vectorized; its loop-preheader argument is the initial value.
3572
3573 Output:
3574 Return a vector variable, initialized with the first VF values of
3575 the induction variable. E.g., for an iv with IV_PHI='X' and
3576 evolution S, for a vector of 4 units, we want to return:
3577 [X, X + S, X + 2*S, X + 3*S]. */
3578
3579 static tree
3580 get_initial_def_for_induction (gimple *iv_phi)
3581 {
3582 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3583 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3584 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3585 tree vectype;
3586 int nunits;
3587 edge pe = loop_preheader_edge (loop);
3588 struct loop *iv_loop;
3589 basic_block new_bb;
3590 tree new_vec, vec_init, vec_step, t;
3591 tree new_name;
3592 gimple *new_stmt;
3593 gphi *induction_phi;
3594 tree induc_def, vec_def, vec_dest;
3595 tree init_expr, step_expr;
3596 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3597 int i;
3598 int ncopies;
3599 tree expr;
3600 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3601 bool nested_in_vect_loop = false;
3602 gimple_seq stmts;
3603 imm_use_iterator imm_iter;
3604 use_operand_p use_p;
3605 gimple *exit_phi;
3606 edge latch_e;
3607 tree loop_arg;
3608 gimple_stmt_iterator si;
3609 basic_block bb = gimple_bb (iv_phi);
3610 tree stepvectype;
3611 tree resvectype;
3612
3613 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3614 if (nested_in_vect_loop_p (loop, iv_phi))
3615 {
3616 nested_in_vect_loop = true;
3617 iv_loop = loop->inner;
3618 }
3619 else
3620 iv_loop = loop;
3621 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3622
3623 latch_e = loop_latch_edge (iv_loop);
3624 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3625
3626 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3627 gcc_assert (step_expr != NULL_TREE);
3628
3629 pe = loop_preheader_edge (iv_loop);
3630 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3631 loop_preheader_edge (iv_loop));
3632
3633 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3634 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3635 gcc_assert (vectype);
3636 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3637 ncopies = vf / nunits;
3638
3639 gcc_assert (phi_info);
3640 gcc_assert (ncopies >= 1);
3641
3642 /* Convert the step to the desired type. */
3643 stmts = NULL;
3644 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3645 if (stmts)
3646 {
3647 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3648 gcc_assert (!new_bb);
3649 }
3650
3651 /* Find the first insertion point in the BB. */
3652 si = gsi_after_labels (bb);
3653
3654 /* Create the vector that holds the initial_value of the induction. */
3655 if (nested_in_vect_loop)
3656 {
3657 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3658 been created during vectorization of previous stmts. We obtain it
3659 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3660 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3661 /* If the initial value is not of proper type, convert it. */
3662 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3663 {
3664 new_stmt
3665 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3666 vect_simple_var,
3667 "vec_iv_"),
3668 VIEW_CONVERT_EXPR,
3669 build1 (VIEW_CONVERT_EXPR, vectype,
3670 vec_init));
3671 vec_init = gimple_assign_lhs (new_stmt);
3672 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3673 new_stmt);
3674 gcc_assert (!new_bb);
3675 set_vinfo_for_stmt (new_stmt,
3676 new_stmt_vec_info (new_stmt, loop_vinfo));
3677 }
3678 }
3679 else
3680 {
3681 vec<constructor_elt, va_gc> *v;
3682
3683 /* iv_loop is the loop to be vectorized. Create:
3684 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3685 stmts = NULL;
3686 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3687
3688 vec_alloc (v, nunits);
3689 bool constant_p = is_gimple_min_invariant (new_name);
3690 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3691 for (i = 1; i < nunits; i++)
3692 {
3693 /* Create: new_name_i = new_name + step_expr */
3694 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3695 new_name, step_expr);
3696 if (!is_gimple_min_invariant (new_name))
3697 constant_p = false;
3698 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3699 }
3700 if (stmts)
3701 {
3702 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3703 gcc_assert (!new_bb);
3704 }
3705
3706 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3707 if (constant_p)
3708 new_vec = build_vector_from_ctor (vectype, v);
3709 else
3710 new_vec = build_constructor (vectype, v);
3711 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3712 }
3713
3714
3715 /* Create the vector that holds the step of the induction. */
3716 if (nested_in_vect_loop)
3717 /* iv_loop is nested in the loop to be vectorized. Generate:
3718 vec_step = [S, S, S, S] */
3719 new_name = step_expr;
3720 else
3721 {
3722 /* iv_loop is the loop to be vectorized. Generate:
3723 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3724 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3725 {
3726 expr = build_int_cst (integer_type_node, vf);
3727 expr = fold_convert (TREE_TYPE (step_expr), expr);
3728 }
3729 else
3730 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3731 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3732 expr, step_expr);
3733 if (TREE_CODE (step_expr) == SSA_NAME)
3734 new_name = vect_init_vector (iv_phi, new_name,
3735 TREE_TYPE (step_expr), NULL);
3736 }
3737
3738 t = unshare_expr (new_name);
3739 gcc_assert (CONSTANT_CLASS_P (new_name)
3740 || TREE_CODE (new_name) == SSA_NAME);
3741 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3742 gcc_assert (stepvectype);
3743 new_vec = build_vector_from_val (stepvectype, t);
3744 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3745
3746
3747 /* Create the following def-use cycle:
3748 loop prolog:
3749 vec_init = ...
3750 vec_step = ...
3751 loop:
3752 vec_iv = PHI <vec_init, vec_loop>
3753 ...
3754 STMT
3755 ...
3756 vec_loop = vec_iv + vec_step; */
3757
3758 /* Create the induction-phi that defines the induction-operand. */
3759 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3760 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3761 set_vinfo_for_stmt (induction_phi,
3762 new_stmt_vec_info (induction_phi, loop_vinfo));
3763 induc_def = PHI_RESULT (induction_phi);
3764
3765 /* Create the iv update inside the loop */
3766 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3767 vec_def = make_ssa_name (vec_dest, new_stmt);
3768 gimple_assign_set_lhs (new_stmt, vec_def);
3769 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3770 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3771
3772 /* Set the arguments of the phi node: */
3773 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3774 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3775 UNKNOWN_LOCATION);
3776
3777
3778 /* If the vectorization factor (VF) is bigger than the number
3779 of elements that we can fit in a vectype (nunits), we have to generate
3780 more than one vector stmt - i.e., we need to "unroll" the
3781 vector stmt by a factor of VF/nunits. For more details see the
3782 documentation in vectorizable_operation. */
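  /* For example (illustration only), with VF == 8 and a 4-element vectype
     we need ncopies == 2; the additional copy generated below is advanced
     by a vector step of [nunits*S, ..., nunits*S], i.e. 4*S per element.  */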
3783
3784 if (ncopies > 1)
3785 {
3786 stmt_vec_info prev_stmt_vinfo;
3787 /* FORNOW. This restriction should be relaxed. */
3788 gcc_assert (!nested_in_vect_loop);
3789
3790 /* Create the vector that holds the step of the induction. */
3791 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3792 {
3793 expr = build_int_cst (integer_type_node, nunits);
3794 expr = fold_convert (TREE_TYPE (step_expr), expr);
3795 }
3796 else
3797 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3798 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3799 expr, step_expr);
3800 if (TREE_CODE (step_expr) == SSA_NAME)
3801 new_name = vect_init_vector (iv_phi, new_name,
3802 TREE_TYPE (step_expr), NULL);
3803 t = unshare_expr (new_name);
3804 gcc_assert (CONSTANT_CLASS_P (new_name)
3805 || TREE_CODE (new_name) == SSA_NAME);
3806 new_vec = build_vector_from_val (stepvectype, t);
3807 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3808
3809 vec_def = induc_def;
3810 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3811 for (i = 1; i < ncopies; i++)
3812 {
3813 /* vec_i = vec_prev + vec_step */
3814 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3815 vec_def, vec_step);
3816 vec_def = make_ssa_name (vec_dest, new_stmt);
3817 gimple_assign_set_lhs (new_stmt, vec_def);
3818
3819 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3820 if (!useless_type_conversion_p (resvectype, vectype))
3821 {
3822 new_stmt
3823 = gimple_build_assign
3824 (vect_get_new_vect_var (resvectype, vect_simple_var,
3825 "vec_iv_"),
3826 VIEW_CONVERT_EXPR,
3827 build1 (VIEW_CONVERT_EXPR, resvectype,
3828 gimple_assign_lhs (new_stmt)));
3829 gimple_assign_set_lhs (new_stmt,
3830 make_ssa_name
3831 (gimple_assign_lhs (new_stmt), new_stmt));
3832 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3833 }
3834 set_vinfo_for_stmt (new_stmt,
3835 new_stmt_vec_info (new_stmt, loop_vinfo));
3836 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3837 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3838 }
3839 }
3840
3841 if (nested_in_vect_loop)
3842 {
3843 /* Find the loop-closed exit-phi of the induction, and record
3844 the final vector of induction results: */
3845 exit_phi = NULL;
3846 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3847 {
3848 gimple *use_stmt = USE_STMT (use_p);
3849 if (is_gimple_debug (use_stmt))
3850 continue;
3851
3852 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3853 {
3854 exit_phi = use_stmt;
3855 break;
3856 }
3857 }
3858 if (exit_phi)
3859 {
3860 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3861 /* FORNOW. Currently not supporting the case that an inner-loop induction
3862 is not used in the outer-loop (i.e. only outside the outer-loop). */
3863 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3864 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3865
3866 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3867 if (dump_enabled_p ())
3868 {
3869 dump_printf_loc (MSG_NOTE, vect_location,
3870 "vector of inductions after inner-loop:");
3871 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3872 dump_printf (MSG_NOTE, "\n");
3873 }
3874 }
3875 }
3876
3877
3878 if (dump_enabled_p ())
3879 {
3880 dump_printf_loc (MSG_NOTE, vect_location,
3881 "transform induction: created def-use cycle: ");
3882 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3883 dump_printf (MSG_NOTE, "\n");
3884 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3885 SSA_NAME_DEF_STMT (vec_def), 0);
3886 dump_printf (MSG_NOTE, "\n");
3887 }
3888
3889 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3890 if (!useless_type_conversion_p (resvectype, vectype))
3891 {
3892 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3893 vect_simple_var,
3894 "vec_iv_"),
3895 VIEW_CONVERT_EXPR,
3896 build1 (VIEW_CONVERT_EXPR, resvectype,
3897 induc_def));
3898 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3899 gimple_assign_set_lhs (new_stmt, induc_def);
3900 si = gsi_after_labels (bb);
3901 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3902 set_vinfo_for_stmt (new_stmt,
3903 new_stmt_vec_info (new_stmt, loop_vinfo));
3904 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3905 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3906 }
3907
3908 return induc_def;
3909 }
3910
3911
3912 /* Function get_initial_def_for_reduction
3913
3914 Input:
3915 STMT - a stmt that performs a reduction operation in the loop.
3916 INIT_VAL - the initial value of the reduction variable
3917
3918 Output:
3919 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3920 of the reduction (used for adjusting the epilog - see below).
3921 Return a vector variable, initialized according to the operation that STMT
3922 performs. This vector will be used as the initial value of the
3923 vector of partial results.
3924
3925 Option1 (adjust in epilog): Initialize the vector as follows:
3926 add/bit or/xor: [0,0,...,0,0]
3927 mult/bit and: [1,1,...,1,1]
3928 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3929 and when necessary (e.g. add/mult case) let the caller know
3930 that it needs to adjust the result by init_val.
3931
3932 Option2: Initialize the vector as follows:
3933 add/bit or/xor: [init_val,0,0,...,0]
3934 mult/bit and: [init_val,1,1,...,1]
3935 min/max/cond_expr: [init_val,init_val,...,init_val]
3936 and no adjustments are needed.
3937
3938 For example, for the following code:
3939
3940 s = init_val;
3941 for (i=0;i<n;i++)
3942 s = s + a[i];
3943
3944 STMT is 's = s + a[i]', and the reduction variable is 's'.
3945 For a vector of 4 units, we want to return either [0,0,0,init_val],
3946 or [0,0,0,0] and let the caller know that it needs to adjust
3947 the result at the end by 'init_val'.
3948
3949 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3950 is not NULL, because the initialization vector is then simpler (the same
3951 element in all entries); otherwise we use Option2.
3952
3953 A cost model should help decide between these two schemes. */
3954
3955 tree
3956 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3957 tree *adjustment_def)
3958 {
3959 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3960 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3961 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3962 tree scalar_type = TREE_TYPE (init_val);
3963 tree vectype = get_vectype_for_scalar_type (scalar_type);
3964 int nunits;
3965 enum tree_code code = gimple_assign_rhs_code (stmt);
3966 tree def_for_init;
3967 tree init_def;
3968 tree *elts;
3969 int i;
3970 bool nested_in_vect_loop = false;
3971 tree init_value;
3972 REAL_VALUE_TYPE real_init_val = dconst0;
3973 int int_init_val = 0;
3974 gimple *def_stmt = NULL;
3975
3976 gcc_assert (vectype);
3977 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3978
3979 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3980 || SCALAR_FLOAT_TYPE_P (scalar_type));
3981
3982 if (nested_in_vect_loop_p (loop, stmt))
3983 nested_in_vect_loop = true;
3984 else
3985 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3986
3987 /* In case of double reduction we only create a vector variable to be put
3988 in the reduction phi node. The actual statement creation is done in
3989 vect_create_epilog_for_reduction. */
3990 if (adjustment_def && nested_in_vect_loop
3991 && TREE_CODE (init_val) == SSA_NAME
3992 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3993 && gimple_code (def_stmt) == GIMPLE_PHI
3994 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3995 && vinfo_for_stmt (def_stmt)
3996 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3997 == vect_double_reduction_def)
3998 {
3999 *adjustment_def = NULL;
4000 return vect_create_destination_var (init_val, vectype);
4001 }
4002
4003 if (TREE_CONSTANT (init_val))
4004 {
4005 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4006 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
4007 else
4008 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
4009 }
4010 else
4011 init_value = init_val;
4012
4013 switch (code)
4014 {
4015 case WIDEN_SUM_EXPR:
4016 case DOT_PROD_EXPR:
4017 case SAD_EXPR:
4018 case PLUS_EXPR:
4019 case MINUS_EXPR:
4020 case BIT_IOR_EXPR:
4021 case BIT_XOR_EXPR:
4022 case MULT_EXPR:
4023 case BIT_AND_EXPR:
4024 /* ADJUSTMENT_DEF is NULL when called from
4025 vect_create_epilog_for_reduction to vectorize double reduction. */
4026 if (adjustment_def)
4027 {
4028 if (nested_in_vect_loop)
4029 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
4030 else
4031 *adjustment_def = init_val;
4032 }
4033
4034 if (code == MULT_EXPR)
4035 {
4036 real_init_val = dconst1;
4037 int_init_val = 1;
4038 }
4039
4040 if (code == BIT_AND_EXPR)
4041 int_init_val = -1;
4042
4043 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4044 def_for_init = build_real (scalar_type, real_init_val);
4045 else
4046 def_for_init = build_int_cst (scalar_type, int_init_val);
4047
4048 /* Fill all vector elements but the first with '0' or '1'. */
4049 elts = XALLOCAVEC (tree, nunits);
4050 for (i = nunits - 2; i >= 0; --i)
4051 elts[i + 1] = def_for_init;
4052
4053 /* Option1: the first element is '0' or '1' as well. */
4054 if (adjustment_def)
4055 {
4056 elts[0] = def_for_init;
4057 init_def = build_vector (vectype, elts);
4058 break;
4059 }
4060
4061 /* Option2: the first element is INIT_VAL. */
4062 elts[0] = init_val;
4063 if (TREE_CONSTANT (init_val))
4064 init_def = build_vector (vectype, elts);
4065 else
4066 {
4067 vec<constructor_elt, va_gc> *v;
4068 vec_alloc (v, nunits);
4069 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4070 for (i = 1; i < nunits; ++i)
4071 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4072 init_def = build_constructor (vectype, v);
4073 }
4074
4075 break;
4076
4077 case MIN_EXPR:
4078 case MAX_EXPR:
4079 case COND_EXPR:
4080 if (adjustment_def)
4081 {
4082 *adjustment_def = NULL_TREE;
4083 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4084 {
4085 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4086 break;
4087 }
4088 }
4089 init_def = build_vector_from_val (vectype, init_value);
4090 break;
4091
4092 default:
4093 gcc_unreachable ();
4094 }
4095
4096 return init_def;
4097 }
4098
4099 /* Function vect_create_epilog_for_reduction
4100
4101 Create code at the loop-epilog to finalize the result of a reduction
4102 computation.
4103
4104 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the
4105 vector reduction statements.
4106 STMT is the scalar reduction stmt that is being vectorized.
4107 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4108 number of elements that we can fit in a vectype (nunits). In this case
4109 we have to generate more than one vector stmt - i.e., we need to "unroll"
4110 the vector stmt by a factor VF/nunits. For more details see documentation
4111 in vectorizable_operation.
4112 REDUC_CODE is the tree-code for the epilog reduction.
4113 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4114 computation.
4115 REDUC_INDEX is the index of the operand in the right hand side of the
4116 statement that is defined by REDUCTION_PHI.
4117 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4118 SLP_NODE is an SLP node containing a group of reduction statements. The
4119 first one in this group is STMT.
4120 INDUCTION_INDEX is the vector of loop index values recording where the
4121 condition matched (used for condition reductions); otherwise undefined.
4122
4123 This function:
4124 1. Creates the reduction def-use cycles: sets the arguments for
4125 REDUCTION_PHIS:
4126 The loop-entry argument is the vectorized initial-value of the reduction.
4127 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4128 sums.
4129 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4130 by applying the operation specified by REDUC_CODE if available, or by
4131 other means (whole-vector shifts or a scalar loop).
4132 The function also creates a new phi node at the loop exit to preserve
4133 loop-closed form, as illustrated below.
4134
4135 The flow at the entry to this function:
4136
4137 loop:
4138 vec_def = phi <null, null> # REDUCTION_PHI
4139 VECT_DEF = vector_stmt # vectorized form of STMT
4140 s_loop = scalar_stmt # (scalar) STMT
4141 loop_exit:
4142 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4143 use <s_out0>
4144 use <s_out0>
4145
4146 The above is transformed by this function into:
4147
4148 loop:
4149 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4150 VECT_DEF = vector_stmt # vectorized form of STMT
4151 s_loop = scalar_stmt # (scalar) STMT
4152 loop_exit:
4153 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4154 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4155 v_out2 = reduce <v_out1>
4156 s_out3 = extract_field <v_out2, 0>
4157 s_out4 = adjust_result <s_out3>
4158 use <s_out4>
4159 use <s_out4>
4160 */
4161
4162 static void
4163 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4164 int ncopies, enum tree_code reduc_code,
4165 vec<gimple *> reduction_phis,
4166 int reduc_index, bool double_reduc,
4167 slp_tree slp_node, tree induction_index)
4168 {
4169 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4170 stmt_vec_info prev_phi_info;
4171 tree vectype;
4172 machine_mode mode;
4173 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4175 basic_block exit_bb;
4176 tree scalar_dest;
4177 tree scalar_type;
4178 gimple *new_phi = NULL, *phi;
4179 gimple_stmt_iterator exit_gsi;
4180 tree vec_dest;
4181 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4182 gimple *epilog_stmt = NULL;
4183 enum tree_code code = gimple_assign_rhs_code (stmt);
4184 gimple *exit_phi;
4185 tree bitsize;
4186 tree adjustment_def = NULL;
4187 tree vec_initial_def = NULL;
4188 tree reduction_op, expr, def, initial_def = NULL;
4189 tree orig_name, scalar_result;
4190 imm_use_iterator imm_iter, phi_imm_iter;
4191 use_operand_p use_p, phi_use_p;
4192 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4193 bool nested_in_vect_loop = false;
4194 auto_vec<gimple *> new_phis;
4195 auto_vec<gimple *> inner_phis;
4196 enum vect_def_type dt = vect_unknown_def_type;
4197 int j, i;
4198 auto_vec<tree> scalar_results;
4199 unsigned int group_size = 1, k, ratio;
4200 auto_vec<tree> vec_initial_defs;
4201 auto_vec<gimple *> phis;
4202 bool slp_reduc = false;
4203 tree new_phi_result;
4204 gimple *inner_phi = NULL;
4205
4206 if (slp_node)
4207 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4208
4209 if (nested_in_vect_loop_p (loop, stmt))
4210 {
4211 outer_loop = loop;
4212 loop = loop->inner;
4213 nested_in_vect_loop = true;
4214 gcc_assert (!slp_node);
4215 }
4216
4217 reduction_op = get_reduction_op (stmt, reduc_index);
4218
4219 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4220 gcc_assert (vectype);
4221 mode = TYPE_MODE (vectype);
4222
4223 /* 1. Create the reduction def-use cycle:
4224 Set the arguments of REDUCTION_PHIS, i.e., transform
4225
4226 loop:
4227 vec_def = phi <null, null> # REDUCTION_PHI
4228 VECT_DEF = vector_stmt # vectorized form of STMT
4229 ...
4230
4231 into:
4232
4233 loop:
4234 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4235 VECT_DEF = vector_stmt # vectorized form of STMT
4236 ...
4237
4238 (in case of SLP, do it for all the phis). */
4239
4240 /* Get the loop-entry arguments. */
4241 if (slp_node)
4242 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4243 NULL, slp_node, reduc_index);
4244 else
4245 {
4246 /* Get the scalar def before the loop that defines the initial value
4247 of the reduction variable. */
4248 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4249 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4250 loop_preheader_edge (loop));
4251 vec_initial_defs.create (1);
4252 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4253 &adjustment_def);
4254 vec_initial_defs.quick_push (vec_initial_def);
4255 }
4256
4257 /* Set phi nodes arguments. */
4258 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4259 {
4260 tree vec_init_def, def;
4261 gimple_seq stmts;
4262 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4263 true, NULL_TREE);
4264 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4265 def = vect_defs[i];
4266 for (j = 0; j < ncopies; j++)
4267 {
4268 /* Set the loop-entry arg of the reduction-phi. */
4269
4270 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4271 == INTEGER_INDUC_COND_REDUCTION)
4272 {
4273 /* Initialise the reduction phi to zero. This prevents non-zero
4274 initial values from interfering with the reduction op. */
4275 gcc_assert (ncopies == 1);
4276 gcc_assert (i == 0);
4277
4278 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4279 tree zero_vec = build_zero_cst (vec_init_def_type);
4280
4281 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4282 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4283 }
4284 else
4285 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4286 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4287
4288 /* Set the loop-latch arg for the reduction-phi. */
4289 if (j > 0)
4290 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4291
4292 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4293 UNKNOWN_LOCATION);
4294
4295 if (dump_enabled_p ())
4296 {
4297 dump_printf_loc (MSG_NOTE, vect_location,
4298 "transform reduction: created def-use cycle: ");
4299 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4300 dump_printf (MSG_NOTE, "\n");
4301 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4302 dump_printf (MSG_NOTE, "\n");
4303 }
4304
4305 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4306 }
4307 }
4308
4309 /* 2. Create epilog code.
4310 The reduction epilog code operates across the elements of the vector
4311 of partial results computed by the vectorized loop.
4312 The reduction epilog code consists of:
4313
4314 step 1: compute the scalar result in a vector (v_out2)
4315 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4316 step 3: adjust the scalar result (s_out3) if needed.
4317
4318 Step 1 can be accomplished using one of the following three schemes:
4319 (scheme 1) using reduc_code, if available.
4320 (scheme 2) using whole-vector shifts, if available.
4321 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4322 combined.
4323
4324 The overall epilog code looks like this:
4325
4326 s_out0 = phi <s_loop> # original EXIT_PHI
4327 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4328 v_out2 = reduce <v_out1> # step 1
4329 s_out3 = extract_field <v_out2, 0> # step 2
4330 s_out4 = adjust_result <s_out3> # step 3
4331
4332 (step 3 is optional, and steps 1 and 2 may be combined).
4333 Lastly, the uses of s_out0 are replaced by s_out4. */
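  /* As a concrete illustration (not tied to any particular target), for an
     addition reduction over a 4-element vector v_out1 = {a0, a1, a2, a3},
     scheme 2 generates roughly:
         t1 = vec_shift <v_out1, 2 elements>    -> {a2, a3, 0, 0}
         t2 = t1 + v_out1                       -> {a0+a2, a1+a3, ., .}
         t3 = vec_shift <t2, 1 element>         -> {a1+a3, ., ., 0}
         t4 = t3 + t2                           -> {a0+a1+a2+a3, ., ., .}
         s_out3 = extract_field <t4, 0>
     while scheme 1 emits a single v_out2 = reduc_expr <v_out1> followed by
     the scalar extract.  */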
4334
4335
4336 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4337 v_out1 = phi <VECT_DEF>
4338 Store them in NEW_PHIS. */
4339
4340 exit_bb = single_exit (loop)->dest;
4341 prev_phi_info = NULL;
4342 new_phis.create (vect_defs.length ());
4343 FOR_EACH_VEC_ELT (vect_defs, i, def)
4344 {
4345 for (j = 0; j < ncopies; j++)
4346 {
4347 tree new_def = copy_ssa_name (def);
4348 phi = create_phi_node (new_def, exit_bb);
4349 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4350 if (j == 0)
4351 new_phis.quick_push (phi);
4352 else
4353 {
4354 def = vect_get_vec_def_for_stmt_copy (dt, def);
4355 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4356 }
4357
4358 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4359 prev_phi_info = vinfo_for_stmt (phi);
4360 }
4361 }
4362
4363 /* The epilogue is created for the outer-loop, i.e., for the loop being
4364 vectorized. Create exit phis for the outer loop. */
4365 if (double_reduc)
4366 {
4367 loop = outer_loop;
4368 exit_bb = single_exit (loop)->dest;
4369 inner_phis.create (vect_defs.length ());
4370 FOR_EACH_VEC_ELT (new_phis, i, phi)
4371 {
4372 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4373 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4374 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4375 PHI_RESULT (phi));
4376 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4377 loop_vinfo));
4378 inner_phis.quick_push (phi);
4379 new_phis[i] = outer_phi;
4380 prev_phi_info = vinfo_for_stmt (outer_phi);
4381 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4382 {
4383 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4384 new_result = copy_ssa_name (PHI_RESULT (phi));
4385 outer_phi = create_phi_node (new_result, exit_bb);
4386 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4387 PHI_RESULT (phi));
4388 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4389 loop_vinfo));
4390 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4391 prev_phi_info = vinfo_for_stmt (outer_phi);
4392 }
4393 }
4394 }
4395
4396 exit_gsi = gsi_after_labels (exit_bb);
4397
4398 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4399 (i.e. when reduc_code is not available) and in the final adjustment
4400 code (if needed). Also get the original scalar reduction variable as
4401 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4402 represents a reduction pattern), the tree-code and scalar-def are
4403 taken from the original stmt that the pattern-stmt (STMT) replaces.
4404 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4405 are taken from STMT. */
4406
4407 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4408 if (!orig_stmt)
4409 {
4410 /* Regular reduction */
4411 orig_stmt = stmt;
4412 }
4413 else
4414 {
4415 /* Reduction pattern */
4416 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4417 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4418 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4419 }
4420
4421 code = gimple_assign_rhs_code (orig_stmt);
4422 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4423 partial results are added and not subtracted. */
4424 if (code == MINUS_EXPR)
4425 code = PLUS_EXPR;
4426
4427 scalar_dest = gimple_assign_lhs (orig_stmt);
4428 scalar_type = TREE_TYPE (scalar_dest);
4429 scalar_results.create (group_size);
4430 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4431 bitsize = TYPE_SIZE (scalar_type);
4432
4433 /* In case this is a reduction in an inner-loop while vectorizing an outer
4434 loop - we don't need to extract a single scalar result at the end of the
4435 inner-loop (unless it is double reduction, i.e., the use of reduction is
4436 outside the outer-loop). The final vector of partial results will be used
4437 in the vectorized outer-loop, or reduced to a scalar result at the end of
4438 the outer-loop. */
4439 if (nested_in_vect_loop && !double_reduc)
4440 goto vect_finalize_reduction;
4441
4442 /* SLP reduction without reduction chain, e.g.,
4443 # a1 = phi <a2, a0>
4444 # b1 = phi <b2, b0>
4445 a2 = operation (a1)
4446 b2 = operation (b1) */
4447 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4448
4449 /* In case of reduction chain, e.g.,
4450 # a1 = phi <a3, a0>
4451 a2 = operation (a1)
4452 a3 = operation (a2),
4453
4454 we may end up with more than one vector result. Here we reduce them to
4455 one vector. */
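  /* For instance (illustration only), with two partial result vectors va
     and vb the loop below emits a single vector statement v = va OP vb and
     the rest of the epilogue then operates on v.  */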
4456 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4457 {
4458 tree first_vect = PHI_RESULT (new_phis[0]);
4459 tree tmp;
4460 gassign *new_vec_stmt = NULL;
4461
4462 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4463 for (k = 1; k < new_phis.length (); k++)
4464 {
4465 gimple *next_phi = new_phis[k];
4466 tree second_vect = PHI_RESULT (next_phi);
4467
4468 tmp = build2 (code, vectype, first_vect, second_vect);
4469 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4470 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4471 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4472 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4473 }
4474
4475 new_phi_result = first_vect;
4476 if (new_vec_stmt)
4477 {
4478 new_phis.truncate (0);
4479 new_phis.safe_push (new_vec_stmt);
4480 }
4481 }
4482 else
4483 new_phi_result = PHI_RESULT (new_phis[0]);
4484
4485 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4486 {
4487 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4488 various data values where the condition matched and another vector
4489 (INDUCTION_INDEX) containing all the indexes of those matches. We
4490 need to extract the last matching index (which will be the index with
4491 highest value) and use this to index into the data vector.
4492 For the case where there were no matches, the data vector will contain
4493 all default values and the index vector will be all zeros. */
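      /* As an illustration with hypothetical values:
           NEW_PHI_RESULT  = {v0, v1, v2, v3}
           INDUCTION_INDEX = {0, 3, 0, 2}
         The largest recorded index is 3, so MAX_INDEX_VEC is {3, 3, 3, 3},
         the comparison selects {0, v1, 0, 0} from the data vector, and the
         final scalar result is v1.  */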
4494
4495 /* Get various versions of the type of the vector of indexes. */
4496 tree index_vec_type = TREE_TYPE (induction_index);
4497 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4498 tree index_scalar_type = TREE_TYPE (index_vec_type);
4499 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4500 (index_vec_type);
4501
4502 /* Get an unsigned integer version of the type of the data vector. */
4503 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4504 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4505 tree vectype_unsigned = build_vector_type
4506 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4507
4508 /* First we need to create a vector (ZERO_VEC) of zeros and another
4509 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4510 can create using a MAX reduction and then expanding.
4511 In the case where the loop never made any matches, the max index will
4512 be zero. */
4513
4514 /* Vector of {0, 0, 0,...}. */
4515 tree zero_vec = make_ssa_name (vectype);
4516 tree zero_vec_rhs = build_zero_cst (vectype);
4517 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4518 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4519
4520 /* Find maximum value from the vector of found indexes. */
4521 tree max_index = make_ssa_name (index_scalar_type);
4522 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4523 induction_index);
4524 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4525
4526 /* Vector of {max_index, max_index, max_index,...}. */
4527 tree max_index_vec = make_ssa_name (index_vec_type);
4528 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4529 max_index);
4530 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4531 max_index_vec_rhs);
4532 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4533
4534 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4535 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4536 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4537 otherwise. Only one value should match, resulting in a vector
4538 (VEC_COND) with one data value and the rest zeros.
4539 In the case where the loop never made any matches, every index will
4540 match, resulting in a vector with all data values (which will all be
4541 the default value). */
4542
4543 /* Compare the max index vector to the vector of found indexes to find
4544 the position of the max value. */
4545 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4546 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4547 induction_index,
4548 max_index_vec);
4549 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4550
4551 /* Use the compare to choose either values from the data vector or
4552 zero. */
4553 tree vec_cond = make_ssa_name (vectype);
4554 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4555 vec_compare, new_phi_result,
4556 zero_vec);
4557 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4558
4559 /* Finally we need to extract the data value from the vector (VEC_COND)
4560 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4561 reduction, but because this doesn't exist, we can use a MAX reduction
4562 instead. The data value might be signed or a float so we need to cast
4563 it first.
4564 In the case where the loop never made any matches, the data values are
4565 all identical, and so will reduce down correctly. */
4566
4567 /* Make the matched data values unsigned. */
4568 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4569 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4570 vec_cond);
4571 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4572 VIEW_CONVERT_EXPR,
4573 vec_cond_cast_rhs);
4574 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4575
4576 /* Reduce down to a scalar value. */
4577 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4578 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4579 optab_default);
4580 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4581 != CODE_FOR_nothing);
4582 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4583 REDUC_MAX_EXPR,
4584 vec_cond_cast);
4585 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4586
4587 /* Convert the reduced value back to the result type and set as the
4588 result. */
4589 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4590 data_reduc);
4591 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4592 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4593 gimple_assign_set_lhs (epilog_stmt, new_temp);
4594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4595 scalar_results.safe_push (new_temp);
4596 }
4597
4598 /* 2.3 Create the reduction code, using one of the three schemes described
4599 above. In SLP we simply need to extract all the elements from the
4600 vector (without reducing them), so we use scalar shifts. */
4601 else if (reduc_code != ERROR_MARK && !slp_reduc)
4602 {
4603 tree tmp;
4604 tree vec_elem_type;
4605
4606 /*** Case 1: Create:
4607 v_out2 = reduc_expr <v_out1> */
4608
4609 if (dump_enabled_p ())
4610 dump_printf_loc (MSG_NOTE, vect_location,
4611 "Reduce using direct vector reduction.\n");
4612
4613 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4614 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4615 {
4616 tree tmp_dest =
4617 vect_create_destination_var (scalar_dest, vec_elem_type);
4618 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4619 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4620 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4621 gimple_assign_set_lhs (epilog_stmt, new_temp);
4622 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4623
4624 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4625 }
4626 else
4627 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4628
4629 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4630 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4631 gimple_assign_set_lhs (epilog_stmt, new_temp);
4632 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4633
4634 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4635 == INTEGER_INDUC_COND_REDUCTION)
4636 {
4637 /* Earlier we set the initial value to be zero. Check the result
4638 and if it is zero then replace with the original initial
4639 value. */
4640 tree zero = build_zero_cst (scalar_type);
4641 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4642
4643 tmp = make_ssa_name (new_scalar_dest);
4644 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4645 initial_def, new_temp);
4646 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4647 new_temp = tmp;
4648 }
4649
4650 scalar_results.safe_push (new_temp);
4651 }
4652 else
4653 {
4654 bool reduce_with_shift = have_whole_vector_shift (mode);
4655 int element_bitsize = tree_to_uhwi (bitsize);
4656 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4657 tree vec_temp;
4658
4659 /* Regardless of whether we have a whole vector shift, if we're
4660 emulating the operation via tree-vect-generic, we don't want
4661 to use it. Only the first round of the reduction is likely
4662 to still be profitable via emulation. */
4663 /* ??? It might be better to emit a reduction tree code here, so that
4664 tree-vect-generic can expand the first round via bit tricks. */
4665 if (!VECTOR_MODE_P (mode))
4666 reduce_with_shift = false;
4667 else
4668 {
4669 optab optab = optab_for_tree_code (code, vectype, optab_default);
4670 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4671 reduce_with_shift = false;
4672 }
4673
4674 if (reduce_with_shift && !slp_reduc)
4675 {
4676 int nelements = vec_size_in_bits / element_bitsize;
4677 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4678
4679 int elt_offset;
4680
4681 tree zero_vec = build_zero_cst (vectype);
4682 /*** Case 2: Create:
4683 for (offset = nelements/2; offset >= 1; offset/=2)
4684 {
4685 Create: va' = vec_shift <va, offset>
4686 Create: va = vop <va, va'>
4687 } */
4688
4689 tree rhs;
4690
4691 if (dump_enabled_p ())
4692 dump_printf_loc (MSG_NOTE, vect_location,
4693 "Reduce using vector shifts\n");
4694
4695 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4696 new_temp = new_phi_result;
4697 for (elt_offset = nelements / 2;
4698 elt_offset >= 1;
4699 elt_offset /= 2)
4700 {
4701 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4702 tree mask = vect_gen_perm_mask_any (vectype, sel);
4703 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4704 new_temp, zero_vec, mask);
4705 new_name = make_ssa_name (vec_dest, epilog_stmt);
4706 gimple_assign_set_lhs (epilog_stmt, new_name);
4707 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4708
4709 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4710 new_temp);
4711 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4712 gimple_assign_set_lhs (epilog_stmt, new_temp);
4713 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4714 }
4715
4716 /* 2.4 Extract the final scalar result. Create:
4717 s_out3 = extract_field <v_out2, bitpos> */
4718
4719 if (dump_enabled_p ())
4720 dump_printf_loc (MSG_NOTE, vect_location,
4721 "extract scalar result\n");
4722
4723 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4724 bitsize, bitsize_zero_node);
4725 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4726 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4727 gimple_assign_set_lhs (epilog_stmt, new_temp);
4728 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4729 scalar_results.safe_push (new_temp);
4730 }
4731 else
4732 {
4733 /*** Case 3: Create:
4734 s = extract_field <v_out2, 0>
4735 for (offset = element_size;
4736 offset < vector_size;
4737 offset += element_size;)
4738 {
4739 Create: s' = extract_field <v_out2, offset>
4740 Create: s = op <s, s'> // For non SLP cases
4741 } */
4742
4743 if (dump_enabled_p ())
4744 dump_printf_loc (MSG_NOTE, vect_location,
4745 "Reduce using scalar code.\n");
4746
4747 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4748 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4749 {
4750 int bit_offset;
4751 if (gimple_code (new_phi) == GIMPLE_PHI)
4752 vec_temp = PHI_RESULT (new_phi);
4753 else
4754 vec_temp = gimple_assign_lhs (new_phi);
4755 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4756 bitsize_zero_node);
4757 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4758 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4759 gimple_assign_set_lhs (epilog_stmt, new_temp);
4760 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4761
4762 /* In SLP we don't need to apply reduction operation, so we just
4763 collect s' values in SCALAR_RESULTS. */
4764 if (slp_reduc)
4765 scalar_results.safe_push (new_temp);
4766
4767 for (bit_offset = element_bitsize;
4768 bit_offset < vec_size_in_bits;
4769 bit_offset += element_bitsize)
4770 {
4771 tree bitpos = bitsize_int (bit_offset);
4772 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4773 bitsize, bitpos);
4774
4775 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4776 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4777 gimple_assign_set_lhs (epilog_stmt, new_name);
4778 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4779
4780 if (slp_reduc)
4781 {
4782 /* In SLP we don't need to apply reduction operation, so
4783 we just collect s' values in SCALAR_RESULTS. */
4784 new_temp = new_name;
4785 scalar_results.safe_push (new_name);
4786 }
4787 else
4788 {
4789 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4790 new_name, new_temp);
4791 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4792 gimple_assign_set_lhs (epilog_stmt, new_temp);
4793 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4794 }
4795 }
4796 }
4797
4798 /* The only case where we need to reduce scalar results in SLP is
4799 unrolling. If the size of SCALAR_RESULTS is greater than
4800 GROUP_SIZE, we reduce them by combining elements modulo
4801 GROUP_SIZE. */
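          /* For instance (illustration only), with GROUP_SIZE == 2 and
             SCALAR_RESULTS == {s0, s1, s2, s3} the loop below combines s2
             into s0 and s3 into s1, leaving one result per group member.  */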
4802 if (slp_reduc)
4803 {
4804 tree res, first_res, new_res;
4805 gimple *new_stmt;
4806
4807 /* Reduce multiple scalar results in case of SLP unrolling. */
4808 for (j = group_size; scalar_results.iterate (j, &res);
4809 j++)
4810 {
4811 first_res = scalar_results[j % group_size];
4812 new_stmt = gimple_build_assign (new_scalar_dest, code,
4813 first_res, res);
4814 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4815 gimple_assign_set_lhs (new_stmt, new_res);
4816 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4817 scalar_results[j % group_size] = new_res;
4818 }
4819 }
4820 else
4821 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4822 scalar_results.safe_push (new_temp);
4823 }
4824 }
4825
4826 vect_finalize_reduction:
4827
4828 if (double_reduc)
4829 loop = loop->inner;
4830
4831 /* 2.5 Adjust the final result by the initial value of the reduction
4832 variable. (When such adjustment is not needed, then
4833 'adjustment_def' is zero). For example, if code is PLUS we create:
4834 new_temp = loop_exit_def + adjustment_def */
4835
4836 if (adjustment_def)
4837 {
4838 gcc_assert (!slp_reduc);
4839 if (nested_in_vect_loop)
4840 {
4841 new_phi = new_phis[0];
4842 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4843 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4844 new_dest = vect_create_destination_var (scalar_dest, vectype);
4845 }
4846 else
4847 {
4848 new_temp = scalar_results[0];
4849 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4850 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4851 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4852 }
4853
4854 epilog_stmt = gimple_build_assign (new_dest, expr);
4855 new_temp = make_ssa_name (new_dest, epilog_stmt);
4856 gimple_assign_set_lhs (epilog_stmt, new_temp);
4857 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4858 if (nested_in_vect_loop)
4859 {
4860 set_vinfo_for_stmt (epilog_stmt,
4861 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4862 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4863 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4864
4865 if (!double_reduc)
4866 scalar_results.quick_push (new_temp);
4867 else
4868 scalar_results[0] = new_temp;
4869 }
4870 else
4871 scalar_results[0] = new_temp;
4872
4873 new_phis[0] = epilog_stmt;
4874 }
4875
4876 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4877 phis with new adjusted scalar results, i.e., replace use <s_out0>
4878 with use <s_out4>.
4879
4880 Transform:
4881 loop_exit:
4882 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4883 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4884 v_out2 = reduce <v_out1>
4885 s_out3 = extract_field <v_out2, 0>
4886 s_out4 = adjust_result <s_out3>
4887 use <s_out0>
4888 use <s_out0>
4889
4890 into:
4891
4892 loop_exit:
4893 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4894 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4895 v_out2 = reduce <v_out1>
4896 s_out3 = extract_field <v_out2, 0>
4897 s_out4 = adjust_result <s_out3>
4898 use <s_out4>
4899 use <s_out4> */
4900
4901
4902 /* In an SLP reduction chain we reduce the vector results into one vector
4903 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
4904 of the last stmt in the reduction chain, since we are looking for the
4905 loop exit phi node. */
4906 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4907 {
4908 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4909 /* Handle reduction patterns. */
4910 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4911 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4912
4913 scalar_dest = gimple_assign_lhs (dest_stmt);
4914 group_size = 1;
4915 }
4916
4917 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4918 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4919 need to match SCALAR_RESULTS with corresponding statements. The first
4920 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4921 the first vector stmt, etc.
4922 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
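  /* E.g. (illustration only), GROUP_SIZE == 4 with two new vector stmts
     gives RATIO == 2: scalar results 0 and 1 correspond to the first
     vector stmt, scalar results 2 and 3 to the second.  */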
4923 if (group_size > new_phis.length ())
4924 {
4925 ratio = group_size / new_phis.length ();
4926 gcc_assert (!(group_size % new_phis.length ()));
4927 }
4928 else
4929 ratio = 1;
4930
4931 for (k = 0; k < group_size; k++)
4932 {
4933 if (k % ratio == 0)
4934 {
4935 epilog_stmt = new_phis[k / ratio];
4936 reduction_phi = reduction_phis[k / ratio];
4937 if (double_reduc)
4938 inner_phi = inner_phis[k / ratio];
4939 }
4940
4941 if (slp_reduc)
4942 {
4943 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4944
4945 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4946 /* SLP statements can't participate in patterns. */
4947 gcc_assert (!orig_stmt);
4948 scalar_dest = gimple_assign_lhs (current_stmt);
4949 }
4950
4951 phis.create (3);
4952 /* Find the loop-closed-use at the loop exit of the original scalar
4953 result. (The reduction result is expected to have two immediate uses -
4954 one at the latch block, and one at the loop exit). */
4955 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4956 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4957 && !is_gimple_debug (USE_STMT (use_p)))
4958 phis.safe_push (USE_STMT (use_p));
4959
4960 /* While we expect to have found an exit_phi because of loop-closed-ssa
4961 form we can end up without one if the scalar cycle is dead. */
4962
4963 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4964 {
4965 if (outer_loop)
4966 {
4967 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4968 gphi *vect_phi;
4969
4970 /* FORNOW. Currently not supporting the case that an inner-loop
4971 reduction is not used in the outer-loop (but only outside the
4972 outer-loop), unless it is double reduction. */
4973 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4974 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4975 || double_reduc);
4976
4977 if (double_reduc)
4978 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4979 else
4980 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4981 if (!double_reduc
4982 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4983 != vect_double_reduction_def)
4984 continue;
4985
4986 /* Handle double reduction:
4987
4988 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4989 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4990 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4991 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4992
4993 At that point the regular reduction (stmt2 and stmt3) is
4994 already vectorized, as well as the exit phi node, stmt4.
4995 Here we vectorize the phi node of double reduction, stmt1, and
4996 update all relevant statements. */
4997
4998 /* Go through all the uses of s2 to find double reduction phi
4999 node, i.e., stmt1 above. */
5000 orig_name = PHI_RESULT (exit_phi);
5001 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5002 {
5003 stmt_vec_info use_stmt_vinfo;
5004 stmt_vec_info new_phi_vinfo;
5005 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5006 basic_block bb = gimple_bb (use_stmt);
5007 gimple *use;
5008
5009 /* Check that USE_STMT is really double reduction phi
5010 node. */
5011 if (gimple_code (use_stmt) != GIMPLE_PHI
5012 || gimple_phi_num_args (use_stmt) != 2
5013 || bb->loop_father != outer_loop)
5014 continue;
5015 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5016 if (!use_stmt_vinfo
5017 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5018 != vect_double_reduction_def)
5019 continue;
5020
5021 /* Create vector phi node for double reduction:
5022 vs1 = phi <vs0, vs2>
5023 vs1 was created previously in this function by a call to
5024 vect_get_vec_def_for_operand and is stored in
5025 vec_initial_def;
5026 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5027 vs0 is created here. */
5028
5029 /* Create vector phi node. */
5030 vect_phi = create_phi_node (vec_initial_def, bb);
5031 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5032 loop_vec_info_for_loop (outer_loop));
5033 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5034
5035 /* Create vs0 - initial def of the double reduction phi. */
5036 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5037 loop_preheader_edge (outer_loop));
5038 init_def = get_initial_def_for_reduction (stmt,
5039 preheader_arg, NULL);
5040 vect_phi_init = vect_init_vector (use_stmt, init_def,
5041 vectype, NULL);
5042
5043 /* Update phi node arguments with vs0 and vs2. */
5044 add_phi_arg (vect_phi, vect_phi_init,
5045 loop_preheader_edge (outer_loop),
5046 UNKNOWN_LOCATION);
5047 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5048 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5049 if (dump_enabled_p ())
5050 {
5051 dump_printf_loc (MSG_NOTE, vect_location,
5052 "created double reduction phi node: ");
5053 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5054 dump_printf (MSG_NOTE, "\n");
5055 }
5056
5057 vect_phi_res = PHI_RESULT (vect_phi);
5058
5059 /* Replace the use, i.e., set the correct vs1 in the regular
5060 reduction phi node. FORNOW, NCOPIES is always 1, so the
5061 loop is redundant. */
5062 use = reduction_phi;
5063 for (j = 0; j < ncopies; j++)
5064 {
5065 edge pr_edge = loop_preheader_edge (loop);
5066 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5067 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5068 }
5069 }
5070 }
5071 }
5072
5073 phis.release ();
5074 if (nested_in_vect_loop)
5075 {
5076 if (double_reduc)
5077 loop = outer_loop;
5078 else
5079 continue;
5080 }
5081
5082 phis.create (3);
5083 /* Find the loop-closed-use at the loop exit of the original scalar
5084 result. (The reduction result is expected to have two immediate uses,
5085 one at the latch block, and one at the loop exit). For double
5086 reductions we are looking for exit phis of the outer loop. */
5087 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5088 {
5089 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5090 {
5091 if (!is_gimple_debug (USE_STMT (use_p)))
5092 phis.safe_push (USE_STMT (use_p));
5093 }
5094 else
5095 {
5096 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5097 {
5098 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5099
5100 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5101 {
5102 if (!flow_bb_inside_loop_p (loop,
5103 gimple_bb (USE_STMT (phi_use_p)))
5104 && !is_gimple_debug (USE_STMT (phi_use_p)))
5105 phis.safe_push (USE_STMT (phi_use_p));
5106 }
5107 }
5108 }
5109 }
5110
5111 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5112 {
5113 /* Replace the uses: */
5114 orig_name = PHI_RESULT (exit_phi);
5115 scalar_result = scalar_results[k];
5116 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5117 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5118 SET_USE (use_p, scalar_result);
5119 }
5120
5121 phis.release ();
5122 }
5123 }
5124
5125
5126 /* Function is_nonwrapping_integer_induction.
5127
5128 Check if STMT (which is part of loop LOOP) defines an induction that
5129 increments and is known not to overflow. */
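/* For example (illustration only), for a 16-bit unsigned induction with
   base 0 and step 3 in a loop that executes at most 1000 times, the
   largest value reached is 3 * 1000 + 0 = 3000, which needs fewer than 16
   bits, so the induction is known not to wrap.  */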
5130
5131 static bool
5132 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5133 {
5134 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5135 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5136 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5137 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5138 widest_int ni, max_loop_value, lhs_max;
5139 bool overflow = false;
5140
5141 /* Make sure the loop is integer based. */
5142 if (TREE_CODE (base) != INTEGER_CST
5143 || TREE_CODE (step) != INTEGER_CST)
5144 return false;
5145
5146 /* Check that the induction increments. */
5147 if (tree_int_cst_sgn (step) == -1)
5148 return false;
5149
5150 /* Check that the maximum value the induction reaches in the loop will not wrap. */
5151
5152 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5153 return true;
5154
5155 if (! max_stmt_executions (loop, &ni))
5156 return false;
5157
5158 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5159 &overflow);
5160 if (overflow)
5161 return false;
5162
5163 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5164 TYPE_SIGN (lhs_type), &overflow);
5165 if (overflow)
5166 return false;
5167
5168 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5169 <= TYPE_PRECISION (lhs_type));
5170 }
5171
5172 /* Function vectorizable_reduction.
5173
5174 Check if STMT performs a reduction operation that can be vectorized.
5175 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5176 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5177 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5178
5179 This function also handles reduction idioms (patterns) that have been
5180 recognized in advance during vect_pattern_recog. In this case, STMT may be
5181 of this form:
5182 X = pattern_expr (arg0, arg1, ..., X)
5183 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5184 sequence that had been detected and replaced by the pattern-stmt (STMT).
5185
5186 This function also handles reduction of condition expressions, for example:
5187 for (int i = 0; i < N; i++)
5188 if (a[i] < value)
5189 last = a[i];
5190 This is handled by vectorising the loop and creating an additional vector
5191 containing the loop indexes for which "a[i] < value" was true. In the
5192 function epilogue this is reduced to a single max value and then used to
5193 index into the vector of results.
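   As a rough illustration only (not the exact code generated), with a
   4-lane vector the loop would track, per lane, the index of the last
   iteration for which "a[i] < value" held (0 if it never held) alongside
   the per-lane "last" values; the epilogue reduces the index vector with
   a max and uses the result to select the matching lane of the value
   vector.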
5194
5195 In some cases of reduction patterns, the type of the reduction variable X is
5196 different than the type of the other arguments of STMT.
5197 In such cases, the vectype that is used when transforming STMT into a vector
5198 stmt is different than the vectype that is used to determine the
5199 vectorization factor, because it consists of a different number of elements
5200 than the actual number of elements that are being operated upon in parallel.
5201
5202 For example, consider an accumulation of shorts into an int accumulator.
5203 On some targets it's possible to vectorize this pattern operating on 8
5204 shorts at a time (hence, the vectype for purposes of determining the
5205 vectorization factor should be V8HI); on the other hand, the vectype that
5206 is used to create the vector form is actually V4SI (the type of the result).
5207
5208 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5209 indicates what is the actual level of parallelism (V8HI in the example), so
5210 that the right vectorization factor would be derived. This vectype
5211 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5212 be used to create the vectorized stmt. The right vectype for the vectorized
5213 stmt is obtained from the type of the result X:
5214 get_vectype_for_scalar_type (TREE_TYPE (X))
5215
5216 This means that, contrary to "regular" reductions (or "regular" stmts in
5217 general), the following equation:
5218 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5219 does *NOT* necessarily hold for reduction patterns. */
5220
5221 bool
5222 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5223 gimple **vec_stmt, slp_tree slp_node)
5224 {
5225 tree vec_dest;
5226 tree scalar_dest;
5227 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5228 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5229 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5230 tree vectype_in = NULL_TREE;
5231 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5232 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5233 enum tree_code code, orig_code, epilog_reduc_code;
5234 machine_mode vec_mode;
5235 int op_type;
5236 optab optab, reduc_optab;
5237 tree new_temp = NULL_TREE;
5238 gimple *def_stmt;
5239 enum vect_def_type dt;
5240 gphi *new_phi = NULL;
5241 tree scalar_type;
5242 bool is_simple_use;
5243 gimple *orig_stmt;
5244 stmt_vec_info orig_stmt_info;
5245 tree expr = NULL_TREE;
5246 int i;
5247 int ncopies;
5248 int epilog_copies;
5249 stmt_vec_info prev_stmt_info, prev_phi_info;
5250 bool single_defuse_cycle = false;
5251 tree reduc_def = NULL_TREE;
5252 gimple *new_stmt = NULL;
5253 int j;
5254 tree ops[3];
5255 bool nested_cycle = false, found_nested_cycle_def = false;
5256 gimple *reduc_def_stmt = NULL;
5257 bool double_reduc = false, dummy;
5258 basic_block def_bb;
5259 struct loop * def_stmt_loop, *outer_loop = NULL;
5260 tree def_arg;
5261 gimple *def_arg_stmt;
5262 auto_vec<tree> vec_oprnds0;
5263 auto_vec<tree> vec_oprnds1;
5264 auto_vec<tree> vect_defs;
5265 auto_vec<gimple *> phis;
5266 int vec_num;
5267 tree def0, def1, tem, op0, op1 = NULL_TREE;
5268 bool first_p = true;
5269 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5270 gimple *cond_expr_induction_def_stmt = NULL;
5271
5272 /* In case of reduction chain we switch to the first stmt in the chain, but
5273 we don't update STMT_INFO, since only the last stmt is marked as reduction
5274 and has reduction properties. */
5275 if (GROUP_FIRST_ELEMENT (stmt_info)
5276 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5277 {
5278 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5279 first_p = false;
5280 }
5281
5282 if (nested_in_vect_loop_p (loop, stmt))
5283 {
5284 outer_loop = loop;
5285 loop = loop->inner;
5286 nested_cycle = true;
5287 }
5288
5289 /* 1. Is vectorizable reduction? */
5290 /* Not supportable if the reduction variable is used in the loop, unless
5291 it's a reduction chain. */
5292 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5293 && !GROUP_FIRST_ELEMENT (stmt_info))
5294 return false;
5295
5296 /* Reductions that are not used even in an enclosing outer-loop
5297 are expected to be "live" (used out of the loop). */
5298 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5299 && !STMT_VINFO_LIVE_P (stmt_info))
5300 return false;
5301
5302 /* Make sure it was already recognized as a reduction computation. */
5303 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5304 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5305 return false;
5306
5307 /* 2. Has this been recognized as a reduction pattern?
5308
5309 Check if STMT represents a pattern that has been recognized
5310 in earlier analysis stages. For stmts that represent a pattern,
5311 the STMT_VINFO_RELATED_STMT field records the last stmt in
5312 the original sequence that constitutes the pattern. */
5313
5314 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5315 if (orig_stmt)
5316 {
5317 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5318 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5319 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5320 }
5321
5322 /* 3. Check the operands of the operation. The first operands are defined
5323 inside the loop body. The last operand is the reduction variable,
5324 which is defined by the loop-header-phi. */
5325
5326 gcc_assert (is_gimple_assign (stmt));
5327
5328 /* Flatten RHS. */
5329 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5330 {
5331 case GIMPLE_SINGLE_RHS:
5332 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5333 if (op_type == ternary_op)
5334 {
5335 tree rhs = gimple_assign_rhs1 (stmt);
5336 ops[0] = TREE_OPERAND (rhs, 0);
5337 ops[1] = TREE_OPERAND (rhs, 1);
5338 ops[2] = TREE_OPERAND (rhs, 2);
5339 code = TREE_CODE (rhs);
5340 }
5341 else
5342 return false;
5343 break;
5344
5345 case GIMPLE_BINARY_RHS:
5346 code = gimple_assign_rhs_code (stmt);
5347 op_type = TREE_CODE_LENGTH (code);
5348 gcc_assert (op_type == binary_op);
5349 ops[0] = gimple_assign_rhs1 (stmt);
5350 ops[1] = gimple_assign_rhs2 (stmt);
5351 break;
5352
5353 case GIMPLE_TERNARY_RHS:
5354 code = gimple_assign_rhs_code (stmt);
5355 op_type = TREE_CODE_LENGTH (code);
5356 gcc_assert (op_type == ternary_op);
5357 ops[0] = gimple_assign_rhs1 (stmt);
5358 ops[1] = gimple_assign_rhs2 (stmt);
5359 ops[2] = gimple_assign_rhs3 (stmt);
5360 break;
5361
5362 case GIMPLE_UNARY_RHS:
5363 return false;
5364
5365 default:
5366 gcc_unreachable ();
5367 }
5368 /* By default the reduction variable is the last operand of the statement. */
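/* An illustrative example: for  s_1 = x + s_0  the reduction operand s_0
   is ops[1] (the last operand), whereas for  s_1 = s_0 - x  it has to come
   first, hence reduc_index 0 for MINUS_EXPR below.  */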
5369 int reduc_index = op_type - 1;
5370 if (code == MINUS_EXPR)
5371 reduc_index = 0;
5372
5373 if (code == COND_EXPR && slp_node)
5374 return false;
5375
5376 scalar_dest = gimple_assign_lhs (stmt);
5377 scalar_type = TREE_TYPE (scalar_dest);
5378 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5379 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5380 return false;
5381
5382 /* Do not try to vectorize bit-precision reductions. */
5383 if ((TYPE_PRECISION (scalar_type)
5384 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5385 return false;
5386
5387 /* All uses but the last are expected to be defined in the loop.
5388 The last use is the reduction variable. In case of nested cycle this
5389 assumption is not true: we use reduc_index to record the index of the
5390 reduction variable. */
5391 for (i = 0; i < op_type; i++)
5392 {
5393 if (i == reduc_index)
5394 continue;
5395
5396 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5397 if (i == 0 && code == COND_EXPR)
5398 continue;
5399
5400 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5401 &def_stmt, &dt, &tem);
5402 if (!vectype_in)
5403 vectype_in = tem;
5404 gcc_assert (is_simple_use);
5405
5406 if (dt != vect_internal_def
5407 && dt != vect_external_def
5408 && dt != vect_constant_def
5409 && dt != vect_induction_def
5410 && !(dt == vect_nested_cycle && nested_cycle))
5411 return false;
5412
5413 if (dt == vect_nested_cycle)
5414 {
5415 found_nested_cycle_def = true;
5416 reduc_def_stmt = def_stmt;
5417 reduc_index = i;
5418 }
5419
5420 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5421 cond_expr_induction_def_stmt = def_stmt;
5422 }
5423
5424 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5425 &def_stmt, &dt, &tem);
5426 if (!vectype_in)
5427 vectype_in = tem;
5428 gcc_assert (is_simple_use);
5429 if (!found_nested_cycle_def)
5430 reduc_def_stmt = def_stmt;
5431
5432 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5433 return false;
5434
5435 if (!(dt == vect_reduction_def
5436 || dt == vect_nested_cycle
5437 || ((dt == vect_internal_def || dt == vect_external_def
5438 || dt == vect_constant_def || dt == vect_induction_def)
5439 && nested_cycle && found_nested_cycle_def)))
5440 {
5441 /* For pattern recognized stmts, orig_stmt might be a reduction,
5442 but some helper statements for the pattern might not be, or
5443 might be COND_EXPRs with reduction uses in the condition. */
5444 gcc_assert (orig_stmt);
5445 return false;
5446 }
5447
5448 enum vect_reduction_type v_reduc_type;
5449 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5450 !nested_cycle, &dummy, false,
5451 &v_reduc_type);
5452
5453 /* If we have a condition reduction, see if we can simplify it further. */
5454 if (v_reduc_type == COND_REDUCTION
5455 && cond_expr_induction_def_stmt != NULL
5456 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5457 {
5458 if (dump_enabled_p ())
5459 dump_printf_loc (MSG_NOTE, vect_location,
5460 "condition expression based on integer induction.\n");
5461 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5462 }
5463 else
5464 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5465
5466 if (orig_stmt)
5467 gcc_assert (tmp == orig_stmt
5468 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5469 else
5470 /* We changed STMT to be the first stmt in reduction chain, hence we
5471 check that in this case the first element in the chain is STMT. */
5472 gcc_assert (stmt == tmp
5473 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5474
5475 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5476 return false;
5477
5478 if (slp_node || PURE_SLP_STMT (stmt_info))
5479 ncopies = 1;
5480 else
5481 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5482 / TYPE_VECTOR_SUBPARTS (vectype_in));
5483
5484 gcc_assert (ncopies >= 1);
5485
5486 vec_mode = TYPE_MODE (vectype_in);
5487
5488 if (code == COND_EXPR)
5489 {
5490 /* Only call during the analysis stage, otherwise we'll lose
5491 STMT_VINFO_TYPE. */
5492 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5493 ops[reduc_index], 0, NULL))
5494 {
5495 if (dump_enabled_p ())
5496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5497 "unsupported condition in reduction\n");
5498 return false;
5499 }
5500 }
5501 else
5502 {
5503 /* 4. Supportable by target? */
5504
5505 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5506 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5507 {
5508 /* Shifts and rotates are only supported by vectorizable_shifts,
5509 not vectorizable_reduction. */
5510 if (dump_enabled_p ())
5511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5512 "unsupported shift or rotation.\n");
5513 return false;
5514 }
5515
5516 /* 4.1. check support for the operation in the loop */
5517 optab = optab_for_tree_code (code, vectype_in, optab_default);
5518 if (!optab)
5519 {
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "no optab.\n");
5523
5524 return false;
5525 }
5526
5527 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5528 {
5529 if (dump_enabled_p ())
5530 dump_printf (MSG_NOTE, "op not supported by target.\n");
5531
5532 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5533 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5534 < vect_min_worthwhile_factor (code))
5535 return false;
5536
5537 if (dump_enabled_p ())
5538 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5539 }
5540
5541 /* Worthwhile without SIMD support? */
5542 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5543 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5544 < vect_min_worthwhile_factor (code))
5545 {
5546 if (dump_enabled_p ())
5547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5548 "not worthwhile without SIMD support.\n");
5549
5550 return false;
5551 }
5552 }
5553
5554 /* 4.2. Check support for the epilog operation.
5555
5556 If STMT represents a reduction pattern, then the type of the
5557 reduction variable may be different than the type of the rest
5558 of the arguments. For example, consider the case of accumulation
5559 of shorts into an int accumulator; The original code:
5560 S1: int_a = (int) short_a;
5561 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5562
5563 was replaced with:
5564 STMT: int_acc = widen_sum <short_a, int_acc>
5565
5566 This means that:
5567 1. The tree-code that is used to create the vector operation in the
5568 epilog code (that reduces the partial results) is not the
5569 tree-code of STMT, but is rather the tree-code of the original
5570 stmt from the pattern that STMT is replacing. I.e, in the example
5571 above we want to use 'widen_sum' in the loop, but 'plus' in the
5572 epilog.
5573 2. The type (mode) we use to check available target support
5574 for the vector operation to be created in the *epilog*, is
5575 determined by the type of the reduction variable (in the example
5576 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5577 However the type (mode) we use to check available target support
5578 for the vector operation to be created *inside the loop*, is
5579 determined by the type of the other arguments to STMT (in the
5580 example we'd check this: optab_handler (widen_sum_optab,
5581 vect_short_mode)).
5582
5583 This is contrary to "regular" reductions, in which the types of all
5584 the arguments are the same as the type of the reduction variable.
5585 For "regular" reductions we can therefore use the same vector type
5586 (and also the same tree-code) when generating the epilog code and
5587 when generating the code inside the loop. */
5588
5589 if (orig_stmt)
5590 {
5591 /* This is a reduction pattern: get the vectype from the type of the
5592 reduction variable, and get the tree-code from orig_stmt. */
5593 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5594 == TREE_CODE_REDUCTION);
5595 orig_code = gimple_assign_rhs_code (orig_stmt);
5596 gcc_assert (vectype_out);
5597 vec_mode = TYPE_MODE (vectype_out);
5598 }
5599 else
5600 {
5601 /* Regular reduction: the same vectype and tree-code used for the
5602 vector code inside the loop can also be used for the epilog code. */
5603 orig_code = code;
5604
5605 if (code == MINUS_EXPR)
5606 orig_code = PLUS_EXPR;
5607
5608 /* For simple condition reductions, replace with the actual expression
5609 we want to base our reduction around. */
5610 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5611 == INTEGER_INDUC_COND_REDUCTION)
5612 orig_code = MAX_EXPR;
5613 }
5614
5615 if (nested_cycle)
5616 {
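/* Check for a double reduction: the value flowing into this nested-cycle
   phi through the inner-loop preheader is itself defined by an outer-loop
   phi that the analysis classified as vect_double_reduction_def.  */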
5617 def_bb = gimple_bb (reduc_def_stmt);
5618 def_stmt_loop = def_bb->loop_father;
5619 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5620 loop_preheader_edge (def_stmt_loop));
5621 if (TREE_CODE (def_arg) == SSA_NAME
5622 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5623 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5624 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5625 && vinfo_for_stmt (def_arg_stmt)
5626 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5627 == vect_double_reduction_def)
5628 double_reduc = true;
5629 }
5630
5631 epilog_reduc_code = ERROR_MARK;
5632
5633 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5634 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5635 == INTEGER_INDUC_COND_REDUCTION)
5636 {
5637 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5638 {
5639 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5640 optab_default);
5641 if (!reduc_optab)
5642 {
5643 if (dump_enabled_p ())
5644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5645 "no optab for reduction.\n");
5646
5647 epilog_reduc_code = ERROR_MARK;
5648 }
5649 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5650 {
5651 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5652 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5653 {
5654 if (dump_enabled_p ())
5655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5656 "reduc op not supported by target.\n");
5657
5658 epilog_reduc_code = ERROR_MARK;
5659 }
5660 }
5661
5662 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5663 generated in the epilog using multiple expressions. This does not
5664 work for condition reductions. */
5665 if (epilog_reduc_code == ERROR_MARK
5666 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5667 == INTEGER_INDUC_COND_REDUCTION)
5668 {
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "no reduc code for scalar code.\n");
5672 return false;
5673 }
5674 }
5675 else
5676 {
5677 if (!nested_cycle || double_reduc)
5678 {
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5681 "no reduc code for scalar code.\n");
5682
5683 return false;
5684 }
5685 }
5686 }
5687 else
5688 {
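/* Condition reductions reduce a vector of loop indexes in the epilogue
   (see the comment at the head of this function), so we need an unsigned
   index type with as many elements as the result vector, and the target
   has to support REDUC_MAX_EXPR on that vector type.  */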
5689 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5690 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5691 cr_index_vector_type = build_vector_type
5692 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5693
5694 epilog_reduc_code = REDUC_MAX_EXPR;
5695 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5696 optab_default);
5697 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5698 == CODE_FOR_nothing)
5699 {
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "reduc max op not supported by target.\n");
5703 return false;
5704 }
5705 }
5706
5707 if ((double_reduc
5708 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5709 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5710 == INTEGER_INDUC_COND_REDUCTION)
5711 && ncopies > 1)
5712 {
5713 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5715 "multiple types in double reduction or condition "
5716 "reduction.\n");
5717 return false;
5718 }
5719
5720 /* In case of widening multiplication by a constant, we update the type
5721 of the constant to be the type of the other operand. We check that the
5722 constant fits the type in the pattern recognition pass. */
5723 if (code == DOT_PROD_EXPR
5724 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5725 {
5726 if (TREE_CODE (ops[0]) == INTEGER_CST)
5727 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5728 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5729 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5730 else
5731 {
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5734 "invalid types in dot-prod\n");
5735
5736 return false;
5737 }
5738 }
5739
5740 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5741 {
5742 widest_int ni;
5743
5744 if (! max_loop_iterations (loop, &ni))
5745 {
5746 if (dump_enabled_p ())
5747 dump_printf_loc (MSG_NOTE, vect_location,
5748 "loop count not known, cannot create cond "
5749 "reduction.\n");
5750 return false;
5751 }
5752 /* Convert backedges to iterations. */
5753 ni += 1;
5754
5755 /* The additional index will be the same type as the condition. Check
5756 that the loop iteration count fits into this type less one (the zero
5757 slot is reserved for the case where there are no matches). */
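/* For instance (hypothetical numbers): with an unsigned 8-bit index type
   MAX_INDEX is 255, so a loop of at most 254 iterations is fine, while
   one that may run 255 or more iterations is rejected here.  */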
5758 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5759 if (wi::geu_p (ni, wi::to_widest (max_index)))
5760 {
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_NOTE, vect_location,
5763 "loop size is greater than data size.\n");
5764 return false;
5765 }
5766 }
5767
5768 if (!vec_stmt) /* transformation not required. */
5769 {
5770 if (first_p
5771 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5772 reduc_index))
5773 return false;
5774 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5775 return true;
5776 }
5777
5778 /** Transform. **/
5779
5780 if (dump_enabled_p ())
5781 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5782
5783 /* FORNOW: Multiple types are not supported for condition. */
5784 if (code == COND_EXPR)
5785 gcc_assert (ncopies == 1);
5786
5787 /* Create the destination vector */
5788 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5789
5790 /* In case the vectorization factor (VF) is bigger than the number
5791 of elements that we can fit in a vectype (nunits), we have to generate
5792 more than one vector stmt - i.e., we need to "unroll" the
5793 vector stmt by a factor VF/nunits. For more details see documentation
5794 in vectorizable_operation. */
5795
5796 /* If the reduction is used in an outer loop we need to generate
5797 VF intermediate results, like so (e.g. for ncopies=2):
5798 r0 = phi (init, r0)
5799 r1 = phi (init, r1)
5800 r0 = x0 + r0;
5801 r1 = x1 + r1;
5802 (i.e. we generate VF results in 2 registers).
5803 In this case we have a separate def-use cycle for each copy, and therefore
5804 for each copy we get the vector def for the reduction variable from the
5805 respective phi node created for this copy.
5806
5807 Otherwise (the reduction is unused in the loop nest), we can combine
5808 together intermediate results, like so (e.g. for ncopies=2):
5809 r = phi (init, r)
5810 r = x0 + r;
5811 r = x1 + r;
5812 (i.e. we generate VF/2 results in a single register).
5813 In this case for each copy we get the vector def for the reduction variable
5814 from the vectorized reduction operation generated in the previous iteration.
5815 */
5816
5817 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5818 {
5819 single_defuse_cycle = true;
5820 epilog_copies = 1;
5821 }
5822 else
5823 epilog_copies = ncopies;
5824
5825 prev_stmt_info = NULL;
5826 prev_phi_info = NULL;
5827 if (slp_node)
5828 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5829 else
5830 {
5831 vec_num = 1;
5832 vec_oprnds0.create (1);
5833 if (op_type == ternary_op)
5834 vec_oprnds1.create (1);
5835 }
5836
5837 phis.create (vec_num);
5838 vect_defs.create (vec_num);
5839 if (!slp_node)
5840 vect_defs.quick_push (NULL_TREE);
5841
5842 for (j = 0; j < ncopies; j++)
5843 {
5844 if (j == 0 || !single_defuse_cycle)
5845 {
5846 for (i = 0; i < vec_num; i++)
5847 {
5848 /* Create the reduction-phi that defines the reduction
5849 operand. */
5850 new_phi = create_phi_node (vec_dest, loop->header);
5851 set_vinfo_for_stmt (new_phi,
5852 new_stmt_vec_info (new_phi, loop_vinfo));
5853 if (j == 0 || slp_node)
5854 phis.quick_push (new_phi);
5855 }
5856 }
5857
5858 if (code == COND_EXPR)
5859 {
5860 gcc_assert (!slp_node);
5861 vectorizable_condition (stmt, gsi, vec_stmt,
5862 PHI_RESULT (phis[0]),
5863 reduc_index, NULL);
5864 /* Multiple types are not supported for condition. */
5865 break;
5866 }
5867
5868 /* Handle uses. */
5869 if (j == 0)
5870 {
5871 op0 = ops[!reduc_index];
5872 if (op_type == ternary_op)
5873 {
5874 if (reduc_index == 0)
5875 op1 = ops[2];
5876 else
5877 op1 = ops[1];
5878 }
5879
5880 if (slp_node)
5881 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5882 slp_node, -1);
5883 else
5884 {
5885 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5886 stmt);
5887 vec_oprnds0.quick_push (loop_vec_def0);
5888 if (op_type == ternary_op)
5889 {
5890 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5891 vec_oprnds1.quick_push (loop_vec_def1);
5892 }
5893 }
5894 }
5895 else
5896 {
5897 if (!slp_node)
5898 {
5899 enum vect_def_type dt;
5900 gimple *dummy_stmt;
5901
5902 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5903 &dummy_stmt, &dt);
5904 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5905 loop_vec_def0);
5906 vec_oprnds0[0] = loop_vec_def0;
5907 if (op_type == ternary_op)
5908 {
5909 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5910 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5911 loop_vec_def1);
5912 vec_oprnds1[0] = loop_vec_def1;
5913 }
5914 }
5915
5916 if (single_defuse_cycle)
5917 reduc_def = gimple_assign_lhs (new_stmt);
5918
5919 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5920 }
5921
5922 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5923 {
5924 if (slp_node)
5925 reduc_def = PHI_RESULT (phis[i]);
5926 else
5927 {
5928 if (!single_defuse_cycle || j == 0)
5929 reduc_def = PHI_RESULT (new_phi);
5930 }
5931
5932 def1 = ((op_type == ternary_op)
5933 ? vec_oprnds1[i] : NULL);
5934 if (op_type == binary_op)
5935 {
5936 if (reduc_index == 0)
5937 expr = build2 (code, vectype_out, reduc_def, def0);
5938 else
5939 expr = build2 (code, vectype_out, def0, reduc_def);
5940 }
5941 else
5942 {
5943 if (reduc_index == 0)
5944 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5945 else
5946 {
5947 if (reduc_index == 1)
5948 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5949 else
5950 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5951 }
5952 }
5953
5954 new_stmt = gimple_build_assign (vec_dest, expr);
5955 new_temp = make_ssa_name (vec_dest, new_stmt);
5956 gimple_assign_set_lhs (new_stmt, new_temp);
5957 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5958
5959 if (slp_node)
5960 {
5961 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5962 vect_defs.quick_push (new_temp);
5963 }
5964 else
5965 vect_defs[0] = new_temp;
5966 }
5967
5968 if (slp_node)
5969 continue;
5970
5971 if (j == 0)
5972 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5973 else
5974 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5975
5976 prev_stmt_info = vinfo_for_stmt (new_stmt);
5977 prev_phi_info = vinfo_for_stmt (new_phi);
5978 }
5979
5980 tree indx_before_incr, indx_after_incr, cond_name = NULL;
5981
5982 /* Finalize the reduction-phi (set its arguments) and create the
5983 epilog reduction code. */
5984 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5985 {
5986 new_temp = gimple_assign_lhs (*vec_stmt);
5987 vect_defs[0] = new_temp;
5988
5989 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5990 which is updated with the current index of the loop for every match of
5991 the original loop's cond_expr (VEC_STMT). This results in a vector
5992 containing the last time the condition passed for that vector lane.
5993 The first match will be a 1 to allow 0 to be used for non-matching
5994 indexes. If there are no matches at all then the vector will be all
5995 zeroes. */
5996 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5997 {
5998 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
5999 int k;
6000
6001 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6002
6003 /* First we create a simple vector induction variable which starts
6004 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6005 vector size (STEP). */
6006
6007 /* Create a {1,2,3,...} vector. */
6008 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6009 for (k = 0; k < nunits_out; ++k)
6010 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6011 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6012
6013 /* Create a vector of the step value. */
6014 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6015 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6016
6017 /* Create an induction variable. */
6018 gimple_stmt_iterator incr_gsi;
6019 bool insert_after;
6020 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6021 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6022 insert_after, &indx_before_incr, &indx_after_incr);
6023
6024 /* Next create a new vector phi node (NEW_PHI_TREE) which starts out
6025 filled with zeros (VEC_ZERO). */
6026
6027 /* Create a vector of 0s. */
6028 tree zero = build_zero_cst (cr_index_scalar_type);
6029 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6030
6031 /* Create a vector phi node. */
6032 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6033 new_phi = create_phi_node (new_phi_tree, loop->header);
6034 set_vinfo_for_stmt (new_phi,
6035 new_stmt_vec_info (new_phi, loop_vinfo));
6036 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6037 UNKNOWN_LOCATION);
6038
6039 /* Now take the condition from the loop's original cond_expr
6040 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6041 every match uses values from the induction variable
6042 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6043 (NEW_PHI_TREE).
6044 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6045 the new cond_expr (INDEX_COND_EXPR). */
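/* A rough 4-lane sketch of what is generated below (names illustrative):
     ivtmp_before = PHI <{1,2,3,4}(preheader), ivtmp_after(latch)>
     new_phi      = PHI <{0,0,0,0}(preheader), index_cond(latch)>
     index_cond   = ccompare ? ivtmp_before : new_phi
     ivtmp_after  = ivtmp_before + {4,4,4,4}
   so each lane of index_cond ends up holding the 1-based index of the
   last scalar iteration for which the condition held in that lane, or 0
   if it never held.  */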
6046
6047 /* Turn the condition from vec_stmt into an ssa name. */
6048 gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6049 tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6050 tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6051 gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6052 ccompare);
6053 gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6054 gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6055 update_stmt (*vec_stmt);
6056
6057 /* Create a conditional where the condition is taken from vec_stmt
6058 (CCOMPARE_NAME), the "then" value is the induction index
6059 (INDEX_BEFORE_INCR) and the "else" value is the phi (NEW_PHI_TREE). */
6060 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6061 ccompare_name, indx_before_incr,
6062 new_phi_tree);
6063 cond_name = make_ssa_name (cr_index_vector_type);
6064 gimple *index_condition = gimple_build_assign (cond_name,
6065 index_cond_expr);
6066 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6067 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6068 loop_vinfo);
6069 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6070 set_vinfo_for_stmt (index_condition, index_vec_info);
6071
6072 /* Update the phi with the vec cond. */
6073 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6074 UNKNOWN_LOCATION);
6075 }
6076 }
6077
6078 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6079 epilog_reduc_code, phis, reduc_index,
6080 double_reduc, slp_node, cond_name);
6081
6082 return true;
6083 }
6084
6085 /* Function vect_min_worthwhile_factor.
6086
6087 For a loop where we could vectorize the operation indicated by CODE,
6088 return the minimum vectorization factor that makes it worthwhile
6089 to use generic vectors. */
6090 int
6091 vect_min_worthwhile_factor (enum tree_code code)
6092 {
6093 switch (code)
6094 {
6095 case PLUS_EXPR:
6096 case MINUS_EXPR:
6097 case NEGATE_EXPR:
6098 return 4;
6099
6100 case BIT_AND_EXPR:
6101 case BIT_IOR_EXPR:
6102 case BIT_XOR_EXPR:
6103 case BIT_NOT_EXPR:
6104 return 2;
6105
6106 default:
6107 return INT_MAX;
6108 }
6109 }
6110
6111
6112 /* Function vectorizable_induction
6113
6114 Check if PHI performs an induction computation that can be vectorized.
6115 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6116 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6117 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6118
6119 bool
6120 vectorizable_induction (gimple *phi,
6121 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6122 gimple **vec_stmt)
6123 {
6124 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6125 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6128 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6129 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6130 tree vec_def;
6131
6132 gcc_assert (ncopies >= 1);
6133 /* FORNOW. These restrictions should be relaxed. */
6134 if (nested_in_vect_loop_p (loop, phi))
6135 {
6136 imm_use_iterator imm_iter;
6137 use_operand_p use_p;
6138 gimple *exit_phi;
6139 edge latch_e;
6140 tree loop_arg;
6141
6142 if (ncopies > 1)
6143 {
6144 if (dump_enabled_p ())
6145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146 "multiple types in nested loop.\n");
6147 return false;
6148 }
6149
6150 exit_phi = NULL;
6151 latch_e = loop_latch_edge (loop->inner);
6152 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6153 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6154 {
6155 gimple *use_stmt = USE_STMT (use_p);
6156 if (is_gimple_debug (use_stmt))
6157 continue;
6158
6159 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6160 {
6161 exit_phi = use_stmt;
6162 break;
6163 }
6164 }
6165 if (exit_phi)
6166 {
6167 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6168 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6169 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6170 {
6171 if (dump_enabled_p ())
6172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6173 "inner-loop induction only used outside "
6174 "of the outer vectorized loop.\n");
6175 return false;
6176 }
6177 }
6178 }
6179
6180 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6181 return false;
6182
6183 /* FORNOW: SLP not supported. */
6184 if (STMT_SLP_TYPE (stmt_info))
6185 return false;
6186
6187 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6188
6189 if (gimple_code (phi) != GIMPLE_PHI)
6190 return false;
6191
6192 if (!vec_stmt) /* transformation not required. */
6193 {
6194 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6195 if (dump_enabled_p ())
6196 dump_printf_loc (MSG_NOTE, vect_location,
6197 "=== vectorizable_induction ===\n");
6198 vect_model_induction_cost (stmt_info, ncopies);
6199 return true;
6200 }
6201
6202 /** Transform. **/
6203
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6206
6207 vec_def = get_initial_def_for_induction (phi);
6208 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6209 return true;
6210 }
6211
6212 /* Function vectorizable_live_operation.
6213
6214 STMT computes a value that is used outside the loop. Check if
6215 it can be supported. */
6216
6217 bool
6218 vectorizable_live_operation (gimple *stmt,
6219 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6220 gimple **vec_stmt)
6221 {
6222 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6223 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6225 tree op;
6226 gimple *def_stmt;
6227 ssa_op_iter iter;
6228
6229 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6230
6231 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6232 return false;
6233
6234 if (!is_gimple_assign (stmt))
6235 {
6236 if (gimple_call_internal_p (stmt)
6237 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6238 && gimple_call_lhs (stmt)
6239 && loop->simduid
6240 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6241 && loop->simduid
6242 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6243 {
6244 edge e = single_exit (loop);
6245 basic_block merge_bb = e->dest;
6246 imm_use_iterator imm_iter;
6247 use_operand_p use_p;
6248 tree lhs = gimple_call_lhs (stmt);
6249
6250 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6251 {
6252 gimple *use_stmt = USE_STMT (use_p);
6253 if (gimple_code (use_stmt) == GIMPLE_PHI
6254 && gimple_bb (use_stmt) == merge_bb)
6255 {
6256 if (vec_stmt)
6257 {
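/* The simd lane value live after the loop corresponds, in effect, to
   the last lane of the vectorized body, i.e. the vectorization factor
   minus one.  */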
6258 tree vfm1
6259 = build_int_cst (unsigned_type_node,
6260 loop_vinfo->vectorization_factor - 1);
6261 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6262 }
6263 return true;
6264 }
6265 }
6266 }
6267
6268 return false;
6269 }
6270
6271 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6272 return false;
6273
6274 /* FORNOW. CHECKME. */
6275 if (nested_in_vect_loop_p (loop, stmt))
6276 return false;
6277
6278 /* FORNOW: support only if all uses are invariant. This means
6279 that the scalar operations can remain in place, unvectorized.
6280 The original last scalar value that they compute will be used. */
6281 FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6282 {
6283 enum vect_def_type dt = vect_uninitialized_def;
6284
6285 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6286 {
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "use not simple.\n");
6290 return false;
6291 }
6292
6293 if (dt != vect_external_def && dt != vect_constant_def)
6294 return false;
6295 }
6296
6297 /* No transformation is required for the cases we currently support. */
6298 return true;
6299 }
6300
6301 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6302
6303 static void
6304 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6305 {
6306 ssa_op_iter op_iter;
6307 imm_use_iterator imm_iter;
6308 def_operand_p def_p;
6309 gimple *ustmt;
6310
6311 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6312 {
6313 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6314 {
6315 basic_block bb;
6316
6317 if (!is_gimple_debug (ustmt))
6318 continue;
6319
6320 bb = gimple_bb (ustmt);
6321
6322 if (!flow_bb_inside_loop_p (loop, bb))
6323 {
6324 if (gimple_debug_bind_p (ustmt))
6325 {
6326 if (dump_enabled_p ())
6327 dump_printf_loc (MSG_NOTE, vect_location,
6328 "killing debug use\n");
6329
6330 gimple_debug_bind_reset_value (ustmt);
6331 update_stmt (ustmt);
6332 }
6333 else
6334 gcc_unreachable ();
6335 }
6336 }
6337 }
6338 }
6339
6340
6341 /* This function builds ni_name = number of iterations. Statements
6342 are emitted on the loop preheader edge. */
6343
6344 static tree
6345 vect_build_loop_niters (loop_vec_info loop_vinfo)
6346 {
6347 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6348 if (TREE_CODE (ni) == INTEGER_CST)
6349 return ni;
6350 else
6351 {
6352 tree ni_name, var;
6353 gimple_seq stmts = NULL;
6354 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6355
6356 var = create_tmp_var (TREE_TYPE (ni), "niters");
6357 ni_name = force_gimple_operand (ni, &stmts, false, var);
6358 if (stmts)
6359 gsi_insert_seq_on_edge_immediate (pe, stmts);
6360
6361 return ni_name;
6362 }
6363 }
6364
6365
6366 /* This function generates the following statements:
6367
6368 ni_name = number of iterations loop executes
6369 ratio = ni_name / vf
6370 ratio_mult_vf_name = ratio * vf
6371
6372 and places them on the loop preheader edge. */
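/* For example (hypothetical numbers): with ni_name = 103 and vf = 8 this
   yields ratio = 12 and ratio_mult_vf_name = 96, so the vectorized loop
   covers 96 scalar iterations and the scalar epilogue handles the
   remaining 7 (assuming no peeling for gaps).  */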
6373
6374 static void
6375 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6376 tree ni_name,
6377 tree *ratio_mult_vf_name_ptr,
6378 tree *ratio_name_ptr)
6379 {
6380 tree ni_minus_gap_name;
6381 tree var;
6382 tree ratio_name;
6383 tree ratio_mult_vf_name;
6384 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6385 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6386 tree log_vf;
6387
6388 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6389
6390 /* If an epilogue loop is required because of data accesses with gaps, we
6391 subtract one iteration from the total number of iterations here for
6392 correct calculation of RATIO. */
6393 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6394 {
6395 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6396 ni_name,
6397 build_one_cst (TREE_TYPE (ni_name)));
6398 if (!is_gimple_val (ni_minus_gap_name))
6399 {
6400 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6401 gimple *stmts = NULL;
6402 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6403 true, var);
6404 gsi_insert_seq_on_edge_immediate (pe, stmts);
6405 }
6406 }
6407 else
6408 ni_minus_gap_name = ni_name;
6409
6410 /* Create: ratio = ni >> log2(vf) */
6411 /* ??? As we have ni == number of latch executions + 1, ni could
6412 have overflowed to zero. So avoid computing ratio based on ni
6413 but compute it using the fact that we know ratio will be at least
6414 one, thus via (ni - vf) >> log2(vf) + 1. */
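/* Illustration, assuming a 32-bit unsigned NI_NAME and vf = 4: if the
   latch executes 0xffffffff times, ni wraps to 0 and 0 >> 2 would give a
   ratio of 0, whereas (0 - 4) >> 2 + 1 = 0x3fffffff + 1 = 0x40000000,
   the correct number of vector iterations.  */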
6415 ratio_name
6416 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6417 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6418 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6419 ni_minus_gap_name,
6420 build_int_cst
6421 (TREE_TYPE (ni_name), vf)),
6422 log_vf),
6423 build_int_cst (TREE_TYPE (ni_name), 1));
6424 if (!is_gimple_val (ratio_name))
6425 {
6426 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6427 gimple *stmts = NULL;
6428 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6429 gsi_insert_seq_on_edge_immediate (pe, stmts);
6430 }
6431 *ratio_name_ptr = ratio_name;
6432
6433 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6434
6435 if (ratio_mult_vf_name_ptr)
6436 {
6437 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6438 ratio_name, log_vf);
6439 if (!is_gimple_val (ratio_mult_vf_name))
6440 {
6441 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6442 gimple *stmts = NULL;
6443 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6444 true, var);
6445 gsi_insert_seq_on_edge_immediate (pe, stmts);
6446 }
6447 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6448 }
6449
6450 return;
6451 }
6452
6453
6454 /* Function vect_transform_loop.
6455
6456 The analysis phase has determined that the loop is vectorizable.
6457 Vectorize the loop - create vectorized stmts to replace the scalar
6458 stmts in the loop, and update the loop exit condition. */
6459
6460 void
6461 vect_transform_loop (loop_vec_info loop_vinfo)
6462 {
6463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6465 int nbbs = loop->num_nodes;
6466 int i;
6467 tree ratio = NULL;
6468 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6469 bool grouped_store;
6470 bool slp_scheduled = false;
6471 gimple *stmt, *pattern_stmt;
6472 gimple_seq pattern_def_seq = NULL;
6473 gimple_stmt_iterator pattern_def_si = gsi_none ();
6474 bool transform_pattern_stmt = false;
6475 bool check_profitability = false;
6476 int th;
6477 /* Record number of iterations before we started tampering with the profile. */
6478 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6479
6480 if (dump_enabled_p ())
6481 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6482
6483 /* If the profile is imprecise, we have a chance to fix it up. */
6484 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6485 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6486
6487 /* Use the more conservative vectorization threshold. If the number
6488 of iterations is constant assume the cost check has been performed
6489 by our caller. If the threshold makes all loops profitable that
6490 run at least the vectorization factor number of times, checking
6491 is pointless, too. */
6492 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6493 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6494 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6495 {
6496 if (dump_enabled_p ())
6497 dump_printf_loc (MSG_NOTE, vect_location,
6498 "Profitability threshold is %d loop iterations.\n",
6499 th);
6500 check_profitability = true;
6501 }
6502
6503 /* Version the loop first, if required, so the profitability check
6504 comes first. */
6505
6506 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6507 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6508 {
6509 vect_loop_versioning (loop_vinfo, th, check_profitability);
6510 check_profitability = false;
6511 }
6512
6513 tree ni_name = vect_build_loop_niters (loop_vinfo);
6514 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6515
6516 /* Peel the loop if there are data refs with unknown alignment.
6517 Only one data ref with unknown store is allowed. */
6518
6519 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6520 {
6521 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6522 th, check_profitability);
6523 check_profitability = false;
6524 /* The above adjusts LOOP_VINFO_NITERS, so force ni_name to
6525 be re-computed. */
6526 ni_name = NULL_TREE;
6527 }
6528
6529 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6530 compile time constant), or it is a constant that doesn't divide by the
6531 vectorization factor, then an epilog loop needs to be created.
6532 We therefore duplicate the loop: the original loop will be vectorized,
6533 and will compute the first (n/VF) iterations. The second copy of the loop
6534 will remain scalar and will compute the remaining (n%VF) iterations.
6535 (VF is the vectorization factor). */
6536
6537 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6538 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6539 {
6540 tree ratio_mult_vf;
6541 if (!ni_name)
6542 ni_name = vect_build_loop_niters (loop_vinfo);
6543 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6544 &ratio);
6545 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6546 th, check_profitability);
6547 }
6548 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6549 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6550 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6551 else
6552 {
6553 if (!ni_name)
6554 ni_name = vect_build_loop_niters (loop_vinfo);
6555 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6556 }
6557
6558 /* 1) Make sure the loop header has exactly two entries
6559 2) Make sure we have a preheader basic block. */
6560
6561 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6562
6563 split_edge (loop_preheader_edge (loop));
6564
6565 /* FORNOW: the vectorizer supports only loops whose body consists
6566 of one basic block (header + empty latch). When the vectorizer
6567 supports more involved loop forms, the order in which the BBs are
6568 traversed needs to be reconsidered. */
6569
6570 for (i = 0; i < nbbs; i++)
6571 {
6572 basic_block bb = bbs[i];
6573 stmt_vec_info stmt_info;
6574
6575 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6576 gsi_next (&si))
6577 {
6578 gphi *phi = si.phi ();
6579 if (dump_enabled_p ())
6580 {
6581 dump_printf_loc (MSG_NOTE, vect_location,
6582 "------>vectorizing phi: ");
6583 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6584 dump_printf (MSG_NOTE, "\n");
6585 }
6586 stmt_info = vinfo_for_stmt (phi);
6587 if (!stmt_info)
6588 continue;
6589
6590 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6591 vect_loop_kill_debug_uses (loop, phi);
6592
6593 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6594 && !STMT_VINFO_LIVE_P (stmt_info))
6595 continue;
6596
6597 if (STMT_VINFO_VECTYPE (stmt_info)
6598 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6599 != (unsigned HOST_WIDE_INT) vectorization_factor)
6600 && dump_enabled_p ())
6601 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6602
6603 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6604 {
6605 if (dump_enabled_p ())
6606 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6607 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6608 }
6609 }
6610
6611 pattern_stmt = NULL;
6612 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6613 !gsi_end_p (si) || transform_pattern_stmt;)
6614 {
6615 bool is_store;
6616
6617 if (transform_pattern_stmt)
6618 stmt = pattern_stmt;
6619 else
6620 {
6621 stmt = gsi_stmt (si);
6622 /* During vectorization remove existing clobber stmts. */
6623 if (gimple_clobber_p (stmt))
6624 {
6625 unlink_stmt_vdef (stmt);
6626 gsi_remove (&si, true);
6627 release_defs (stmt);
6628 continue;
6629 }
6630 }
6631
6632 if (dump_enabled_p ())
6633 {
6634 dump_printf_loc (MSG_NOTE, vect_location,
6635 "------>vectorizing statement: ");
6636 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6637 dump_printf (MSG_NOTE, "\n");
6638 }
6639
6640 stmt_info = vinfo_for_stmt (stmt);
6641
6642 /* vector stmts created in the outer-loop during vectorization of
6643 stmts in an inner-loop may not have a stmt_info, and do not
6644 need to be vectorized. */
6645 if (!stmt_info)
6646 {
6647 gsi_next (&si);
6648 continue;
6649 }
6650
6651 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6652 vect_loop_kill_debug_uses (loop, stmt);
6653
6654 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6655 && !STMT_VINFO_LIVE_P (stmt_info))
6656 {
6657 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6658 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6659 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6660 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6661 {
6662 stmt = pattern_stmt;
6663 stmt_info = vinfo_for_stmt (stmt);
6664 }
6665 else
6666 {
6667 gsi_next (&si);
6668 continue;
6669 }
6670 }
6671 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6672 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6673 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6674 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6675 transform_pattern_stmt = true;
6676
6677 /* If pattern statement has def stmts, vectorize them too. */
6678 if (is_pattern_stmt_p (stmt_info))
6679 {
6680 if (pattern_def_seq == NULL)
6681 {
6682 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6683 pattern_def_si = gsi_start (pattern_def_seq);
6684 }
6685 else if (!gsi_end_p (pattern_def_si))
6686 gsi_next (&pattern_def_si);
6687 if (pattern_def_seq != NULL)
6688 {
6689 gimple *pattern_def_stmt = NULL;
6690 stmt_vec_info pattern_def_stmt_info = NULL;
6691
6692 while (!gsi_end_p (pattern_def_si))
6693 {
6694 pattern_def_stmt = gsi_stmt (pattern_def_si);
6695 pattern_def_stmt_info
6696 = vinfo_for_stmt (pattern_def_stmt);
6697 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6698 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6699 break;
6700 gsi_next (&pattern_def_si);
6701 }
6702
6703 if (!gsi_end_p (pattern_def_si))
6704 {
6705 if (dump_enabled_p ())
6706 {
6707 dump_printf_loc (MSG_NOTE, vect_location,
6708 "==> vectorizing pattern def "
6709 "stmt: ");
6710 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6711 pattern_def_stmt, 0);
6712 dump_printf (MSG_NOTE, "\n");
6713 }
6714
6715 stmt = pattern_def_stmt;
6716 stmt_info = pattern_def_stmt_info;
6717 }
6718 else
6719 {
6720 pattern_def_si = gsi_none ();
6721 transform_pattern_stmt = false;
6722 }
6723 }
6724 else
6725 transform_pattern_stmt = false;
6726 }
6727
6728 if (STMT_VINFO_VECTYPE (stmt_info))
6729 {
6730 unsigned int nunits
6731 = (unsigned int)
6732 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6733 if (!STMT_SLP_TYPE (stmt_info)
6734 && nunits != (unsigned int) vectorization_factor
6735 && dump_enabled_p ())
6736 /* For SLP VF is set according to unrolling factor, and not
6737 to vector size, hence for SLP this print is not valid. */
6738 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6739 }
6740
6741 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6742 reached. */
6743 if (STMT_SLP_TYPE (stmt_info))
6744 {
6745 if (!slp_scheduled)
6746 {
6747 slp_scheduled = true;
6748
6749 if (dump_enabled_p ())
6750 dump_printf_loc (MSG_NOTE, vect_location,
6751 "=== scheduling SLP instances ===\n");
6752
6753 vect_schedule_slp (loop_vinfo);
6754 }
6755
6756 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6757 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6758 {
6759 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6760 {
6761 pattern_def_seq = NULL;
6762 gsi_next (&si);
6763 }
6764 continue;
6765 }
6766 }
6767
6768 /* -------- vectorize statement ------------ */
6769 if (dump_enabled_p ())
6770 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6771
6772 grouped_store = false;
6773 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6774 if (is_store)
6775 {
6776 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6777 {
6778 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6779 interleaving chain was completed - free all the stores in
6780 the chain. */
6781 gsi_next (&si);
6782 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6783 }
6784 else
6785 {
6786 /* Free the attached stmt_vec_info and remove the stmt. */
6787 gimple *store = gsi_stmt (si);
6788 free_stmt_vec_info (store);
6789 unlink_stmt_vdef (store);
6790 gsi_remove (&si, true);
6791 release_defs (store);
6792 }
6793
6794 /* Stores can only appear at the end of pattern statements. */
6795 gcc_assert (!transform_pattern_stmt);
6796 pattern_def_seq = NULL;
6797 }
6798 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6799 {
6800 pattern_def_seq = NULL;
6801 gsi_next (&si);
6802 }
6803 } /* stmts in BB */
6804 } /* BBs in loop */
6805
6806 slpeel_make_loop_iterate_ntimes (loop, ratio);
6807
6808 /* Reduce loop iterations by the vectorization factor. */
6809 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6810 expected_iterations / vectorization_factor);
6811 loop->nb_iterations_upper_bound
6812 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6813 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6814 && loop->nb_iterations_upper_bound != 0)
6815 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6816 if (loop->any_estimate)
6817 {
6818 loop->nb_iterations_estimate
6819 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6820 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6821 && loop->nb_iterations_estimate != 0)
6822 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6823 }
6824
6825 if (dump_enabled_p ())
6826 {
6827 dump_printf_loc (MSG_NOTE, vect_location,
6828 "LOOP VECTORIZED\n");
6829 if (loop->inner)
6830 dump_printf_loc (MSG_NOTE, vect_location,
6831 "OUTER LOOP VECTORIZED\n");
6832 dump_printf (MSG_NOTE, "\n");
6833 }
6834 }