gcc/tree-vect-loop-manip.c

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50
  51 /*************************************************************************
  52   Simple Loop Peeling Utilities
  53
  54   Utilities to support loop peeling for vectorization purposes.
  55  *************************************************************************/
  56
  57
  58 /* Renames the use *OP_P.  */
  59
  60 static void
  61 rename_use_op (use_operand_p op_p)
  62 {
  63   tree new_name;
  64
  65   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  66     return;
  67
  68   new_name = get_current_def (USE_FROM_PTR (op_p));
  69
  70   /* Something defined outside of the loop.  */
  71   if (!new_name)
  72     return;
  73
  74   /* An ordinary ssa name defined in the loop.  */
  75
  76   SET_USE (op_p, new_name);
  77 }
  78
  79
  80 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  81    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  82    true.  */
  83
  84 static void
  85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  86 {
  87   gimple *stmt;
  88   use_operand_p use_p;
  89   ssa_op_iter iter;
  90   edge e;
  91   edge_iterator ei;
  92   class loop *loop = bb->loop_father;
  93   class loop *outer_loop = NULL;
  94
  95   if (rename_from_outer_loop)
  96     {
  97       gcc_assert (loop);
  98       outer_loop = loop_outer (loop);
  99     }
 100
 101   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 102        gsi_next (&gsi))
 103     {
 104       stmt = gsi_stmt (gsi);
 105       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 106         rename_use_op (use_p);
 107     }
 108
 109   FOR_EACH_EDGE (e, ei, bb->preds)
 110     {
 111       if (!flow_bb_inside_loop_p (loop, e->src))
 112         {
 113           if (!rename_from_outer_loop)
 114             continue;
 115           if (e->src != outer_loop->header)
 116             {
 117               if (outer_loop->inner->next)
 118                 {
 119                   /* If outer_loop has 2 inner loops, allow there to
 120                      be an extra basic block which decides which of the
 121                      two loops to use using LOOP_VECTORIZED.  */
 122                   if (!single_pred_p (e->src)
 123                       || single_pred (e->src) != outer_loop->header)
 124                     continue;
 125                 }
 126             }
 127         }
 128       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 129            gsi_next (&gsi))
 130         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 131     }
 132 }
 133
 134
 135 struct adjust_info
 136 {
 137   tree from, to;
 138   basic_block bb;
 139 };
 140
 141 /* A stack of values to be adjusted in debug stmts.  We have to
 142    process them LIFO, so that the closest substitution applies.  If we
 143    processed them FIFO, without the stack, we might substitute uses
 144    with a PHI DEF that would soon become non-dominant, and when we got
 145    to the suitable one, it wouldn't have anything to substitute any
 146    more.  */
 147 static vec<adjust_info, va_heap> adjust_vec;
 148
 149 /* Adjust any debug stmts that referenced AI->from values to use the
 150    loop-closed AI->to, if the references are dominated by AI->bb and
 151    not by the definition of AI->from.  */
 152
 153 static void
 154 adjust_debug_stmts_now (adjust_info *ai)
 155 {
 156   basic_block bbphi = ai->bb;
 157   tree orig_def = ai->from;
 158   tree new_def = ai->to;
 159   imm_use_iterator imm_iter;
 160   gimple *stmt;
 161   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 162
 163   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 164
 165   /* Adjust any debug stmts that held onto non-loop-closed
 166      references.  */
 167   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 168     {
 169       use_operand_p use_p;
 170       basic_block bbuse;
 171
 172       if (!is_gimple_debug (stmt))
 173         continue;
 174
 175       gcc_assert (gimple_debug_bind_p (stmt));
 176
 177       bbuse = gimple_bb (stmt);
 178
 179       if ((bbuse == bbphi
 180            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 181           && !(bbuse == bbdef
 182                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 183         {
 184           if (new_def)
 185             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 186               SET_USE (use_p, new_def);
 187           else
 188             {
 189               gimple_debug_bind_reset_value (stmt);
 190               update_stmt (stmt);
 191             }
 192         }
 193     }
 194 }
 195
 196 /* Adjust debug stmts as scheduled before.  */
 197
 198 static void
 199 adjust_vec_debug_stmts (void)
 200 {
 201   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 202     return;
 203
 204   gcc_assert (adjust_vec.exists ());
 205
 206   while (!adjust_vec.is_empty ())
 207     {
 208       adjust_debug_stmts_now (&adjust_vec.last ());
 209       adjust_vec.pop ();
 210     }
 211 }
 212
 213 /* Adjust any debug stmts that referenced FROM values to use the
 214    loop-closed TO, if the references are dominated by BB and not by
 215    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 216    will be postponed until adjust_vec_debug_stmts is called.  */
 217
 218 static void
 219 adjust_debug_stmts (tree from, tree to, basic_block bb)
 220 {
 221   adjust_info ai;
 222
 223   if (MAY_HAVE_DEBUG_BIND_STMTS
 224       && TREE_CODE (from) == SSA_NAME
 225       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 226       && ! virtual_operand_p (from))
 227     {
 228       ai.from = from;
 229       ai.to = to;
 230       ai.bb = bb;
 231
 232       if (adjust_vec.exists ())
 233         adjust_vec.safe_push (ai);
 234       else
 235         adjust_debug_stmts_now (&ai);
 236     }
 237 }
 238
 239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 240    to adjust any debug stmts that referenced the old phi arg,
 241    presumably non-loop-closed references left over from other
 242    transformations.  */
 243
 244 static void
 245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 246 {
 247   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 248
 249   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 250
 251   if (MAY_HAVE_DEBUG_BIND_STMTS)
 252     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 253                         gimple_bb (update_phi));
 254 }
 255
 256 /* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
 257    the mask should have during the first iteration and NEXT_MASK is the
 258    value that it should have on subsequent iterations.  */
 259
 260 static void
 261 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
 262                     tree next_mask)
 263 {
 264   gphi *phi = create_phi_node (mask, loop->header);
 265   add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 266   add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
 267 }
 268
 269 /* Add SEQ to the end of LOOP's preheader block.  */
 270
 271 static void
 272 add_preheader_seq (class loop *loop, gimple_seq seq)
 273 {
 274   if (seq)
 275     {
 276       edge pe = loop_preheader_edge (loop);
 277       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 278       gcc_assert (!new_bb);
 279     }
 280 }
 281
 282 /* Add SEQ to the beginning of LOOP's header block.  */
 283
 284 static void
 285 add_header_seq (class loop *loop, gimple_seq seq)
 286 {
 287   if (seq)
 288     {
 289       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 290       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 291     }
 292 }
 293
 294 /* Return true if the target can interleave elements of two vectors.
 295    OFFSET is 0 if the first half of the vectors should be interleaved
 296    or 1 if the second half should.  When returning true, store the
 297    associated permutation in INDICES.  */
 298
 299 static bool
 300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 301                         unsigned int offset)
 302 {
 303   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 304   poly_uint64 base = exact_div (nelts, 2) * offset;
 305   vec_perm_builder sel (nelts, 2, 3);
 306   for (unsigned int i = 0; i < 3; ++i)
 307     {
 308       sel.quick_push (base + i);
 309       sel.quick_push (base + i + nelts);
 310     }
 311   indices->new_vector (sel, 2, nelts);
 312   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
 313 }
 314
 315 /* Try to use permutes to define the masks in DEST_RGM using the masks
 316    in SRC_RGM, given that the former has twice as many masks as the
 317    latter.  Return true on success, adding any new statements to SEQ.  */
 318
 319 static bool
 320 vect_maybe_permute_loop_masks (loop_vec_info loop_vinfo, gimple_seq *seq,
 321                                rgroup_masks *dest_rgm,
 322                                rgroup_masks *src_rgm)
 323 {
 324   tree src_masktype = src_rgm->mask_type;
 325   tree dest_masktype = dest_rgm->mask_type;
 326   machine_mode src_mode = TYPE_MODE (src_masktype);
 327   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 328       && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
 329       && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
 330     {
 331       /* Unpacking the source masks gives at least as many mask bits as
 332          we need.  We can then VIEW_CONVERT any excess bits away.  */
 333       tree unpack_masktype = vect_halve_mask_nunits (loop_vinfo, src_masktype);
 334       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 335         {
 336           tree src = src_rgm->masks[i / 2];
 337           tree dest = dest_rgm->masks[i];
 338           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 339                             ? VEC_UNPACK_HI_EXPR
 340                             : VEC_UNPACK_LO_EXPR);
 341           gassign *stmt;
 342           if (dest_masktype == unpack_masktype)
 343             stmt = gimple_build_assign (dest, code, src);
 344           else
 345             {
 346               tree temp = make_ssa_name (unpack_masktype);
 347               stmt = gimple_build_assign (temp, code, src);
 348               gimple_seq_add_stmt (seq, stmt);
 349               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 350                                           build1 (VIEW_CONVERT_EXPR,
 351                                                   dest_masktype, temp));
 352             }
 353           gimple_seq_add_stmt (seq, stmt);
 354         }
 355       return true;
 356     }
 357   vec_perm_indices indices[2];
 358   if (dest_masktype == src_masktype
 359       && interleave_supported_p (&indices[0], src_masktype, 0)
 360       && interleave_supported_p (&indices[1], src_masktype, 1))
 361     {
 362       /* The destination requires twice as many mask bits as the source, so
 363          we can use interleaving permutes to double up the number of bits.  */
 364       tree masks[2];
 365       for (unsigned int i = 0; i < 2; ++i)
 366         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 367       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 368         {
 369           tree src = src_rgm->masks[i / 2];
 370           tree dest = dest_rgm->masks[i];
 371           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 372                                               src, src, masks[i & 1]);
 373           gimple_seq_add_stmt (seq, stmt);
 374         }
 375       return true;
 376     }
 377   return false;
 378 }
 379
 380 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
 381    all the masks in RGM and return a mask that is nonzero when the loop
 382    needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
 383    Use LOOP_COND_GSI to insert code before the exit gcond.
 384
 385    RGM belongs to loop LOOP.  The loop originally iterated NITERS
 386    times and has been vectorized according to LOOP_VINFO.
 387
 388    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 389    starts with NITERS_SKIP dummy iterations of the scalar loop before
 390    the real work starts.  The mask elements for these dummy iterations
 391    must be 0, to ensure that the extra iterations do not have an effect.
 392
 393    It is known that:
 394
 395      NITERS * RGM->max_nscalars_per_iter
 396
 397    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 398    variable that starts at 0 and has step:
 399
 400      VF * RGM->max_nscalars_per_iter
 401
 402    might overflow before hitting a value above:
 403
 404      (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
 405
 406    This means that we cannot guarantee that such an induction variable
 407    would ever hit a value that produces a set of all-false masks for RGM.  */
 408
 409 static tree
 410 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
 411                               gimple_seq *preheader_seq,
 412                               gimple_stmt_iterator loop_cond_gsi,
 413                               rgroup_masks *rgm, tree niters, tree niters_skip,
 414                               bool might_wrap_p)
 415 {
 416   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 417   tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
 418   tree mask_type = rgm->mask_type;
 419   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
 420   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
 421   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 422
 423   /* Calculate the maximum number of scalar values that the rgroup
 424      handles in total, the number that it handles for each iteration
 425      of the vector loop, and the number that it should skip during the
 426      first iteration of the vector loop.  */
 427   tree nscalars_total = niters;
 428   tree nscalars_step = build_int_cst (iv_type, vf);
 429   tree nscalars_skip = niters_skip;
 430   if (nscalars_per_iter != 1)
 431     {
 432       /* We checked before choosing to use a fully-masked loop that these
 433          multiplications don't overflow.  */
 434       tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
 435       tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
 436       nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 437                                      nscalars_total, compare_factor);
 438       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 439                                     nscalars_step, iv_factor);
 440       if (nscalars_skip)
 441         nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 442                                       nscalars_skip, compare_factor);
 443     }
 444
 445   /* Create an induction variable that counts the number of scalars
 446      processed.  */
 447   tree index_before_incr, index_after_incr;
 448   gimple_stmt_iterator incr_gsi;
 449   bool insert_after;
 450   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 451   create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
 452              &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 453
 454   tree zero_index = build_int_cst (compare_type, 0);
 455   tree test_index, test_limit, first_limit;
 456   gimple_stmt_iterator *test_gsi;
 457   if (might_wrap_p)
 458     {
 459       /* In principle the loop should stop iterating once the incremented
 460          IV reaches a value greater than or equal to:
 461
 462            NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
 463
 464          However, there's no guarantee that this addition doesn't overflow
 465          the comparison type, or that the IV hits a value above it before
 466          wrapping around.  We therefore adjust the limit down by one
 467          IV step:
 468
 469            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 470            -[infinite-prec] NSCALARS_STEP
 471
 472          and compare the IV against this limit _before_ incrementing it.
 473          Since the comparison type is unsigned, we actually want the
 474          subtraction to saturate at zero:
 475
 476            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 477            -[sat] NSCALARS_STEP
 478
 479          And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
 480
 481            NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
 482
 483          where the rightmost subtraction can be done directly in
 484          COMPARE_TYPE.  */
 485       test_index = index_before_incr;
 486       tree adjust = gimple_convert (preheader_seq, compare_type,
 487                                     nscalars_step);
 488       if (nscalars_skip)
 489         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 490                                adjust, nscalars_skip);
 491       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 492                                  nscalars_total, adjust);
 493       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 494                                  test_limit, adjust);
 495       test_gsi = &incr_gsi;
 496
 497       /* Get a safe limit for the first iteration.  */
 498       if (nscalars_skip)
 499         {
 500           /* The first vector iteration can handle at most NSCALARS_STEP
 501              scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
 502              NSCALARS_SKIP to that cannot overflow.  */
 503           tree const_limit = build_int_cst (compare_type,
 504                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 505                                             * nscalars_per_iter);
 506           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 507                                       nscalars_total, const_limit);
 508           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 509                                       first_limit, nscalars_skip);
 510         }
 511       else
 512         /* For the first iteration it doesn't matter whether the IV hits
 513            a value above NSCALARS_TOTAL.  That only matters for the latch
 514            condition.  */
 515         first_limit = nscalars_total;
 516     }
 517   else
 518     {
 519       /* Test the incremented IV, which will always hit a value above
 520          the bound before wrapping.  */
 521       test_index = index_after_incr;
 522       test_limit = nscalars_total;
 523       if (nscalars_skip)
 524         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 525                                    test_limit, nscalars_skip);
 526       test_gsi = &loop_cond_gsi;
 527
 528       first_limit = test_limit;
 529     }
 530
 531   /* Convert the IV value to the comparison type (either a no-op or
 532      a demotion).  */
 533   gimple_seq test_seq = NULL;
 534   test_index = gimple_convert (&test_seq, compare_type, test_index);
 535   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 536
 537   /* Provide a definition of each mask in the group.  */
 538   tree next_mask = NULL_TREE;
 539   tree mask;
 540   unsigned int i;
 541   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
 542     {
 543       /* Previous masks will cover BIAS scalars.  This mask covers the
 544          next batch.  */
 545       poly_uint64 bias = nscalars_per_mask * i;
 546       tree bias_tree = build_int_cst (compare_type, bias);
 547       gimple *tmp_stmt;
 548
 549       /* See whether the first iteration of the vector loop is known
 550          to have a full mask.  */
 551       poly_uint64 const_limit;
 552       bool first_iteration_full
 553         = (poly_int_tree_p (first_limit, &const_limit)
 554            && known_ge (const_limit, (i + 1) * nscalars_per_mask));
 555
 556       /* Rather than have a new IV that starts at BIAS and goes up to
 557          TEST_LIMIT, prefer to use the same 0-based IV for each mask
 558          and adjust the bound down by BIAS.  */
 559       tree this_test_limit = test_limit;
 560       if (i != 0)
 561         {
 562           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 563                                           compare_type, this_test_limit,
 564                                           bias_tree);
 565           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 566                                           compare_type, this_test_limit,
 567                                           bias_tree);
 568         }
 569
 570       /* Create the initial mask.  First include all scalars that
 571          are within the loop limit.  */
 572       tree init_mask = NULL_TREE;
 573       if (!first_iteration_full)
 574         {
 575           tree start, end;
 576           if (first_limit == test_limit)
 577             {
 578               /* Use a natural test between zero (the initial IV value)
 579                  and the loop limit.  The "else" block would be valid too,
 580                  but this choice can avoid the need to load BIAS_TREE into
 581                  a register.  */
 582               start = zero_index;
 583               end = this_test_limit;
 584             }
 585           else
 586             {
 587               /* FIRST_LIMIT is the maximum number of scalars handled by the
 588                  first iteration of the vector loop.  Test the portion
 589                  associated with this mask.  */
 590               start = bias_tree;
 591               end = first_limit;
 592             }
 593
 594           init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
 595           tmp_stmt = vect_gen_while (init_mask, start, end);
 596           gimple_seq_add_stmt (preheader_seq, tmp_stmt);
 597         }
 598
 599       /* Now AND out the bits that are within the number of skipped
 600          scalars.  */
 601       poly_uint64 const_skip;
 602       if (nscalars_skip
 603           && !(poly_int_tree_p (nscalars_skip, &const_skip)
 604                && known_le (const_skip, bias)))
 605         {
 606           tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
 607                                                     bias_tree, nscalars_skip);
 608           if (init_mask)
 609             init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
 610                                       init_mask, unskipped_mask);
 611           else
 612             init_mask = unskipped_mask;
 613         }
 614
 615       if (!init_mask)
 616         /* First iteration is full.  */
 617         init_mask = build_minus_one_cst (mask_type);
 618
 619       /* Get the mask value for the next iteration of the loop.  */
 620       next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
 621       gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
 622       gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
 623
 624       vect_set_loop_mask (loop, mask, init_mask, next_mask);
 625     }
 626   return next_mask;
 627 }
 628
 629 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
 630    LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
 631    number of iterations of the original scalar loop that should be
 632    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
 633    as for vect_set_loop_condition.
 634
 635    Insert the branch-back condition before LOOP_COND_GSI and return the
 636    final gcond.  */
 637
 638 static gcond *
 639 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
 640                                 tree niters, tree final_iv,
 641                                 bool niters_maybe_zero,
 642                                 gimple_stmt_iterator loop_cond_gsi)
 643 {
 644   gimple_seq preheader_seq = NULL;
 645   gimple_seq header_seq = NULL;
 646
 647   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 648   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 649   tree orig_niters = niters;
 650
 651   /* Type of the initial value of NITERS.  */
 652   tree ni_actual_type = TREE_TYPE (niters);
 653   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 654   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 655
 656   /* Convert NITERS to the same size as the compare.  */
 657   if (compare_precision > ni_actual_precision
 658       && niters_maybe_zero)
 659     {
 660       /* We know that there is always at least one iteration, so if the
 661          count is zero then it must have wrapped.  Cope with this by
 662          subtracting 1 before the conversion and adding 1 to the result.  */
 663       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 664       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 665                              niters, build_minus_one_cst (ni_actual_type));
 666       niters = gimple_convert (&preheader_seq, compare_type, niters);
 667       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 668                              niters, build_one_cst (compare_type));
 669     }
 670   else
 671     niters = gimple_convert (&preheader_seq, compare_type, niters);
 672
 673   widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
 674
 675   /* Iterate over all the rgroups and fill in their masks.  We could use
 676      the first mask from any rgroup for the loop condition; here we
 677      arbitrarily pick the last.  */
 678   tree test_mask = NULL_TREE;
 679   rgroup_masks *rgm;
 680   unsigned int i;
 681   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
 682   FOR_EACH_VEC_ELT (*masks, i, rgm)
 683     if (!rgm->masks.is_empty ())
 684       {
 685         /* First try using permutes.  This adds a single vector
 686            instruction to the loop for each mask, but needs no extra
 687            loop invariants or IVs.  */
 688         unsigned int nmasks = i + 1;
 689         if ((nmasks & 1) == 0)
 690           {
 691             rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
 692             if (!half_rgm->masks.is_empty ()
 693                 && vect_maybe_permute_loop_masks (loop_vinfo, &header_seq,
 694                                                   rgm, half_rgm))
 695               continue;
 696           }
 697
 698         /* See whether zero-based IV would ever generate all-false masks
 699            before wrapping around.  */
 700         bool might_wrap_p
 701           = (iv_limit == -1
 702              || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 703                                     UNSIGNED)
 704                  > compare_precision));
 705
 706         /* Set up all masks for this group.  */
 707         test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
 708                                                   &preheader_seq,
 709                                                   loop_cond_gsi, rgm,
 710                                                   niters, niters_skip,
 711                                                   might_wrap_p);
 712       }
 713
 714   /* Emit all accumulated statements.  */
 715   add_preheader_seq (loop, preheader_seq);
 716   add_header_seq (loop, header_seq);
 717
 718   /* Get a boolean result that tells us whether to iterate.  */
 719   edge exit_edge = single_exit (loop);
 720   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 721   tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
 722   gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
 723                                         NULL_TREE, NULL_TREE);
 724   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 725
 726   /* The loop iterates (NITERS - 1) / VF + 1 times.
 727      Subtract one from this to get the latch count.  */
 728   tree step = build_int_cst (compare_type,
 729                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 730   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 731                                        build_minus_one_cst (compare_type));
 732   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 733                                      niters_minus_one, step);
 734
 735   if (final_iv)
 736     {
 737       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 738       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 739     }
 740
 741   return cond_stmt;
 742 }
 743
 744 /* Like vect_set_loop_condition, but handle the case in which there
 745    are no loop masks.  */
 746
 747 static gcond *
 748 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
 749                                   tree step, tree final_iv,
 750                                   bool niters_maybe_zero,
 751                                   gimple_stmt_iterator loop_cond_gsi)
 752 {
 753   tree indx_before_incr, indx_after_incr;
 754   gcond *cond_stmt;
 755   gcond *orig_cond;
 756   edge pe = loop_preheader_edge (loop);
 757   edge exit_edge = single_exit (loop);
 758   gimple_stmt_iterator incr_gsi;
 759   bool insert_after;
 760   enum tree_code code;
 761   tree niters_type = TREE_TYPE (niters);
 762
 763   orig_cond = get_loop_exit_condition (loop);
 764   gcc_assert (orig_cond);
 765   loop_cond_gsi = gsi_for_stmt (orig_cond);
 766
 767   tree init, limit;
 768   if (!niters_maybe_zero && integer_onep (step))
 769     {
 770       /* In this case we can use a simple 0-based IV:
 771
 772          A:
 773            x = 0;
 774            do
 775              {
 776                ...
 777                x += 1;
 778              }
 779            while (x < NITERS);  */
 780       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 781       init = build_zero_cst (niters_type);
 782       limit = niters;
 783     }
 784   else
 785     {
 786       /* The following works for all values of NITERS except 0:
 787
 788          B:
 789            x = 0;
 790            do
 791              {
 792                ...
 793                x += STEP;
 794              }
 795            while (x <= NITERS - STEP);
 796
 797          so that the loop continues to iterate if x + STEP - 1 < NITERS
 798          but stops if x + STEP - 1 >= NITERS.
 799
 800          However, if NITERS is zero, x never hits a value above NITERS - STEP
 801          before wrapping around.  There are two obvious ways of dealing with
 802          this:
 803
 804          - start at STEP - 1 and compare x before incrementing it
 805          - start at -1 and compare x after incrementing it
 806
 807          The latter is simpler and is what we use.  The loop in this case
 808          looks like:
 809
 810          C:
 811            x = -1;
 812            do
 813              {
 814                ...
 815                x += STEP;
 816              }
 817            while (x < NITERS - STEP);
 818
 819          In both cases the loop limit is NITERS - STEP.  */
 820       gimple_seq seq = NULL;
 821       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 822       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 823       if (seq)
 824         {
 825           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 826           gcc_assert (!new_bb);
 827         }
 828       if (niters_maybe_zero)
 829         {
 830           /* Case C.  */
 831           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 832           init = build_all_ones_cst (niters_type);
 833         }
 834       else
 835         {
 836           /* Case B.  */
 837           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 838           init = build_zero_cst (niters_type);
 839         }
 840     }
 841
 842   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 843   create_iv (init, step, NULL_TREE, loop,
 844              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 845   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 846                                               true, NULL_TREE, true,
 847                                               GSI_SAME_STMT);
 848   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 849                                      true, GSI_SAME_STMT);
 850
 851   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 852                                  NULL_TREE);
 853
 854   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 855
 856   /* Record the number of latch iterations.  */
 857   if (limit == niters)
 858     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 859        latch count.  */
 860     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 861                                        build_int_cst (niters_type, 1));
 862   else
 863     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 864        Subtract one from this to get the latch count.  */
 865     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 866                                        limit, step);
 867
 868   if (final_iv)
 869     {
 870       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
 871                                              indx_after_incr, init);
 872       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 873     }
 874
 875   return cond_stmt;
 876 }
 877
 878 /* If we're using fully-masked loops, make LOOP iterate:
 879
 880       N == (NITERS - 1) / STEP + 1
 881
 882    times.  When NITERS is zero, this is equivalent to making the loop
 883    execute (1 << M) / STEP times, where M is the precision of NITERS.
 884    NITERS_MAYBE_ZERO is true if this last case might occur.
 885
 886    If we're not using fully-masked loops, make LOOP iterate:
 887
 888       N == (NITERS - STEP) / STEP + 1
 889
 890    times, where NITERS is known to be outside the range [1, STEP - 1].
 891    This is equivalent to making the loop execute NITERS / STEP times
 892    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 893    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 894
 895    If FINAL_IV is nonnull, it is an SSA name that should be set to
 896    N * STEP on exit from the loop.
 897
 898    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 899
 900 void
 901 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
 902                          tree niters, tree step, tree final_iv,
 903                          bool niters_maybe_zero)
 904 {
 905   gcond *cond_stmt;
 906   gcond *orig_cond = get_loop_exit_condition (loop);
 907   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 908
 909   if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 910     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
 911                                                 final_iv, niters_maybe_zero,
 912                                                 loop_cond_gsi);
 913   else
 914     cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
 915                                                   final_iv, niters_maybe_zero,
 916                                                   loop_cond_gsi);
 917
 918   /* Remove old loop exit test.  */
 919   stmt_vec_info orig_cond_info;
 920   if (loop_vinfo
 921       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 922     loop_vinfo->remove_stmt (orig_cond_info);
 923   else
 924     gsi_remove (&loop_cond_gsi, true);
 925
 926   if (dump_enabled_p ())
 927     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
 928                      cond_stmt);
 929 }
 930
 931 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 932    For all PHI arguments in FROM->dest and TO->dest from those
 933    edges ensure that TO->dest PHI arguments have current_def
 934    to that in from.  */
 935
 936 static void
 937 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 938 {
 939   gimple_stmt_iterator gsi_from, gsi_to;
 940
 941   for (gsi_from = gsi_start_phis (from->dest),
 942        gsi_to = gsi_start_phis (to->dest);
 943        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
 944     {
 945       gimple *from_phi = gsi_stmt (gsi_from);
 946       gimple *to_phi = gsi_stmt (gsi_to);
 947       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
 948       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
 949       if (virtual_operand_p (from_arg))
 950         {
 951           gsi_next (&gsi_from);
 952           continue;
 953         }
 954       if (virtual_operand_p (to_arg))
 955         {
 956           gsi_next (&gsi_to);
 957           continue;
 958         }
 959       if (TREE_CODE (from_arg) != SSA_NAME)
 960         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
 961       else if (TREE_CODE (to_arg) == SSA_NAME
 962                && from_arg != to_arg)
 963         {
 964           if (get_current_def (to_arg) == NULL_TREE)
 965             {
 966               gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
 967                                               TREE_TYPE (get_current_def
 968                                                            (from_arg))));
 969               set_current_def (to_arg, get_current_def (from_arg));
 970             }
 971         }
 972       gsi_next (&gsi_from);
 973       gsi_next (&gsi_to);
 974     }
 975
 976   gphi *from_phi = get_virtual_phi (from->dest);
 977   gphi *to_phi = get_virtual_phi (to->dest);
 978   if (from_phi)
 979     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
 980                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
 981 }
 982
 983
 984 /* Given LOOP this function generates a new copy of it and puts it
 985    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
 986    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
 987    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
 988    entry or exit of LOOP.  */
 989
 990 class loop *
 991 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
 992                                         class loop *scalar_loop, edge e)
 993 {
 994   class loop *new_loop;
 995   basic_block *new_bbs, *bbs, *pbbs;
 996   bool at_exit;
 997   bool was_imm_dom;
 998   basic_block exit_dest;
 999   edge exit, new_exit;
1000   bool duplicate_outer_loop = false;
1001
1002   exit = single_exit (loop);
1003   at_exit = (e == exit);
1004   if (!at_exit && e != loop_preheader_edge (loop))
1005     return NULL;
1006
1007   if (scalar_loop == NULL)
1008     scalar_loop = loop;
1009
1010   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1011   pbbs = bbs + 1;
1012   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1013   /* Allow duplication of outer loops.  */
1014   if (scalar_loop->inner)
1015     duplicate_outer_loop = true;
1016   /* Check whether duplication is possible.  */
1017   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1018     {
1019       free (bbs);
1020       return NULL;
1021     }
1022
1023   /* Generate new loop structure.  */
1024   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1025   duplicate_subloops (scalar_loop, new_loop);
1026
1027   exit_dest = exit->dest;
1028   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1029                                           exit_dest) == loop->header ?
1030                  true : false);
1031
1032   /* Also copy the pre-header, this avoids jumping through hoops to
1033      duplicate the loop entry PHI arguments.  Create an empty
1034      pre-header unconditionally for this.  */
1035   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1036   edge entry_e = single_pred_edge (preheader);
1037   bbs[0] = preheader;
1038   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1039
1040   exit = single_exit (scalar_loop);
1041   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1042             &exit, 1, &new_exit, NULL,
1043             at_exit ? loop->latch : e->src, true);
1044   exit = single_exit (loop);
1045   basic_block new_preheader = new_bbs[0];
1046
1047   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1048
1049   if (scalar_loop != loop)
1050     {
1051       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1052          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1053          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1054          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1055          header) to have current_def set, so copy them over.  */
1056       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1057                                                 exit);
1058       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1059                                                            0),
1060                                                 EDGE_SUCC (loop->latch, 0));
1061     }
1062
1063   if (at_exit) /* Add the loop copy at exit.  */
1064     {
1065       if (scalar_loop != loop)
1066         {
1067           gphi_iterator gsi;
1068           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1069
1070           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1071                gsi_next (&gsi))
1072             {
1073               gphi *phi = gsi.phi ();
1074               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1075               location_t orig_locus
1076                 = gimple_phi_arg_location_from_edge (phi, e);
1077
1078               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1079             }
1080         }
1081       redirect_edge_and_branch_force (e, new_preheader);
1082       flush_pending_stmts (e);
1083       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1084       if (was_imm_dom || duplicate_outer_loop)
1085         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1086
1087       /* And remove the non-necessary forwarder again.  Keep the other
1088          one so we have a proper pre-header for the loop at the exit edge.  */
1089       redirect_edge_pred (single_succ_edge (preheader),
1090                           single_pred (preheader));
1091       delete_basic_block (preheader);
1092       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1093                                loop_preheader_edge (scalar_loop)->src);
1094     }
1095   else /* Add the copy at entry.  */
1096     {
1097       if (scalar_loop != loop)
1098         {
1099           /* Remove the non-necessary forwarder of scalar_loop again.  */
1100           redirect_edge_pred (single_succ_edge (preheader),
1101                               single_pred (preheader));
1102           delete_basic_block (preheader);
1103           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104                                    loop_preheader_edge (scalar_loop)->src);
1105           preheader = split_edge (loop_preheader_edge (loop));
1106           entry_e = single_pred_edge (preheader);
1107         }
1108
1109       redirect_edge_and_branch_force (entry_e, new_preheader);
1110       flush_pending_stmts (entry_e);
1111       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1112
1113       redirect_edge_and_branch_force (new_exit, preheader);
1114       flush_pending_stmts (new_exit);
1115       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1116
1117       /* And remove the non-necessary forwarder again.  Keep the other
1118          one so we have a proper pre-header for the loop at the exit edge.  */
1119       redirect_edge_pred (single_succ_edge (new_preheader),
1120                           single_pred (new_preheader));
1121       delete_basic_block (new_preheader);
1122       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1123                                loop_preheader_edge (new_loop)->src);
1124     }
1125
1126   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1127   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1128     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1129
1130   if (scalar_loop != loop)
1131     {
1132       /* Update new_loop->header PHIs, so that on the preheader
1133          edge they are the ones from loop rather than scalar_loop.  */
1134       gphi_iterator gsi_orig, gsi_new;
1135       edge orig_e = loop_preheader_edge (loop);
1136       edge new_e = loop_preheader_edge (new_loop);
1137
1138       for (gsi_orig = gsi_start_phis (loop->header),
1139            gsi_new = gsi_start_phis (new_loop->header);
1140            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1141            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1142         {
1143           gphi *orig_phi = gsi_orig.phi ();
1144           gphi *new_phi = gsi_new.phi ();
1145           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1146           location_t orig_locus
1147             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1148
1149           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1150         }
1151     }
1152
1153   free (new_bbs);
1154   free (bbs);
1155
1156   checking_verify_dominators (CDI_DOMINATORS);
1157
1158   return new_loop;
1159 }
1160
1161
1162 /* Given the condition expression COND, put it as the last statement of
1163    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1164    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1165    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1166    new edge as irreducible if IRREDUCIBLE_P is true.  */
1167
1168 static edge
1169 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1170                        basic_block guard_to, basic_block dom_bb,
1171                        profile_probability probability, bool irreducible_p)
1172 {
1173   gimple_stmt_iterator gsi;
1174   edge new_e, enter_e;
1175   gcond *cond_stmt;
1176   gimple_seq gimplify_stmt_list = NULL;
1177
1178   enter_e = EDGE_SUCC (guard_bb, 0);
1179   enter_e->flags &= ~EDGE_FALLTHRU;
1180   enter_e->flags |= EDGE_FALSE_VALUE;
1181   gsi = gsi_last_bb (guard_bb);
1182
1183   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1184                                  NULL_TREE);
1185   if (gimplify_stmt_list)
1186     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1187
1188   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1189   gsi = gsi_last_bb (guard_bb);
1190   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1191
1192   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1193   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1194
1195   new_e->probability = probability;
1196   if (irreducible_p)
1197     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1198
1199   enter_e->probability = probability.invert ();
1200   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1201
1202   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1203   if (enter_e->dest->loop_father->header == enter_e->dest)
1204     split_edge (enter_e);
1205
1206   return new_e;
1207 }
1208
1209
1210 /* This function verifies that the following restrictions apply to LOOP:
1211    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1212        for innermost loop and 5 basic blocks for outer-loop.
1213    (2) it is single entry, single exit
1214    (3) its exit condition is the last stmt in the header
1215    (4) E is the entry/exit edge of LOOP.
1216  */
1217
1218 bool
1219 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1220 {
1221   edge exit_e = single_exit (loop);
1222   edge entry_e = loop_preheader_edge (loop);
1223   gcond *orig_cond = get_loop_exit_condition (loop);
1224   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1225   unsigned int num_bb = loop->inner? 5 : 2;
1226
1227   /* All loops have an outer scope; the only case loop->outer is NULL is for
1228      the function itself.  */
1229   if (!loop_outer (loop)
1230       || loop->num_nodes != num_bb
1231       || !empty_block_p (loop->latch)
1232       || !single_exit (loop)
1233       /* Verify that new loop exit condition can be trivially modified.  */
1234       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1235       || (e != exit_e && e != entry_e))
1236     return false;
1237
1238   return true;
1239 }
1240
1241 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1242    in the exit bb and rename all the uses after the loop.  This simplifies
1243    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1244    (but normally loop closed SSA form doesn't require virtual PHIs to be
1245    in the same form).  Doing this early simplifies the checking what
1246    uses should be renamed.  */
1247
1248 static void
1249 create_lcssa_for_virtual_phi (class loop *loop)
1250 {
1251   gphi_iterator gsi;
1252   edge exit_e = single_exit (loop);
1253
1254   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1255     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1256       {
1257         gphi *phi = gsi.phi ();
1258         for (gsi = gsi_start_phis (exit_e->dest);
1259              !gsi_end_p (gsi); gsi_next (&gsi))
1260           if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1261             break;
1262         if (gsi_end_p (gsi))
1263           {
1264             tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1265             gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1266             tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1267             imm_use_iterator imm_iter;
1268             gimple *stmt;
1269             use_operand_p use_p;
1270
1271             SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1272               = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1273             add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1274             gimple_phi_set_result (new_phi, new_vop);
1275             FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1276               if (stmt != new_phi
1277                   && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1278                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1279                   SET_USE (use_p, new_vop);
1280           }
1281         break;
1282       }
1283
1284 }
1285
1286 /* Function vect_get_loop_location.
1287
1288    Extract the location of the loop in the source code.
1289    If the loop is not well formed for vectorization, an estimated
1290    location is calculated.
1291    Return the loop location if succeed and NULL if not.  */
1292
1293 dump_user_location_t
1294 find_loop_location (class loop *loop)
1295 {
1296   gimple *stmt = NULL;
1297   basic_block bb;
1298   gimple_stmt_iterator si;
1299
1300   if (!loop)
1301     return dump_user_location_t ();
1302
1303   stmt = get_loop_exit_condition (loop);
1304
1305   if (stmt
1306       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1307     return stmt;
1308
1309   /* If we got here the loop is probably not "well formed",
1310      try to estimate the loop location */
1311
1312   if (!loop->header)
1313     return dump_user_location_t ();
1314
1315   bb = loop->header;
1316
1317   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1318     {
1319       stmt = gsi_stmt (si);
1320       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1321         return stmt;
1322     }
1323
1324   return dump_user_location_t ();
1325 }
1326
1327 /* Return true if the phi described by STMT_INFO defines an IV of the
1328    loop to be vectorized.  */
1329
1330 static bool
1331 iv_phi_p (stmt_vec_info stmt_info)
1332 {
1333   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1334   if (virtual_operand_p (PHI_RESULT (phi)))
1335     return false;
1336
1337   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1338       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1339     return false;
1340
1341   return true;
1342 }
1343
1344 /* Function vect_can_advance_ivs_p
1345
1346    In case the number of iterations that LOOP iterates is unknown at compile
1347    time, an epilog loop will be generated, and the loop induction variables
1348    (IVs) will be "advanced" to the value they are supposed to take just before
1349    the epilog loop.  Here we check that the access function of the loop IVs
1350    and the expression that represents the loop bound are simple enough.
1351    These restrictions will be relaxed in the future.  */
1352
1353 bool
1354 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1355 {
1356   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1357   basic_block bb = loop->header;
1358   gphi_iterator gsi;
1359
1360   /* Analyze phi functions of the loop header.  */
1361
1362   if (dump_enabled_p ())
1363     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1364   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1365     {
1366       tree evolution_part;
1367
1368       gphi *phi = gsi.phi ();
1369       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1370       if (dump_enabled_p ())
1371         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1372                          phi_info->stmt);
1373
1374       /* Skip virtual phi's. The data dependences that are associated with
1375          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1376
1377          Skip reduction phis.  */
1378       if (!iv_phi_p (phi_info))
1379         {
1380           if (dump_enabled_p ())
1381             dump_printf_loc (MSG_NOTE, vect_location,
1382                              "reduc or virtual phi. skip.\n");
1383           continue;
1384         }
1385
1386       /* Analyze the evolution function.  */
1387
1388       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1389       if (evolution_part == NULL_TREE)
1390         {
1391           if (dump_enabled_p ())
1392             dump_printf (MSG_MISSED_OPTIMIZATION,
1393                          "No access function or evolution.\n");
1394           return false;
1395         }
1396
1397       /* FORNOW: We do not transform initial conditions of IVs
1398          which evolution functions are not invariants in the loop.  */
1399
1400       if (!expr_invariant_in_loop_p (loop, evolution_part))
1401         {
1402           if (dump_enabled_p ())
1403             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1404                              "evolution not invariant in loop.\n");
1405           return false;
1406         }
1407
1408       /* FORNOW: We do not transform initial conditions of IVs
1409          which evolution functions are a polynomial of degree >= 2.  */
1410
1411       if (tree_is_chrec (evolution_part))
1412         {
1413           if (dump_enabled_p ())
1414             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415                              "evolution is chrec.\n");
1416           return false;
1417         }
1418     }
1419
1420   return true;
1421 }
1422
1423
1424 /*   Function vect_update_ivs_after_vectorizer.
1425
1426      "Advance" the induction variables of LOOP to the value they should take
1427      after the execution of LOOP.  This is currently necessary because the
1428      vectorizer does not handle induction variables that are used after the
1429      loop.  Such a situation occurs when the last iterations of LOOP are
1430      peeled, because:
1431      1. We introduced new uses after LOOP for IVs that were not originally used
1432         after LOOP: the IVs of LOOP are now used by an epilog loop.
1433      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1434         times, whereas the loop IVs should be bumped N times.
1435
1436      Input:
1437      - LOOP - a loop that is going to be vectorized. The last few iterations
1438               of LOOP were peeled.
1439      - NITERS - the number of iterations that LOOP executes (before it is
1440                 vectorized). i.e, the number of times the ivs should be bumped.
1441      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1442                   coming out from LOOP on which there are uses of the LOOP ivs
1443                   (this is the path from LOOP->exit to epilog_loop->preheader).
1444
1445                   The new definitions of the ivs are placed in LOOP->exit.
1446                   The phi args associated with the edge UPDATE_E in the bb
1447                   UPDATE_E->dest are updated accordingly.
1448
1449      Assumption 1: Like the rest of the vectorizer, this function assumes
1450      a single loop exit that has a single predecessor.
1451
1452      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1453      organized in the same order.
1454
1455      Assumption 3: The access function of the ivs is simple enough (see
1456      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1457
1458      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1459      coming out of LOOP on which the ivs of LOOP are used (this is the path
1460      that leads to the epilog loop; other paths skip the epilog loop).  This
1461      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1462      needs to have its phis updated.
1463  */
1464
1465 static void
1466 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1467                                   tree niters, edge update_e)
1468 {
1469   gphi_iterator gsi, gsi1;
1470   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1471   basic_block update_bb = update_e->dest;
1472   basic_block exit_bb = single_exit (loop)->dest;
1473
1474   /* Make sure there exists a single-predecessor exit bb:  */
1475   gcc_assert (single_pred_p (exit_bb));
1476   gcc_assert (single_succ_edge (exit_bb) == update_e);
1477
1478   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1479        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1480        gsi_next (&gsi), gsi_next (&gsi1))
1481     {
1482       tree init_expr;
1483       tree step_expr, off;
1484       tree type;
1485       tree var, ni, ni_name;
1486       gimple_stmt_iterator last_gsi;
1487
1488       gphi *phi = gsi.phi ();
1489       gphi *phi1 = gsi1.phi ();
1490       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1491       if (dump_enabled_p ())
1492         dump_printf_loc (MSG_NOTE, vect_location,
1493                          "vect_update_ivs_after_vectorizer: phi: %G", phi);
1494
1495       /* Skip reduction and virtual phis.  */
1496       if (!iv_phi_p (phi_info))
1497         {
1498           if (dump_enabled_p ())
1499             dump_printf_loc (MSG_NOTE, vect_location,
1500                              "reduc or virtual phi. skip.\n");
1501           continue;
1502         }
1503
1504       type = TREE_TYPE (gimple_phi_result (phi));
1505       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1506       step_expr = unshare_expr (step_expr);
1507
1508       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1509          of degree >= 2 or exponential.  */
1510       gcc_assert (!tree_is_chrec (step_expr));
1511
1512       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1513
1514       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1515                          fold_convert (TREE_TYPE (step_expr), niters),
1516                          step_expr);
1517       if (POINTER_TYPE_P (type))
1518         ni = fold_build_pointer_plus (init_expr, off);
1519       else
1520         ni = fold_build2 (PLUS_EXPR, type,
1521                           init_expr, fold_convert (type, off));
1522
1523       var = create_tmp_var (type, "tmp");
1524
1525       last_gsi = gsi_last_bb (exit_bb);
1526       gimple_seq new_stmts = NULL;
1527       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1528       /* Exit_bb shouldn't be empty.  */
1529       if (!gsi_end_p (last_gsi))
1530         gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1531       else
1532         gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1533
1534       /* Fix phi expressions in the successor bb.  */
1535       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1536     }
1537 }
1538
1539 /* Return a gimple value containing the misalignment (measured in vector
1540    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1541    it is away from a perfectly aligned address.  Add any new statements
1542    to SEQ.  */
1543
1544 static tree
1545 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1546 {
1547   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1548   stmt_vec_info stmt_info = dr_info->stmt;
1549   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1550
1551   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1552   unsigned HOST_WIDE_INT target_align_c;
1553   tree target_align_minus_1;
1554
1555   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1556                                         size_zero_node) < 0;
1557   tree offset = (negative
1558                  ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1559                  : size_zero_node);
1560   tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1561                                                           offset);
1562   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1563   if (target_align.is_constant (&target_align_c))
1564     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1565   else
1566     {
1567       tree vla = build_int_cst (type, target_align);
1568       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1569                                     fold_build2 (MINUS_EXPR, type,
1570                                                  build_int_cst (type, 0), vla));
1571       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1572                                           build_int_cst (type, 1));
1573     }
1574
1575   HOST_WIDE_INT elem_size
1576     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1577   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1578
1579   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1580   tree int_start_addr = fold_convert (type, start_addr);
1581   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1582                                         target_align_minus_1);
1583
1584   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1585   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1586                                         elem_size_log);
1587
1588   return misalign_in_elems;
1589 }
1590
1591 /* Function vect_gen_prolog_loop_niters
1592
1593    Generate the number of iterations which should be peeled as prolog for the
1594    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1595    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1596    As a result, after the execution of this loop, the data reference DR will
1597    refer to an aligned location.  The following computation is generated:
1598
1599    If the misalignment of DR is known at compile time:
1600      addr_mis = int mis = DR_MISALIGNMENT (dr);
1601    Else, compute address misalignment in bytes:
1602      addr_mis = addr & (target_align - 1)
1603
1604    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1605
1606    (elem_size = element type size; an element is the scalar element whose type
1607    is the inner type of the vectype)
1608
1609    The computations will be emitted at the end of BB.  We also compute and
1610    store upper bound (included) of the result in BOUND.
1611
1612    When the step of the data-ref in the loop is not 1 (as in interleaved data
1613    and SLP), the number of iterations of the prolog must be divided by the step
1614    (which is equal to the size of interleaved group).
1615
1616    The above formulas assume that VF == number of elements in the vector. This
1617    may not hold when there are multiple-types in the loop.
1618    In this case, for some data-references in the loop the VF does not represent
1619    the number of elements that fit in the vector.  Therefore, instead of VF we
1620    use TYPE_VECTOR_SUBPARTS.  */
1621
1622 static tree
1623 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1624                              basic_block bb, int *bound)
1625 {
1626   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1627   tree var;
1628   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1629   gimple_seq stmts = NULL, new_stmts = NULL;
1630   tree iters, iters_name;
1631   stmt_vec_info stmt_info = dr_info->stmt;
1632   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1633   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1634
1635   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1636     {
1637       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1638
1639       if (dump_enabled_p ())
1640         dump_printf_loc (MSG_NOTE, vect_location,
1641                          "known peeling = %d.\n", npeel);
1642
1643       iters = build_int_cst (niters_type, npeel);
1644       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1645     }
1646   else
1647     {
1648       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1649       tree type = TREE_TYPE (misalign_in_elems);
1650       HOST_WIDE_INT elem_size
1651         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1652       /* We only do prolog peeling if the target alignment is known at compile
1653          time.  */
1654       poly_uint64 align_in_elems =
1655         exact_div (target_align, elem_size);
1656       tree align_in_elems_minus_1 =
1657         build_int_cst (type, align_in_elems - 1);
1658       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1659
1660       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1661                                  & (align_in_elems - 1)).  */
1662       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1663                                             size_zero_node) < 0;
1664       if (negative)
1665         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1666                              align_in_elems_tree);
1667       else
1668         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1669                              misalign_in_elems);
1670       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1671       iters = fold_convert (niters_type, iters);
1672       unsigned HOST_WIDE_INT align_in_elems_c;
1673       if (align_in_elems.is_constant (&align_in_elems_c))
1674         *bound = align_in_elems_c - 1;
1675       else
1676         *bound = -1;
1677     }
1678
1679   if (dump_enabled_p ())
1680     dump_printf_loc (MSG_NOTE, vect_location,
1681                      "niters for prolog loop: %T\n", iters);
1682
1683   var = create_tmp_var (niters_type, "prolog_loop_niters");
1684   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1685
1686   if (new_stmts)
1687     gimple_seq_add_seq (&stmts, new_stmts);
1688   if (stmts)
1689     {
1690       gcc_assert (single_succ_p (bb));
1691       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1692       if (gsi_end_p (gsi))
1693         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1694       else
1695         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1696     }
1697   return iters_name;
1698 }
1699
1700
1701 /* Function vect_update_init_of_dr
1702
1703    If CODE is PLUS, the vector loop starts NITERS iterations after the
1704    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1705    iterations before the scalar one (using masking to skip inactive
1706    elements).  This function updates the information recorded in DR to
1707    account for the difference.  Specifically, it updates the OFFSET
1708    field of DR.  */
1709
1710 static void
1711 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1712 {
1713   tree offset = DR_OFFSET (dr);
1714
1715   niters = fold_build2 (MULT_EXPR, sizetype,
1716                         fold_convert (sizetype, niters),
1717                         fold_convert (sizetype, DR_STEP (dr)));
1718   offset = fold_build2 (code, sizetype,
1719                         fold_convert (sizetype, offset), niters);
1720   DR_OFFSET (dr) = offset;
1721 }
1722
1723
1724 /* Function vect_update_inits_of_drs
1725
1726    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1727    CODE and NITERS are as for vect_update_inits_of_dr.  */
1728
1729 void
1730 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1731                           tree_code code)
1732 {
1733   unsigned int i;
1734   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1735   struct data_reference *dr;
1736
1737   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1738
1739   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1740      here, but since we might use these niters to update the epilogues niters
1741      and data references we can't insert them here as this definition might not
1742      always dominate its uses.  */
1743   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1744     niters = fold_convert (sizetype, niters);
1745
1746   FOR_EACH_VEC_ELT (datarefs, i, dr)
1747     {
1748       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1749       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1750         vect_update_init_of_dr (dr, niters, code);
1751     }
1752 }
1753
1754 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1755    by masking.  This involves calculating the number of iterations to
1756    be peeled and then aligning all memory references appropriately.  */
1757
1758 void
1759 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1760 {
1761   tree misalign_in_elems;
1762   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1763
1764   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1765
1766   /* From the information recorded in LOOP_VINFO get the number of iterations
1767      that need to be skipped via masking.  */
1768   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1769     {
1770       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1771                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1772       misalign_in_elems = build_int_cst (type, misalign);
1773     }
1774   else
1775     {
1776       gimple_seq seq1 = NULL, seq2 = NULL;
1777       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1778       misalign_in_elems = fold_convert (type, misalign_in_elems);
1779       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1780                                                 &seq2, true, NULL_TREE);
1781       gimple_seq_add_seq (&seq1, seq2);
1782       if (seq1)
1783         {
1784           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1785           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1786           gcc_assert (!new_bb);
1787         }
1788     }
1789
1790   if (dump_enabled_p ())
1791     dump_printf_loc (MSG_NOTE, vect_location,
1792                      "misalignment for fully-masked loop: %T\n",
1793                      misalign_in_elems);
1794
1795   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1796
1797   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1798 }
1799
1800 /* This function builds ni_name = number of iterations.  Statements
1801    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1802    it to TRUE if new ssa_var is generated.  */
1803
1804 tree
1805 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1806 {
1807   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1808   if (TREE_CODE (ni) == INTEGER_CST)
1809     return ni;
1810   else
1811     {
1812       tree ni_name, var;
1813       gimple_seq stmts = NULL;
1814       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1815
1816       var = create_tmp_var (TREE_TYPE (ni), "niters");
1817       ni_name = force_gimple_operand (ni, &stmts, false, var);
1818       if (stmts)
1819         {
1820           gsi_insert_seq_on_edge_immediate (pe, stmts);
1821           if (new_var_p != NULL)
1822             *new_var_p = true;
1823         }
1824
1825       return ni_name;
1826     }
1827 }
1828
1829 /* Calculate the number of iterations above which vectorized loop will be
1830    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1831    of prolog loop.  If it's integer const, the integer number is also passed
1832    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1833    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1834    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1835    threshold below which the scalar (rather than vectorized) loop will be
1836    executed.  This function stores the upper bound (inclusive) of the result
1837    in BOUND_SCALAR.  */
1838
1839 static tree
1840 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1841                              int bound_prolog, poly_int64 bound_epilog, int th,
1842                              poly_uint64 *bound_scalar,
1843                              bool check_profitability)
1844 {
1845   tree type = TREE_TYPE (niters_prolog);
1846   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1847                              build_int_cst (type, bound_epilog));
1848
1849   *bound_scalar = bound_prolog + bound_epilog;
1850   if (check_profitability)
1851     {
1852       /* TH indicates the minimum niters of vectorized loop, while we
1853          compute the maximum niters of scalar loop.  */
1854       th--;
1855       /* Peeling for constant times.  */
1856       if (int_niters_prolog >= 0)
1857         {
1858           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1859           return build_int_cst (type, *bound_scalar);
1860         }
1861       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1862          and BOUND_EPILOG are inclusive upper bounds.  */
1863       if (known_ge (th, bound_prolog + bound_epilog))
1864         {
1865           *bound_scalar = th;
1866           return build_int_cst (type, th);
1867         }
1868       /* Need to do runtime comparison.  */
1869       else if (maybe_gt (th, bound_epilog))
1870         {
1871           *bound_scalar = upper_bound (*bound_scalar, th);
1872           return fold_build2 (MAX_EXPR, type,
1873                               build_int_cst (type, th), niters);
1874         }
1875     }
1876   return niters;
1877 }
1878
1879 /* NITERS is the number of times that the original scalar loop executes
1880    after peeling.  Work out the maximum number of iterations N that can
1881    be handled by the vectorized form of the loop and then either:
1882
1883    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1884
1885         niters_vector = N
1886
1887    b) set *STEP_VECTOR_PTR to one and generate:
1888
1889         niters_vector = N / vf
1890
1891    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1892    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1893    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1894
1895 void
1896 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1897                              tree *niters_vector_ptr, tree *step_vector_ptr,
1898                              bool niters_no_overflow)
1899 {
1900   tree ni_minus_gap, var;
1901   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1902   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1903   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1904   tree log_vf = NULL_TREE;
1905
1906   /* If epilogue loop is required because of data accesses with gaps, we
1907      subtract one iteration from the total number of iterations here for
1908      correct calculation of RATIO.  */
1909   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1910     {
1911       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1912                                   build_one_cst (type));
1913       if (!is_gimple_val (ni_minus_gap))
1914         {
1915           var = create_tmp_var (type, "ni_gap");
1916           gimple *stmts = NULL;
1917           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1918                                                true, var);
1919           gsi_insert_seq_on_edge_immediate (pe, stmts);
1920         }
1921     }
1922   else
1923     ni_minus_gap = niters;
1924
1925   unsigned HOST_WIDE_INT const_vf;
1926   if (vf.is_constant (&const_vf)
1927       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1928     {
1929       /* Create: niters >> log2(vf) */
1930       /* If it's known that niters == number of latch executions + 1 doesn't
1931          overflow, we can generate niters >> log2(vf); otherwise we generate
1932          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1933          will be at least one.  */
1934       log_vf = build_int_cst (type, exact_log2 (const_vf));
1935       if (niters_no_overflow)
1936         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1937       else
1938         niters_vector
1939           = fold_build2 (PLUS_EXPR, type,
1940                          fold_build2 (RSHIFT_EXPR, type,
1941                                       fold_build2 (MINUS_EXPR, type,
1942                                                    ni_minus_gap,
1943                                                    build_int_cst (type, vf)),
1944                                       log_vf),
1945                          build_int_cst (type, 1));
1946       step_vector = build_one_cst (type);
1947     }
1948   else
1949     {
1950       niters_vector = ni_minus_gap;
1951       step_vector = build_int_cst (type, vf);
1952     }
1953
1954   if (!is_gimple_val (niters_vector))
1955     {
1956       var = create_tmp_var (type, "bnd");
1957       gimple_seq stmts = NULL;
1958       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1959       gsi_insert_seq_on_edge_immediate (pe, stmts);
1960       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1961          we set range information to make niters analyzer's life easier.  */
1962       if (stmts != NULL && log_vf)
1963         set_range_info (niters_vector, VR_RANGE,
1964                         wi::to_wide (build_int_cst (type, 1)),
1965                         wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1966                                                   TYPE_MAX_VALUE (type),
1967                                                   log_vf)));
1968     }
1969   *niters_vector_ptr = niters_vector;
1970   *step_vector_ptr = step_vector;
1971
1972   return;
1973 }
1974
1975 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1976    loop specified by LOOP_VINFO after vectorization, compute the number
1977    of iterations before vectorization (niters_vector * vf) and store it
1978    to NITERS_VECTOR_MULT_VF_PTR.  */
1979
1980 static void
1981 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1982                                      tree niters_vector,
1983                                      tree *niters_vector_mult_vf_ptr)
1984 {
1985   /* We should be using a step_vector of VF if VF is variable.  */
1986   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1987   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1988   tree type = TREE_TYPE (niters_vector);
1989   tree log_vf = build_int_cst (type, exact_log2 (vf));
1990   basic_block exit_bb = single_exit (loop)->dest;
1991
1992   gcc_assert (niters_vector_mult_vf_ptr != NULL);
1993   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
1994                                             niters_vector, log_vf);
1995   if (!is_gimple_val (niters_vector_mult_vf))
1996     {
1997       tree var = create_tmp_var (type, "niters_vector_mult_vf");
1998       gimple_seq stmts = NULL;
1999       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2000                                                     &stmts, true, var);
2001       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2002       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2003     }
2004   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2005 }
2006
2007 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2008    from SECOND/FIRST and puts it at the original loop's preheader/exit
2009    edge, the two loops are arranged as below:
2010
2011        preheader_a:
2012      first_loop:
2013        header_a:
2014          i_1 = PHI<i_0, i_2>;
2015          ...
2016          i_2 = i_1 + 1;
2017          if (cond_a)
2018            goto latch_a;
2019          else
2020            goto between_bb;
2021        latch_a:
2022          goto header_a;
2023
2024        between_bb:
2025          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2026
2027      second_loop:
2028        header_b:
2029          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2030                                  or with i_2 if no LCSSA phi is created
2031                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2032          ...
2033          i_4 = i_3 + 1;
2034          if (cond_b)
2035            goto latch_b;
2036          else
2037            goto exit_bb;
2038        latch_b:
2039          goto header_b;
2040
2041        exit_bb:
2042
2043    This function creates loop closed SSA for the first loop; update the
2044    second loop's PHI nodes by replacing argument on incoming edge with the
2045    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2046    is false, Loop closed ssa phis will only be created for non-iv phis for
2047    the first loop.
2048
2049    This function assumes exit bb of the first loop is preheader bb of the
2050    second loop, i.e, between_bb in the example code.  With PHIs updated,
2051    the second loop will execute rest iterations of the first.  */
2052
2053 static void
2054 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2055                                    class loop *first, class loop *second,
2056                                    bool create_lcssa_for_iv_phis)
2057 {
2058   gphi_iterator gsi_update, gsi_orig;
2059   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2060
2061   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2062   edge second_preheader_e = loop_preheader_edge (second);
2063   basic_block between_bb = single_exit (first)->dest;
2064
2065   gcc_assert (between_bb == second_preheader_e->src);
2066   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2067   /* Either the first loop or the second is the loop to be vectorized.  */
2068   gcc_assert (loop == first || loop == second);
2069
2070   for (gsi_orig = gsi_start_phis (first->header),
2071        gsi_update = gsi_start_phis (second->header);
2072        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2073        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2074     {
2075       gphi *orig_phi = gsi_orig.phi ();
2076       gphi *update_phi = gsi_update.phi ();
2077
2078       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2079       /* Generate lcssa PHI node for the first loop.  */
2080       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2081       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2082       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2083         {
2084           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2085           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2086           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2087           arg = new_res;
2088         }
2089
2090       /* Update PHI node in the second loop by replacing arg on the loop's
2091          incoming edge.  */
2092       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2093     }
2094 }
2095
2096 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2097    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2098    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2099    appear like below:
2100
2101        guard_bb:
2102          if (cond)
2103            goto merge_bb;
2104          else
2105            goto skip_loop;
2106
2107      skip_loop:
2108        header_a:
2109          i_1 = PHI<i_0, i_2>;
2110          ...
2111          i_2 = i_1 + 1;
2112          if (cond_a)
2113            goto latch_a;
2114          else
2115            goto exit_a;
2116        latch_a:
2117          goto header_a;
2118
2119        exit_a:
2120          i_5 = PHI<i_2>;
2121
2122        merge_bb:
2123          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2124
2125      update_loop:
2126        header_b:
2127          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2128          ...
2129          i_4 = i_3 + 1;
2130          if (cond_b)
2131            goto latch_b;
2132          else
2133            goto exit_bb;
2134        latch_b:
2135          goto header_b;
2136
2137        exit_bb:
2138
2139    This function creates PHI nodes at merge_bb and replaces the use of i_5
2140    in the update_loop's PHI node with the result of new PHI result.  */
2141
2142 static void
2143 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2144                                     class loop *update_loop,
2145                                     edge guard_edge, edge merge_edge)
2146 {
2147   location_t merge_loc, guard_loc;
2148   edge orig_e = loop_preheader_edge (skip_loop);
2149   edge update_e = loop_preheader_edge (update_loop);
2150   gphi_iterator gsi_orig, gsi_update;
2151
2152   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2153         gsi_update = gsi_start_phis (update_loop->header));
2154        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2155        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2156     {
2157       gphi *orig_phi = gsi_orig.phi ();
2158       gphi *update_phi = gsi_update.phi ();
2159
2160       /* Generate new phi node at merge bb of the guard.  */
2161       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2162       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2163
2164       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2165          args in NEW_PHI for these edges.  */
2166       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2167       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2168       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2169       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2170       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2171       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2172
2173       /* Update phi in UPDATE_PHI.  */
2174       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2175     }
2176 }
2177
2178 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2179    this function searches for the corresponding lcssa phi node in exit
2180    bb of LOOP.  If it is found, return the phi result; otherwise return
2181    NULL.  */
2182
2183 static tree
2184 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2185                 gphi *lcssa_phi)
2186 {
2187   gphi_iterator gsi;
2188   edge e = single_exit (loop);
2189
2190   gcc_assert (single_pred_p (e->dest));
2191   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2192     {
2193       gphi *phi = gsi.phi ();
2194       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2195                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2196         return PHI_RESULT (phi);
2197     }
2198   return NULL_TREE;
2199 }
2200
2201 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2202    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2203    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2204    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2205    The CFG looks like:
2206
2207      loop:
2208        header_a:
2209          i_1 = PHI<i_0, i_2>;
2210          ...
2211          i_2 = i_1 + 1;
2212          if (cond_a)
2213            goto latch_a;
2214          else
2215            goto exit_a;
2216        latch_a:
2217          goto header_a;
2218
2219        exit_a:
2220
2221        guard_bb:
2222          if (cond)
2223            goto merge_bb;
2224          else
2225            goto epilog_loop;
2226
2227        ;; fall_through_bb
2228
2229      epilog_loop:
2230        header_b:
2231          i_3 = PHI<i_2, i_4>;
2232          ...
2233          i_4 = i_3 + 1;
2234          if (cond_b)
2235            goto latch_b;
2236          else
2237            goto merge_bb;
2238        latch_b:
2239          goto header_b;
2240
2241        merge_bb:
2242          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2243
2244        exit_bb:
2245          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2246
2247    For each name used out side EPILOG (i.e - for each name that has a lcssa
2248    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2249    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2250    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2251    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2252    in exit_bb will also be updated.  */
2253
2254 static void
2255 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2256                                     edge guard_edge, edge merge_edge)
2257 {
2258   gphi_iterator gsi;
2259   basic_block merge_bb = guard_edge->dest;
2260
2261   gcc_assert (single_succ_p (merge_bb));
2262   edge e = single_succ_edge (merge_bb);
2263   basic_block exit_bb = e->dest;
2264   gcc_assert (single_pred_p (exit_bb));
2265   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2266
2267   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2268     {
2269       gphi *update_phi = gsi.phi ();
2270       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2271       /* This loop-closed-phi actually doesn't represent a use out of the
2272          loop - the phi arg is a constant.  */
2273       if (TREE_CODE (old_arg) != SSA_NAME)
2274         continue;
2275
2276       tree merge_arg = get_current_def (old_arg);
2277       if (!merge_arg)
2278         merge_arg = old_arg;
2279
2280       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2281       /* If the var is live after loop but not a reduction, we simply
2282          use the old arg.  */
2283       if (!guard_arg)
2284         guard_arg = old_arg;
2285
2286       /* Create new phi node in MERGE_BB:  */
2287       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2288       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2289
2290       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2291          the two PHI args in merge_phi for these edges.  */
2292       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2293       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2294
2295       /* Update the original phi in exit_bb.  */
2296       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2297     }
2298 }
2299
2300 /* EPILOG loop is duplicated from the original loop for vectorizing,
2301    the arg of its loop closed ssa PHI needs to be updated.  */
2302
2303 static void
2304 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2305 {
2306   gphi_iterator gsi;
2307   basic_block exit_bb = single_exit (epilog)->dest;
2308
2309   gcc_assert (single_pred_p (exit_bb));
2310   edge e = EDGE_PRED (exit_bb, 0);
2311   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2312     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2313 }
2314
2315 /* Function vect_do_peeling.
2316
2317    Input:
2318    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2319
2320        preheader:
2321      LOOP:
2322        header_bb:
2323          loop_body
2324          if (exit_loop_cond) goto exit_bb
2325          else                goto header_bb
2326        exit_bb:
2327
2328    - NITERS: The number of iterations of the loop.
2329    - NITERSM1: The number of iterations of the loop's latch.
2330    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2331    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2332                               CHECK_PROFITABILITY is true.
2333    Output:
2334    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2335      iterate after vectorization; see vect_set_loop_condition for details.
2336    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2337      should be set to the number of scalar iterations handled by the
2338      vector loop.  The SSA name is only used on exit from the loop.
2339
2340    This function peels prolog and epilog from the loop, adds guards skipping
2341    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2342    would look like:
2343
2344        guard_bb_1:
2345          if (prefer_scalar_loop) goto merge_bb_1
2346          else                    goto guard_bb_2
2347
2348        guard_bb_2:
2349          if (skip_prolog) goto merge_bb_2
2350          else             goto prolog_preheader
2351
2352        prolog_preheader:
2353      PROLOG:
2354        prolog_header_bb:
2355          prolog_body
2356          if (exit_prolog_cond) goto prolog_exit_bb
2357          else                  goto prolog_header_bb
2358        prolog_exit_bb:
2359
2360        merge_bb_2:
2361
2362        vector_preheader:
2363      VECTOR LOOP:
2364        vector_header_bb:
2365          vector_body
2366          if (exit_vector_cond) goto vector_exit_bb
2367          else                  goto vector_header_bb
2368        vector_exit_bb:
2369
2370        guard_bb_3:
2371          if (skip_epilog) goto merge_bb_3
2372          else             goto epilog_preheader
2373
2374        merge_bb_1:
2375
2376        epilog_preheader:
2377      EPILOG:
2378        epilog_header_bb:
2379          epilog_body
2380          if (exit_epilog_cond) goto merge_bb_3
2381          else                  goto epilog_header_bb
2382
2383        merge_bb_3:
2384
2385    Note this function peels prolog and epilog only if it's necessary,
2386    as well as guards.
2387    This function returns the epilogue loop if a decision was made to vectorize
2388    it, otherwise NULL.
2389
2390    The analysis resulting in this epilogue loop's loop_vec_info was performed
2391    in the same vect_analyze_loop call as the main loop's.  At that time
2392    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2393    vectorization factors than the main loop.  This list is stored in the main
2394    loop's loop_vec_info in the 'epilogue_vinfos' member.  Everytime we decide to
2395    vectorize the epilogue loop for a lower vectorization factor,  the
2396    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2397    updated and linked to the epilogue loop.  This is later used to vectorize
2398    the epilogue.  The reason the loop_vec_info needs updating is that it was
2399    constructed based on the original main loop, and the epilogue loop is a
2400    copy of this loop, so all links pointing to statements in the original loop
2401    need updating.  Furthermore, these loop_vec_infos share the
2402    data_reference's records, which will also need to be updated.
2403
2404    TODO: Guard for prefer_scalar_loop should be emitted along with
2405    versioning conditions if loop versioning is needed.  */
2406
2407
2408 class loop *
2409 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2410                  tree *niters_vector, tree *step_vector,
2411                  tree *niters_vector_mult_vf_var, int th,
2412                  bool check_profitability, bool niters_no_overflow,
2413                  tree *advance, drs_init_vec &orig_drs_init)
2414 {
2415   edge e, guard_e;
2416   tree type = TREE_TYPE (niters), guard_cond;
2417   basic_block guard_bb, guard_to;
2418   profile_probability prob_prolog, prob_vector, prob_epilog;
2419   int estimated_vf;
2420   int prolog_peeling = 0;
2421   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2422   /* We currently do not support prolog peeling if the target alignment is not
2423      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2424      target alignment being constant.  */
2425   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2426   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2427     return NULL;
2428
2429   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2430     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2431
2432   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2433   poly_uint64 bound_epilog = 0;
2434   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2435       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2436     bound_epilog += vf - 1;
2437   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2438     bound_epilog += 1;
2439   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2440   poly_uint64 bound_scalar = bound_epilog;
2441
2442   if (!prolog_peeling && !epilog_peeling)
2443     return NULL;
2444
2445   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2446   estimated_vf = vect_vf_for_cost (loop_vinfo);
2447   if (estimated_vf == 2)
2448     estimated_vf = 3;
2449   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2450                         .apply_scale (estimated_vf - 1, estimated_vf);
2451
2452   class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2453   class loop *first_loop = loop;
2454   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2455   create_lcssa_for_virtual_phi (loop);
2456   update_ssa (TODO_update_ssa_only_virtuals);
2457
2458   if (MAY_HAVE_DEBUG_BIND_STMTS)
2459     {
2460       gcc_assert (!adjust_vec.exists ());
2461       adjust_vec.create (32);
2462     }
2463   initialize_original_copy_tables ();
2464
2465   /* Record the anchor bb at which the guard should be placed if the scalar
2466      loop might be preferred.  */
2467   basic_block anchor = loop_preheader_edge (loop)->src;
2468
2469   /* Generate the number of iterations for the prolog loop.  We do this here
2470      so that we can also get the upper bound on the number of iterations.  */
2471   tree niters_prolog;
2472   int bound_prolog = 0;
2473   if (prolog_peeling)
2474     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2475                                                   &bound_prolog);
2476   else
2477     niters_prolog = build_int_cst (type, 0);
2478
2479   loop_vec_info epilogue_vinfo = NULL;
2480   if (vect_epilogues)
2481     {
2482       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2483       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2484     }
2485
2486   tree niters_vector_mult_vf = NULL_TREE;
2487   /* Saving NITERs before the loop, as this may be changed by prologue.  */
2488   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2489   edge update_e = NULL, skip_e = NULL;
2490   unsigned int lowest_vf = constant_lower_bound (vf);
2491   /* If we know the number of scalar iterations for the main loop we should
2492      check whether after the main loop there are enough iterations left over
2493      for the epilogue.  */
2494   if (vect_epilogues
2495       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2496       && prolog_peeling >= 0
2497       && known_eq (vf, lowest_vf))
2498     {
2499       unsigned HOST_WIDE_INT eiters
2500         = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2501            - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2502
2503       eiters -= prolog_peeling;
2504       eiters
2505         = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2506
2507       unsigned int ratio;
2508       while (!(constant_multiple_p (loop_vinfo->vector_size,
2509                                     epilogue_vinfo->vector_size, &ratio)
2510                && eiters >= lowest_vf / ratio))
2511         {
2512           delete epilogue_vinfo;
2513           epilogue_vinfo = NULL;
2514           if (loop_vinfo->epilogue_vinfos.length () == 0)
2515             {
2516               vect_epilogues = false;
2517               break;
2518             }
2519           epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2520           loop_vinfo->epilogue_vinfos.ordered_remove (0);
2521         }
2522     }
2523   /* Prolog loop may be skipped.  */
2524   bool skip_prolog = (prolog_peeling != 0);
2525   /* Skip this loop to epilog when there are not enough iterations to enter this
2526      vectorized loop.  If true we should perform runtime checks on the NITERS
2527      to check whether we should skip the current vectorized loop.  If we know
2528      the number of scalar iterations we may choose to add a runtime check if
2529      this number "maybe" smaller than the number of iterations required
2530      when we know the number of scalar iterations may potentially
2531      be smaller than the number of iterations required to enter this loop, for
2532      this we use the upper bounds on the prolog and epilog peeling.  When we
2533      don't know the number of iterations and don't require versioning it is
2534      because we have asserted that there are enough scalar iterations to enter
2535      the main loop, so this skip is not necessary.  When we are versioning then
2536      we only add such a skip if we have chosen to vectorize the epilogue.  */
2537   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2538                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2539                                   bound_prolog + bound_epilog)
2540                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2541                          || vect_epilogues));
2542   /* Epilog loop must be executed if the number of iterations for epilog
2543      loop is known at compile time, otherwise we need to add a check at
2544      the end of vector loop and skip to the end of epilog loop.  */
2545   bool skip_epilog = (prolog_peeling < 0
2546                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2547                       || !vf.is_constant ());
2548   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2549   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2550     skip_epilog = false;
2551
2552   if (skip_vector)
2553     {
2554       split_edge (loop_preheader_edge (loop));
2555
2556       /* Due to the order in which we peel prolog and epilog, we first
2557          propagate probability to the whole loop.  The purpose is to
2558          avoid adjusting probabilities of both prolog and vector loops
2559          separately.  Note in this case, the probability of epilog loop
2560          needs to be scaled back later.  */
2561       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2562       if (prob_vector.initialized_p ())
2563         {
2564           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2565           scale_loop_profile (loop, prob_vector, 0);
2566         }
2567     }
2568
2569   dump_user_location_t loop_loc = find_loop_location (loop);
2570   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2571   if (vect_epilogues)
2572     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2573        use the original scalar loop as remaining epilogue if necessary.  */
2574     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2575       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2576
2577   if (prolog_peeling)
2578     {
2579       e = loop_preheader_edge (loop);
2580       if (!slpeel_can_duplicate_loop_p (loop, e))
2581         {
2582           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2583                            "loop can't be duplicated to preheader edge.\n");
2584           gcc_unreachable ();
2585         }
2586       /* Peel prolog and put it on preheader edge of loop.  */
2587       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2588       if (!prolog)
2589         {
2590           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2591                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2592           gcc_unreachable ();
2593         }
2594       prolog->force_vectorize = false;
2595       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2596       first_loop = prolog;
2597       reset_original_copy_tables ();
2598
2599       /* Update the number of iterations for prolog loop.  */
2600       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2601       vect_set_loop_condition (prolog, NULL, niters_prolog,
2602                                step_prolog, NULL_TREE, false);
2603
2604       /* Skip the prolog loop.  */
2605       if (skip_prolog)
2606         {
2607           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2608                                     niters_prolog, build_int_cst (type, 0));
2609           guard_bb = loop_preheader_edge (prolog)->src;
2610           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2611           guard_to = split_edge (loop_preheader_edge (loop));
2612           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2613                                            guard_to, guard_bb,
2614                                            prob_prolog.invert (),
2615                                            irred_flag);
2616           e = EDGE_PRED (guard_to, 0);
2617           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2618           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2619
2620           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2621           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2622         }
2623
2624       /* Save original inits for each data_reference before advancing them with
2625          NITERS_PROLOG.  */
2626       unsigned int i;
2627       struct data_reference *dr;
2628       vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
2629       FOR_EACH_VEC_ELT (datarefs, i, dr)
2630         orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr)));
2631
2632       /* Update init address of DRs.  */
2633       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2634       /* Update niters for vector loop.  */
2635       LOOP_VINFO_NITERS (loop_vinfo)
2636         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2637       LOOP_VINFO_NITERSM1 (loop_vinfo)
2638         = fold_build2 (MINUS_EXPR, type,
2639                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2640       bool new_var_p = false;
2641       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2642       /* It's guaranteed that vector loop bound before vectorization is at
2643          least VF, so set range information for newly generated var.  */
2644       if (new_var_p)
2645         set_range_info (niters, VR_RANGE,
2646                         wi::to_wide (build_int_cst (type, vf)),
2647                         wi::to_wide (TYPE_MAX_VALUE (type)));
2648
2649       /* Prolog iterates at most bound_prolog times, latch iterates at
2650          most bound_prolog - 1 times.  */
2651       record_niter_bound (prolog, bound_prolog - 1, false, true);
2652       delete_update_ssa ();
2653       adjust_vec_debug_stmts ();
2654       scev_reset ();
2655     }
2656
2657   if (epilog_peeling)
2658     {
2659       e = single_exit (loop);
2660       if (!slpeel_can_duplicate_loop_p (loop, e))
2661         {
2662           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2663                            "loop can't be duplicated to exit edge.\n");
2664           gcc_unreachable ();
2665         }
2666       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2667          said epilog then we should use a copy of the main loop as a starting
2668          point.  This loop may have already had some preliminary transformations
2669          to allow for more optimal vectorization, for example if-conversion.
2670          If we are not vectorizing the epilog then we should use the scalar loop
2671          as the transformations mentioned above make less or no sense when not
2672          vectorizing.  */
2673       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2674       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2675       if (!epilog)
2676         {
2677           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2678                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2679           gcc_unreachable ();
2680         }
2681       epilog->force_vectorize = false;
2682       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2683
2684       /* Scalar version loop may be preferred.  In this case, add guard
2685          and skip to epilog.  Note this only happens when the number of
2686          iterations of loop is unknown at compile time, otherwise this
2687          won't be vectorized.  */
2688       if (skip_vector)
2689         {
2690           /* Additional epilogue iteration is peeled if gap exists.  */
2691           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2692                                                 bound_prolog, bound_epilog,
2693                                                 th, &bound_scalar,
2694                                                 check_profitability);
2695           /* Build guard against NITERSM1 since NITERS may overflow.  */
2696           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2697           guard_bb = anchor;
2698           guard_to = split_edge (loop_preheader_edge (epilog));
2699           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2700                                            guard_to, guard_bb,
2701                                            prob_vector.invert (),
2702                                            irred_flag);
2703           skip_e = guard_e;
2704           e = EDGE_PRED (guard_to, 0);
2705           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2706           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2707
2708           /* Simply propagate profile info from guard_bb to guard_to which is
2709              a merge point of control flow.  */
2710           guard_to->count = guard_bb->count;
2711
2712           /* Scale probability of epilog loop back.
2713              FIXME: We should avoid scaling down and back up.  Profile may
2714              get lost if we scale down to 0.  */
2715           basic_block *bbs = get_loop_body (epilog);
2716           for (unsigned int i = 0; i < epilog->num_nodes; i++)
2717             bbs[i]->count = bbs[i]->count.apply_scale
2718                                  (bbs[i]->count,
2719                                   bbs[i]->count.apply_probability
2720                                     (prob_vector));
2721           free (bbs);
2722         }
2723
2724       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2725       /* If loop is peeled for non-zero constant times, now niters refers to
2726          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2727          overflows.  */
2728       niters_no_overflow |= (prolog_peeling > 0);
2729       vect_gen_vector_loop_niters (loop_vinfo, niters,
2730                                    niters_vector, step_vector,
2731                                    niters_no_overflow);
2732       if (!integer_onep (*step_vector))
2733         {
2734           /* On exit from the loop we will have an easy way of calcalating
2735              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2736              until then.  */
2737           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2738           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2739           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2740         }
2741       else
2742         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2743                                              &niters_vector_mult_vf);
2744       /* Update IVs of original loop as if they were advanced by
2745          niters_vector_mult_vf steps.  */
2746       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2747       update_e = skip_vector ? e : loop_preheader_edge (epilog);
2748       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2749                                         update_e);
2750
2751       if (skip_epilog)
2752         {
2753           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2754                                     niters, niters_vector_mult_vf);
2755           guard_bb = single_exit (loop)->dest;
2756           guard_to = split_edge (single_exit (epilog));
2757           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2758                                            skip_vector ? anchor : guard_bb,
2759                                            prob_epilog.invert (),
2760                                            irred_flag);
2761           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2762                                               single_exit (epilog));
2763           /* Only need to handle basic block before epilog loop if it's not
2764              the guard_bb, which is the case when skip_vector is true.  */
2765           if (guard_bb != bb_before_epilog)
2766             {
2767               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2768
2769               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2770             }
2771           scale_loop_profile (epilog, prob_epilog, 0);
2772         }
2773       else
2774         slpeel_update_phi_nodes_for_lcssa (epilog);
2775
2776       unsigned HOST_WIDE_INT bound;
2777       if (bound_scalar.is_constant (&bound))
2778         {
2779           gcc_assert (bound != 0);
2780           /* -1 to convert loop iterations to latch iterations.  */
2781           record_niter_bound (epilog, bound - 1, false, true);
2782         }
2783
2784       delete_update_ssa ();
2785       adjust_vec_debug_stmts ();
2786       scev_reset ();
2787     }
2788
2789   if (vect_epilogues)
2790     {
2791       epilog->aux = epilogue_vinfo;
2792       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2793
2794       loop_constraint_clear (epilog, LOOP_C_INFINITE);
2795
2796       /* We now must calculate the number of NITERS performed by the previous
2797          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
2798       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2799                                  niters_prolog, niters_vector_mult_vf);
2800
2801       /* If skip_vector we may skip the previous loop, we insert a phi-node to
2802          determine whether we are coming from the previous vectorized loop
2803          using the update_e edge or the skip_vector basic block using the
2804          skip_e edge.  */
2805       if (skip_vector)
2806         {
2807           gcc_assert (update_e != NULL && skip_e != NULL);
2808           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2809                                            update_e->dest);
2810           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2811           gimple *stmt = gimple_build_assign (new_ssa, niters);
2812           gimple_stmt_iterator gsi;
2813           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2814               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2815             {
2816               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2817               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2818             }
2819           else
2820             {
2821               gsi = gsi_last_bb (update_e->src);
2822               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2823             }
2824
2825           niters = new_ssa;
2826           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2827           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2828                        UNKNOWN_LOCATION);
2829           niters = PHI_RESULT (new_phi);
2830         }
2831
2832       /* Subtract the number of iterations performed by the vectorized loop
2833          from the number of total iterations.  */
2834       tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2835                                           before_loop_niters,
2836                                           niters);
2837
2838       LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2839       LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2840         = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2841                        epilogue_niters,
2842                        build_one_cst (TREE_TYPE (epilogue_niters)));
2843
2844       /* Set ADVANCE to the number of iterations performed by the previous
2845          loop and its prologue.  */
2846       *advance = niters;
2847
2848       /* Redo the peeling for niter analysis as the NITERs and alignment
2849          may have been updated to take the main loop into account.  */
2850       determine_peel_for_niter (epilogue_vinfo);
2851     }
2852
2853   adjust_vec.release ();
2854   free_original_copy_tables ();
2855
2856   return vect_epilogues ? epilog : NULL;
2857 }
2858
2859 /* Function vect_create_cond_for_niters_checks.
2860
2861    Create a conditional expression that represents the run-time checks for
2862    loop's niter.  The loop is guaranteed to terminate if the run-time
2863    checks hold.
2864
2865    Input:
2866    COND_EXPR  - input conditional expression.  New conditions will be chained
2867                 with logical AND operation.  If it is NULL, then the function
2868                 is used to return the number of alias checks.
2869    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2870                 to be checked.
2871
2872    Output:
2873    COND_EXPR - conditional expression.
2874
2875    The returned COND_EXPR is the conditional expression to be used in the
2876    if statement that controls which version of the loop gets executed at
2877    runtime.  */
2878
2879 static void
2880 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2881 {
2882   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2883
2884   if (*cond_expr)
2885     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2886                               *cond_expr, part_cond_expr);
2887   else
2888     *cond_expr = part_cond_expr;
2889 }
2890
2891 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2892    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
2893
2894 static void
2895 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2896 {
2897   if (*cond_expr)
2898     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2899                               *cond_expr, part_cond_expr);
2900   else
2901     *cond_expr = part_cond_expr;
2902 }
2903
2904 /* Function vect_create_cond_for_align_checks.
2905
2906    Create a conditional expression that represents the alignment checks for
2907    all of data references (array element references) whose alignment must be
2908    checked at runtime.
2909
2910    Input:
2911    COND_EXPR  - input conditional expression.  New conditions will be chained
2912                 with logical AND operation.
2913    LOOP_VINFO - two fields of the loop information are used.
2914                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2915                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2916
2917    Output:
2918    COND_EXPR_STMT_LIST - statements needed to construct the conditional
2919                          expression.
2920    The returned value is the conditional expression to be used in the if
2921    statement that controls which version of the loop gets executed at runtime.
2922
2923    The algorithm makes two assumptions:
2924      1) The number of bytes "n" in a vector is a power of 2.
2925      2) An address "a" is aligned if a%n is zero and that this
2926         test can be done as a&(n-1) == 0.  For example, for 16
2927         byte vectors the test is a&0xf == 0.  */
2928
2929 static void
2930 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2931                                    tree *cond_expr,
2932                                    gimple_seq *cond_expr_stmt_list)
2933 {
2934   vec<stmt_vec_info> may_misalign_stmts
2935     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2936   stmt_vec_info stmt_info;
2937   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2938   tree mask_cst;
2939   unsigned int i;
2940   tree int_ptrsize_type;
2941   char tmp_name[20];
2942   tree or_tmp_name = NULL_TREE;
2943   tree and_tmp_name;
2944   gimple *and_stmt;
2945   tree ptrsize_zero;
2946   tree part_cond_expr;
2947
2948   /* Check that mask is one less than a power of 2, i.e., mask is
2949      all zeros followed by all ones.  */
2950   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2951
2952   int_ptrsize_type = signed_type_for (ptr_type_node);
2953
2954   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
2955      of the first vector of the i'th data reference. */
2956
2957   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2958     {
2959       gimple_seq new_stmt_list = NULL;
2960       tree addr_base;
2961       tree addr_tmp_name;
2962       tree new_or_tmp_name;
2963       gimple *addr_stmt, *or_stmt;
2964       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2965       bool negative = tree_int_cst_compare
2966         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2967       tree offset = negative
2968         ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2969
2970       /* create: addr_tmp = (int)(address_of_first_vector) */
2971       addr_base =
2972         vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2973                                               offset);
2974       if (new_stmt_list != NULL)
2975         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2976
2977       sprintf (tmp_name, "addr2int%d", i);
2978       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2979       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2980       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2981
2982       /* The addresses are OR together.  */
2983
2984       if (or_tmp_name != NULL_TREE)
2985         {
2986           /* create: or_tmp = or_tmp | addr_tmp */
2987           sprintf (tmp_name, "orptrs%d", i);
2988           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2989           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2990                                          or_tmp_name, addr_tmp_name);
2991           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2992           or_tmp_name = new_or_tmp_name;
2993         }
2994       else
2995         or_tmp_name = addr_tmp_name;
2996
2997     } /* end for i */
2998
2999   mask_cst = build_int_cst (int_ptrsize_type, mask);
3000
3001   /* create: and_tmp = or_tmp & mask  */
3002   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3003
3004   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3005                                   or_tmp_name, mask_cst);
3006   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3007
3008   /* Make and_tmp the left operand of the conditional test against zero.
3009      if and_tmp has a nonzero bit then some address is unaligned.  */
3010   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3011   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3012                                 and_tmp_name, ptrsize_zero);
3013   chain_cond_expr (cond_expr, part_cond_expr);
3014 }
3015
3016 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3017    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3018    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3019    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3020
3021 static void
3022 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3023 {
3024   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3025   unsigned int i;
3026   vec_object_pair *pair;
3027   FOR_EACH_VEC_ELT (pairs, i, pair)
3028     {
3029       tree addr1 = build_fold_addr_expr (pair->first);
3030       tree addr2 = build_fold_addr_expr (pair->second);
3031       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3032                                          addr1, addr2);
3033       chain_cond_expr (cond_expr, part_cond_expr);
3034     }
3035 }
3036
3037 /* Create an expression that is true when all lower-bound conditions for
3038    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3039
3040 static void
3041 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3042 {
3043   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3044   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3045     {
3046       tree expr = lower_bounds[i].expr;
3047       tree type = unsigned_type_for (TREE_TYPE (expr));
3048       expr = fold_convert (type, expr);
3049       poly_uint64 bound = lower_bounds[i].min_value;
3050       if (!lower_bounds[i].unsigned_p)
3051         {
3052           expr = fold_build2 (PLUS_EXPR, type, expr,
3053                               build_int_cstu (type, bound - 1));
3054           bound += bound - 1;
3055         }
3056       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3057                                          build_int_cstu (type, bound));
3058       chain_cond_expr (cond_expr, part_cond_expr);
3059     }
3060 }
3061
3062 /* Function vect_create_cond_for_alias_checks.
3063
3064    Create a conditional expression that represents the run-time checks for
3065    overlapping of address ranges represented by a list of data references
3066    relations passed as input.
3067
3068    Input:
3069    COND_EXPR  - input conditional expression.  New conditions will be chained
3070                 with logical AND operation.  If it is NULL, then the function
3071                 is used to return the number of alias checks.
3072    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3073                 to be checked.
3074
3075    Output:
3076    COND_EXPR - conditional expression.
3077
3078    The returned COND_EXPR is the conditional expression to be used in the if
3079    statement that controls which version of the loop gets executed at runtime.
3080 */
3081
3082 void
3083 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3084 {
3085   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3086     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3087
3088   if (comp_alias_ddrs.is_empty ())
3089     return;
3090
3091   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3092                                &comp_alias_ddrs, cond_expr);
3093   if (dump_enabled_p ())
3094     dump_printf_loc (MSG_NOTE, vect_location,
3095                      "created %u versioning for alias checks.\n",
3096                      comp_alias_ddrs.length ());
3097 }
3098
3099
3100 /* Function vect_loop_versioning.
3101
3102    If the loop has data references that may or may not be aligned or/and
3103    has data reference relations whose independence was not proven then
3104    two versions of the loop need to be generated, one which is vectorized
3105    and one which isn't.  A test is then generated to control which of the
3106    loops is executed.  The test checks for the alignment of all of the
3107    data references that may or may not be aligned.  An additional
3108    sequence of runtime tests is generated for each pairs of DDRs whose
3109    independence was not proven.  The vectorized version of loop is
3110    executed only if both alias and alignment tests are passed.
3111
3112    The test generated to check which version of loop is executed
3113    is modified to also check for profitability as indicated by the
3114    cost model threshold TH.
3115
3116    The versioning precondition(s) are placed in *COND_EXPR and
3117    *COND_EXPR_STMT_LIST.  */
3118
3119 class loop *
3120 vect_loop_versioning (loop_vec_info loop_vinfo)
3121 {
3122   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3123   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3124   basic_block condition_bb;
3125   gphi_iterator gsi;
3126   gimple_stmt_iterator cond_exp_gsi;
3127   basic_block merge_bb;
3128   basic_block new_exit_bb;
3129   edge new_exit_e, e;
3130   gphi *orig_phi, *new_phi;
3131   tree cond_expr = NULL_TREE;
3132   gimple_seq cond_expr_stmt_list = NULL;
3133   tree arg;
3134   profile_probability prob = profile_probability::likely ();
3135   gimple_seq gimplify_stmt_list = NULL;
3136   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3137   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3138   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3139   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3140   poly_uint64 versioning_threshold
3141     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3142   tree version_simd_if_cond
3143     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3144   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3145
3146   if (th >= vect_vf_for_cost (loop_vinfo)
3147       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3148       && !ordered_p (th, versioning_threshold))
3149     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3150                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3151                                             th - 1));
3152   if (maybe_ne (versioning_threshold, 0U))
3153     {
3154       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3155                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3156                                               versioning_threshold - 1));
3157       if (cond_expr)
3158         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3159                                  expr, cond_expr);
3160       else
3161         cond_expr = expr;
3162     }
3163
3164   if (version_niter)
3165     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3166
3167   if (cond_expr)
3168     cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3169                                         &cond_expr_stmt_list,
3170                                         is_gimple_condexpr, NULL_TREE);
3171
3172   if (version_align)
3173     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3174                                        &cond_expr_stmt_list);
3175
3176   if (version_alias)
3177     {
3178       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3179       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3180       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3181     }
3182
3183   if (version_simd_if_cond)
3184     {
3185       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3186       if (flag_checking)
3187         if (basic_block bb
3188             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3189           gcc_assert (bb != loop->header
3190                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3191                       && (scalar_loop == NULL
3192                           || (bb != scalar_loop->header
3193                               && dominated_by_p (CDI_DOMINATORS,
3194                                                  scalar_loop->header, bb))));
3195       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3196       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3197                             version_simd_if_cond, zero);
3198       if (cond_expr)
3199         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3200                                  c, cond_expr);
3201       else
3202         cond_expr = c;
3203       if (dump_enabled_p ())
3204         dump_printf_loc (MSG_NOTE, vect_location,
3205                          "created versioning for simd if condition check.\n");
3206     }
3207
3208   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3209                                       &gimplify_stmt_list,
3210                                       is_gimple_condexpr, NULL_TREE);
3211   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3212
3213   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3214      invariant in.  */
3215   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3216   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3217        !gsi_end_p (gsi); gsi_next (&gsi))
3218     {
3219       gimple *stmt = gsi_stmt (gsi);
3220       update_stmt (stmt);
3221       ssa_op_iter iter;
3222       use_operand_p use_p;
3223       basic_block def_bb;
3224       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3225         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3226             && flow_bb_inside_loop_p (outermost, def_bb))
3227           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3228     }
3229
3230   /* Search for the outermost loop we can version.  Avoid versioning of
3231      non-perfect nests but allow if-conversion versioned loops inside.  */
3232   class loop *loop_to_version = loop;
3233   if (flow_loop_nested_p (outermost, loop))
3234     {
3235       if (dump_enabled_p ())
3236         dump_printf_loc (MSG_NOTE, vect_location,
3237                          "trying to apply versioning to outer loop %d\n",
3238                          outermost->num);
3239       if (outermost->num == 0)
3240         outermost = superloop_at_depth (loop, 1);
3241       /* And avoid applying versioning on non-perfect nests.  */
3242       while (loop_to_version != outermost
3243              && single_exit (loop_outer (loop_to_version))
3244              && (!loop_outer (loop_to_version)->inner->next
3245                  || vect_loop_vectorized_call (loop_to_version))
3246              && (!loop_outer (loop_to_version)->inner->next
3247                  || !loop_outer (loop_to_version)->inner->next->next))
3248         loop_to_version = loop_outer (loop_to_version);
3249     }
3250
3251   /* Apply versioning.  If there is already a scalar version created by
3252      if-conversion re-use that.  Note we cannot re-use the copy of
3253      an if-converted outer-loop when vectorizing the inner loop only.  */
3254   gcond *cond;
3255   gimple *call;
3256   if ((!loop_to_version->inner || loop == loop_to_version)
3257       && (call = vect_loop_vectorized_call (loop_to_version, &cond)))
3258     {
3259       gcc_assert (scalar_loop);
3260       condition_bb = gimple_bb (cond);
3261       gimple_cond_set_condition_from_tree (cond, cond_expr);
3262       update_stmt (cond);
3263
3264       if (cond_expr_stmt_list)
3265         {
3266           cond_exp_gsi = gsi_for_stmt (call);
3267           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3268                                  GSI_SAME_STMT);
3269         }
3270
3271       /* if-conversion uses profile_probability::always () for both paths,
3272          reset the paths probabilities appropriately.  */
3273       edge te, fe;
3274       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3275       te->probability = prob;
3276       fe->probability = prob.invert ();
3277       /* We can scale loops counts immediately but have to postpone
3278          scaling the scalar loop because we re-use it during peeling.  */
3279       scale_loop_frequencies (loop_to_version, te->probability);
3280       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3281
3282       nloop = scalar_loop;
3283       if (dump_enabled_p ())
3284         dump_printf_loc (MSG_NOTE, vect_location,
3285                          "reusing %sloop version created by if conversion\n",
3286                          loop_to_version != loop ? "outer " : "");
3287     }
3288   else
3289     {
3290       if (loop_to_version != loop
3291           && dump_enabled_p ())
3292         dump_printf_loc (MSG_NOTE, vect_location,
3293                          "applying loop versioning to outer loop %d\n",
3294                          loop_to_version->num);
3295
3296       initialize_original_copy_tables ();
3297       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3298                             prob, prob.invert (), prob, prob.invert (), true);
3299       gcc_assert (nloop);
3300       nloop = get_loop_copy (loop);
3301
3302       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3303          reap those otherwise;  they also refer to the original
3304          loops.  */
3305       class loop *l = loop;
3306       while (gimple *call = vect_loop_vectorized_call (l))
3307         {
3308           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3309           fold_loop_internal_call (call, boolean_false_node);
3310           l = loop_outer (l);
3311         }
3312       free_original_copy_tables ();
3313
3314       if (cond_expr_stmt_list)
3315         {
3316           cond_exp_gsi = gsi_last_bb (condition_bb);
3317           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3318                                  GSI_SAME_STMT);
3319         }
3320
3321       /* Loop versioning violates an assumption we try to maintain during
3322          vectorization - that the loop exit block has a single predecessor.
3323          After versioning, the exit block of both loop versions is the same
3324          basic block (i.e. it has two predecessors). Just in order to simplify
3325          following transformations in the vectorizer, we fix this situation
3326          here by adding a new (empty) block on the exit-edge of the loop,
3327          with the proper loop-exit phis to maintain loop-closed-form.
3328          If loop versioning wasn't done from loop, but scalar_loop instead,
3329          merge_bb will have already just a single successor.  */
3330
3331       merge_bb = single_exit (loop_to_version)->dest;
3332       if (EDGE_COUNT (merge_bb->preds) >= 2)
3333         {
3334           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3335           new_exit_bb = split_edge (single_exit (loop_to_version));
3336           new_exit_e = single_exit (loop_to_version);
3337           e = EDGE_SUCC (new_exit_bb, 0);
3338
3339           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3340                gsi_next (&gsi))
3341             {
3342               tree new_res;
3343               orig_phi = gsi.phi ();
3344               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3345               new_phi = create_phi_node (new_res, new_exit_bb);
3346               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3347               add_phi_arg (new_phi, arg, new_exit_e,
3348                            gimple_phi_arg_location_from_edge (orig_phi, e));
3349               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3350             }
3351         }
3352
3353       update_ssa (TODO_update_ssa);
3354     }
3355
3356   if (version_niter)
3357     {
3358       /* The versioned loop could be infinite, we need to clear existing
3359          niter information which is copied from the original loop.  */
3360       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3361       vect_free_loop_info_assumptions (nloop);
3362       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3363       loop_constraint_set (loop, LOOP_C_INFINITE);
3364     }
3365
3366   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3367       && dump_enabled_p ())
3368     {
3369       if (version_alias)
3370         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3371                          vect_location,
3372                          "loop versioned for vectorization because of "
3373                          "possible aliasing\n");
3374       if (version_align)
3375         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3376                          vect_location,
3377                          "loop versioned for vectorization to enhance "
3378                          "alignment\n");
3379
3380     }
3381
3382   return nloop;
3383 }