[gcc.git] / gcc / tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50
51 /*************************************************************************
52 Simple Loop Peeling Utilities
53
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58 /* Renames the use *OP_P. */
59
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63 tree new_name;
64
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
67
68 new_name = get_current_def (USE_FROM_PTR (op_p));
69
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
73
74 /* An ordinary ssa name defined in the loop. */
75
76 SET_USE (op_p, new_name);
77 }
78
79
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
82 true. */
83
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 class loop *loop = bb->loop_father;
93 class loop *outer_loop = NULL;
94
95 if (rename_from_outer_loop)
96 {
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
99 }
100
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
103 {
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
107 }
108
109 FOR_EACH_EDGE (e, ei, bb->preds)
110 {
111 if (!flow_bb_inside_loop_p (loop, e->src))
112 {
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
116 {
117 if (outer_loop->inner->next)
118 {
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides, via
121 LOOP_VECTORIZED, which of the two loops to use. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
125 }
126 }
127 }
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131 }
132 }
133
134
135 struct adjust_info
136 {
137 tree from, to;
138 basic_block bb;
139 };
140
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
148
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
152
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168 {
169 use_operand_p use_p;
170 basic_block bbuse;
171
172 if (!is_gimple_debug (stmt))
173 continue;
174
175 gcc_assert (gimple_debug_bind_p (stmt));
176
177 bbuse = gimple_bb (stmt);
178
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 {
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
188 {
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
191 }
192 }
193 }
194 }
195
196 /* Adjust debug stmts as scheduled before. */
197
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
203
204 gcc_assert (adjust_vec.exists ());
205
206 while (!adjust_vec.is_empty ())
207 {
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
210 }
211 }
212
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
217
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221 adjust_info ai;
222
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
227 {
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
231
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
236 }
237 }
238
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
243
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
254 }
255
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
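/* An illustrative sketch (SSA version numbers are hypothetical) of the
   header PHI this creates:

     mask_1 = PHI <init_mask_2 (preheader), next_mask_3 (latch)>  */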
259
260 static void
261 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
262 tree next_mask)
263 {
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
268
269 /* Add SEQ to the end of LOOP's preheader block. */
270
271 static void
272 add_preheader_seq (class loop *loop, gimple_seq seq)
273 {
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280 }
281
282 /* Add SEQ to the beginning of LOOP's header block. */
283
284 static void
285 add_header_seq (class loop *loop, gimple_seq seq)
286 {
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292 }
293
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
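/* As a purely illustrative example, with two V4 inputs the permutation
   built for OFFSET == 0 selects elements { 0, 4, 1, 5 } and the one for
   OFFSET == 1 selects { 2, 6, 3, 7 }.  */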
298
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302 {
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
314
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
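/* For instance (a sketch, assuming hypothetical masks D0..D3 in DEST_RGM
   and S0, S1 in SRC_RGM), the unpack path below would emit roughly:

     D0 = VEC_UNPACK_LO_EXPR <S0>;   D1 = VEC_UNPACK_HI_EXPR <S0>;
     D2 = VEC_UNPACK_LO_EXPR <S1>;   D3 = VEC_UNPACK_HI_EXPR <S1>;

   (with HI and LO swapped for big-endian targets, and with an extra
   VIEW_CONVERT_EXPR when the unpacked type differs from DEST_RGM's
   mask type).  */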
318
319 static bool
320 vect_maybe_permute_loop_masks (loop_vec_info loop_vinfo, gimple_seq *seq,
321 rgroup_masks *dest_rgm,
322 rgroup_masks *src_rgm)
323 {
324 tree src_masktype = src_rgm->mask_type;
325 tree dest_masktype = dest_rgm->mask_type;
326 machine_mode src_mode = TYPE_MODE (src_masktype);
327 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
328 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
329 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
330 {
331 /* Unpacking the source masks gives at least as many mask bits as
332 we need. We can then VIEW_CONVERT any excess bits away. */
333 tree unpack_masktype = vect_halve_mask_nunits (loop_vinfo, src_masktype);
334 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
335 {
336 tree src = src_rgm->masks[i / 2];
337 tree dest = dest_rgm->masks[i];
338 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
339 ? VEC_UNPACK_HI_EXPR
340 : VEC_UNPACK_LO_EXPR);
341 gassign *stmt;
342 if (dest_masktype == unpack_masktype)
343 stmt = gimple_build_assign (dest, code, src);
344 else
345 {
346 tree temp = make_ssa_name (unpack_masktype);
347 stmt = gimple_build_assign (temp, code, src);
348 gimple_seq_add_stmt (seq, stmt);
349 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
350 build1 (VIEW_CONVERT_EXPR,
351 dest_masktype, temp));
352 }
353 gimple_seq_add_stmt (seq, stmt);
354 }
355 return true;
356 }
357 vec_perm_indices indices[2];
358 if (dest_masktype == src_masktype
359 && interleave_supported_p (&indices[0], src_masktype, 0)
360 && interleave_supported_p (&indices[1], src_masktype, 1))
361 {
362 /* The destination requires twice as many mask bits as the source, so
363 we can use interleaving permutes to double up the number of bits. */
364 tree masks[2];
365 for (unsigned int i = 0; i < 2; ++i)
366 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
367 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
368 {
369 tree src = src_rgm->masks[i / 2];
370 tree dest = dest_rgm->masks[i];
371 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
372 src, src, masks[i & 1]);
373 gimple_seq_add_stmt (seq, stmt);
374 }
375 return true;
376 }
377 return false;
378 }
379
380 /* Helper for vect_set_loop_condition_masked. Generate definitions for
381 all the masks in RGM and return a mask that is nonzero when the loop
382 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
383 Use LOOP_COND_GSI to insert code before the exit gcond.
384
385 RGM belongs to loop LOOP. The loop originally iterated NITERS
386 times and has been vectorized according to LOOP_VINFO.
387
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
392
393 It is known that:
394
395 NITERS * RGM->max_nscalars_per_iter
396
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
399
400 VF * RGM->max_nscalars_per_iter
401
402 might overflow before hitting a value above:
403
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
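/* A hedged illustration of the masks this produces: with a single
   4-element mask per iteration, NSCALARS_PER_ITER == 1, no NITERS_SKIP,
   MIGHT_WRAP_P false and NITERS == 10, the three vector iterations use
   the masks {1,1,1,1}, {1,1,1,1} and {1,1,0,0}, after which the next
   mask is all-false and the loop exits.  */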
408
409 static tree
410 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree niters, tree niters_skip,
414 bool might_wrap_p)
415 {
416 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
417 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
418 tree mask_type = rgm->mask_type;
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
422
423 /* Calculate the maximum number of scalar values that the rgroup
424 handles in total, the number that it handles for each iteration
425 of the vector loop, and the number that it should skip during the
426 first iteration of the vector loop. */
427 tree nscalars_total = niters;
428 tree nscalars_step = build_int_cst (iv_type, vf);
429 tree nscalars_skip = niters_skip;
430 if (nscalars_per_iter != 1)
431 {
432 /* We checked before choosing to use a fully-masked loop that these
433 multiplications don't overflow. */
434 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
435 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
436 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 nscalars_total, compare_factor);
438 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
439 nscalars_step, iv_factor);
440 if (nscalars_skip)
441 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
442 nscalars_skip, compare_factor);
443 }
444
445 /* Create an induction variable that counts the number of scalars
446 processed. */
447 tree index_before_incr, index_after_incr;
448 gimple_stmt_iterator incr_gsi;
449 bool insert_after;
450 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
451 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
452 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
453
454 tree zero_index = build_int_cst (compare_type, 0);
455 tree test_index, test_limit, first_limit;
456 gimple_stmt_iterator *test_gsi;
457 if (might_wrap_p)
458 {
459 /* In principle the loop should stop iterating once the incremented
460 IV reaches a value greater than or equal to:
461
462 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
463
464 However, there's no guarantee that this addition doesn't overflow
465 the comparison type, or that the IV hits a value above it before
466 wrapping around. We therefore adjust the limit down by one
467 IV step:
468
469 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
470 -[infinite-prec] NSCALARS_STEP
471
472 and compare the IV against this limit _before_ incrementing it.
473 Since the comparison type is unsigned, we actually want the
474 subtraction to saturate at zero:
475
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[sat] NSCALARS_STEP
478
479 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
480
481 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
482
483 where the rightmost subtraction can be done directly in
484 COMPARE_TYPE. */
485 test_index = index_before_incr;
486 tree adjust = gimple_convert (preheader_seq, compare_type,
487 nscalars_step);
488 if (nscalars_skip)
489 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
490 adjust, nscalars_skip);
491 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
492 nscalars_total, adjust);
493 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
494 test_limit, adjust);
495 test_gsi = &incr_gsi;
496
497 /* Get a safe limit for the first iteration. */
498 if (nscalars_skip)
499 {
500 /* The first vector iteration can handle at most NSCALARS_STEP
501 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
502 NSCALARS_SKIP to that cannot overflow. */
503 tree const_limit = build_int_cst (compare_type,
504 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
505 * nscalars_per_iter);
506 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
507 nscalars_total, const_limit);
508 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
509 first_limit, nscalars_skip);
510 }
511 else
512 /* For the first iteration it doesn't matter whether the IV hits
513 a value above NSCALARS_TOTAL. That only matters for the latch
514 condition. */
515 first_limit = nscalars_total;
516 }
517 else
518 {
519 /* Test the incremented IV, which will always hit a value above
520 the bound before wrapping. */
521 test_index = index_after_incr;
522 test_limit = nscalars_total;
523 if (nscalars_skip)
524 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
525 test_limit, nscalars_skip);
526 test_gsi = &loop_cond_gsi;
527
528 first_limit = test_limit;
529 }
530
531 /* Convert the IV value to the comparison type (either a no-op or
532 a demotion). */
533 gimple_seq test_seq = NULL;
534 test_index = gimple_convert (&test_seq, compare_type, test_index);
535 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
536
537 /* Provide a definition of each mask in the group. */
538 tree next_mask = NULL_TREE;
539 tree mask;
540 unsigned int i;
541 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
542 {
543 /* Previous masks will cover BIAS scalars. This mask covers the
544 next batch. */
545 poly_uint64 bias = nscalars_per_mask * i;
546 tree bias_tree = build_int_cst (compare_type, bias);
547 gimple *tmp_stmt;
548
549 /* See whether the first iteration of the vector loop is known
550 to have a full mask. */
551 poly_uint64 const_limit;
552 bool first_iteration_full
553 = (poly_int_tree_p (first_limit, &const_limit)
554 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
555
556 /* Rather than have a new IV that starts at BIAS and goes up to
557 TEST_LIMIT, prefer to use the same 0-based IV for each mask
558 and adjust the bound down by BIAS. */
559 tree this_test_limit = test_limit;
560 if (i != 0)
561 {
562 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
563 compare_type, this_test_limit,
564 bias_tree);
565 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
566 compare_type, this_test_limit,
567 bias_tree);
568 }
569
570 /* Create the initial mask. First include all scalars that
571 are within the loop limit. */
572 tree init_mask = NULL_TREE;
573 if (!first_iteration_full)
574 {
575 tree start, end;
576 if (first_limit == test_limit)
577 {
578 /* Use a natural test between zero (the initial IV value)
579 and the loop limit. The "else" block would be valid too,
580 but this choice can avoid the need to load BIAS_TREE into
581 a register. */
582 start = zero_index;
583 end = this_test_limit;
584 }
585 else
586 {
587 /* FIRST_LIMIT is the maximum number of scalars handled by the
588 first iteration of the vector loop. Test the portion
589 associated with this mask. */
590 start = bias_tree;
591 end = first_limit;
592 }
593
594 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
595 tmp_stmt = vect_gen_while (init_mask, start, end);
596 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
597 }
598
599 /* Now AND out the bits that are within the number of skipped
600 scalars. */
601 poly_uint64 const_skip;
602 if (nscalars_skip
603 && !(poly_int_tree_p (nscalars_skip, &const_skip)
604 && known_le (const_skip, bias)))
605 {
606 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
607 bias_tree, nscalars_skip);
608 if (init_mask)
609 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
610 init_mask, unskipped_mask);
611 else
612 init_mask = unskipped_mask;
613 }
614
615 if (!init_mask)
616 /* First iteration is full. */
617 init_mask = build_minus_one_cst (mask_type);
618
619 /* Get the mask value for the next iteration of the loop. */
620 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
621 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
622 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
623
624 vect_set_loop_mask (loop, mask, init_mask, next_mask);
625 }
626 return next_mask;
627 }
628
629 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
630 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
631 number of iterations of the original scalar loop that should be
632 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
633 as for vect_set_loop_condition.
634
635 Insert the branch-back condition before LOOP_COND_GSI and return the
636 final gcond. */
637
638 static gcond *
639 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
640 tree niters, tree final_iv,
641 bool niters_maybe_zero,
642 gimple_stmt_iterator loop_cond_gsi)
643 {
644 gimple_seq preheader_seq = NULL;
645 gimple_seq header_seq = NULL;
646
647 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
648 unsigned int compare_precision = TYPE_PRECISION (compare_type);
649 tree orig_niters = niters;
650
651 /* Type of the initial value of NITERS. */
652 tree ni_actual_type = TREE_TYPE (niters);
653 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
654 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
655
656 /* Convert NITERS to the same size as the compare. */
657 if (compare_precision > ni_actual_precision
658 && niters_maybe_zero)
659 {
660 /* We know that there is always at least one iteration, so if the
661 count is zero then it must have wrapped. Cope with this by
662 subtracting 1 before the conversion and adding 1 to the result. */
663 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
664 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
665 niters, build_minus_one_cst (ni_actual_type));
666 niters = gimple_convert (&preheader_seq, compare_type, niters);
667 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
668 niters, build_one_cst (compare_type));
669 }
670 else
671 niters = gimple_convert (&preheader_seq, compare_type, niters);
672
673 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
674
675 /* Iterate over all the rgroups and fill in their masks. We could use
676 the first mask from any rgroup for the loop condition; here we
677 arbitrarily pick the last. */
678 tree test_mask = NULL_TREE;
679 rgroup_masks *rgm;
680 unsigned int i;
681 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
682 FOR_EACH_VEC_ELT (*masks, i, rgm)
683 if (!rgm->masks.is_empty ())
684 {
685 /* First try using permutes. This adds a single vector
686 instruction to the loop for each mask, but needs no extra
687 loop invariants or IVs. */
688 unsigned int nmasks = i + 1;
689 if ((nmasks & 1) == 0)
690 {
691 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
692 if (!half_rgm->masks.is_empty ()
693 && vect_maybe_permute_loop_masks (loop_vinfo, &header_seq,
694 rgm, half_rgm))
695 continue;
696 }
697
698 /* See whether zero-based IV would ever generate all-false masks
699 before wrapping around. */
700 bool might_wrap_p
701 = (iv_limit == -1
702 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
703 UNSIGNED)
704 > compare_precision));
705
706 /* Set up all masks for this group. */
707 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
708 &preheader_seq,
709 loop_cond_gsi, rgm,
710 niters, niters_skip,
711 might_wrap_p);
712 }
713
714 /* Emit all accumulated statements. */
715 add_preheader_seq (loop, preheader_seq);
716 add_header_seq (loop, header_seq);
717
718 /* Get a boolean result that tells us whether to iterate. */
719 edge exit_edge = single_exit (loop);
720 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
721 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
722 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
723 NULL_TREE, NULL_TREE);
724 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
725
726 /* The loop iterates (NITERS - 1) / VF + 1 times.
727 Subtract one from this to get the latch count. */
728 tree step = build_int_cst (compare_type,
729 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
730 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
731 build_minus_one_cst (compare_type));
732 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
733 niters_minus_one, step);
734
735 if (final_iv)
736 {
737 gassign *assign = gimple_build_assign (final_iv, orig_niters);
738 gsi_insert_on_edge_immediate (single_exit (loop), assign);
739 }
740
741 return cond_stmt;
742 }
743
744 /* Like vect_set_loop_condition, but handle the case in which there
745 are no loop masks. */
746
747 static gcond *
748 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
749 tree step, tree final_iv,
750 bool niters_maybe_zero,
751 gimple_stmt_iterator loop_cond_gsi)
752 {
753 tree indx_before_incr, indx_after_incr;
754 gcond *cond_stmt;
755 gcond *orig_cond;
756 edge pe = loop_preheader_edge (loop);
757 edge exit_edge = single_exit (loop);
758 gimple_stmt_iterator incr_gsi;
759 bool insert_after;
760 enum tree_code code;
761 tree niters_type = TREE_TYPE (niters);
762
763 orig_cond = get_loop_exit_condition (loop);
764 gcc_assert (orig_cond);
765 loop_cond_gsi = gsi_for_stmt (orig_cond);
766
767 tree init, limit;
768 if (!niters_maybe_zero && integer_onep (step))
769 {
770 /* In this case we can use a simple 0-based IV:
771
772 A:
773 x = 0;
774 do
775 {
776 ...
777 x += 1;
778 }
779 while (x < NITERS); */
780 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
781 init = build_zero_cst (niters_type);
782 limit = niters;
783 }
784 else
785 {
786 /* The following works for all values of NITERS except 0:
787
788 B:
789 x = 0;
790 do
791 {
792 ...
793 x += STEP;
794 }
795 while (x <= NITERS - STEP);
796
797 so that the loop continues to iterate if x + STEP - 1 < NITERS
798 but stops if x + STEP - 1 >= NITERS.
799
800 However, if NITERS is zero, x never hits a value above NITERS - STEP
801 before wrapping around. There are two obvious ways of dealing with
802 this:
803
804 - start at STEP - 1 and compare x before incrementing it
805 - start at -1 and compare x after incrementing it
806
807 The latter is simpler and is what we use. The loop in this case
808 looks like:
809
810 C:
811 x = -1;
812 do
813 {
814 ...
815 x += STEP;
816 }
817 while (x < NITERS - STEP);
818
819 In both cases the loop limit is NITERS - STEP. */
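      /* For example (numbers purely illustrative): NITERS == 10 and
	 STEP == 4 give a limit of 6, so case B executes its body for
	 x == 0 and x == 4, i.e. (NITERS - STEP) / STEP + 1 == 2 times.  */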
820 gimple_seq seq = NULL;
821 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
822 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
823 if (seq)
824 {
825 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
826 gcc_assert (!new_bb);
827 }
828 if (niters_maybe_zero)
829 {
830 /* Case C. */
831 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
832 init = build_all_ones_cst (niters_type);
833 }
834 else
835 {
836 /* Case B. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
838 init = build_zero_cst (niters_type);
839 }
840 }
841
842 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
843 create_iv (init, step, NULL_TREE, loop,
844 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
845 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
846 true, NULL_TREE, true,
847 GSI_SAME_STMT);
848 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
849 true, GSI_SAME_STMT);
850
851 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
852 NULL_TREE);
853
854 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
855
856 /* Record the number of latch iterations. */
857 if (limit == niters)
858 /* Case A: the loop iterates NITERS times. Subtract one to get the
859 latch count. */
860 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
861 build_int_cst (niters_type, 1));
862 else
863 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
864 Subtract one from this to get the latch count. */
865 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
866 limit, step);
867
868 if (final_iv)
869 {
870 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
871 indx_after_incr, init);
872 gsi_insert_on_edge_immediate (single_exit (loop), assign);
873 }
874
875 return cond_stmt;
876 }
877
878 /* If we're using fully-masked loops, make LOOP iterate:
879
880 N == (NITERS - 1) / STEP + 1
881
882 times. When NITERS is zero, this is equivalent to making the loop
883 execute (1 << M) / STEP times, where M is the precision of NITERS.
884 NITERS_MAYBE_ZERO is true if this last case might occur.
885
886 If we're not using fully-masked loops, make LOOP iterate:
887
888 N == (NITERS - STEP) / STEP + 1
889
890 times, where NITERS is known to be outside the range [1, STEP - 1].
891 This is equivalent to making the loop execute NITERS / STEP times
892 when NITERS is nonzero and (1 << M) / STEP times otherwise.
893 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
894
895 If FINAL_IV is nonnull, it is an SSA name that should be set to
896 N * STEP on exit from the loop.
897
898 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
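/* A purely illustrative example of the two formulas: for NITERS == 10 and
   STEP == 4, the fully-masked form iterates (10 - 1) / 4 + 1 == 3 times,
   while the unmasked form iterates (10 - 4) / 4 + 1 == 2 times, matching
   NITERS / STEP.  */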
899
900 void
901 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
902 tree niters, tree step, tree final_iv,
903 bool niters_maybe_zero)
904 {
905 gcond *cond_stmt;
906 gcond *orig_cond = get_loop_exit_condition (loop);
907 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
908
909 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
910 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
911 final_iv, niters_maybe_zero,
912 loop_cond_gsi);
913 else
914 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
915 final_iv, niters_maybe_zero,
916 loop_cond_gsi);
917
918 /* Remove old loop exit test. */
919 stmt_vec_info orig_cond_info;
920 if (loop_vinfo
921 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
922 loop_vinfo->remove_stmt (orig_cond_info);
923 else
924 gsi_remove (&loop_cond_gsi, true);
925
926 if (dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
928 cond_stmt);
929 }
930
931 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
932 For each pair of PHI arguments taken from the FROM and TO edges,
933 ensure that the current_def of the TO->dest PHI argument is set
934 to the current_def of the corresponding FROM->dest argument. */
935
936 static void
937 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
938 {
939 gimple_stmt_iterator gsi_from, gsi_to;
940
941 for (gsi_from = gsi_start_phis (from->dest),
942 gsi_to = gsi_start_phis (to->dest);
943 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
944 {
945 gimple *from_phi = gsi_stmt (gsi_from);
946 gimple *to_phi = gsi_stmt (gsi_to);
947 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
948 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
949 if (virtual_operand_p (from_arg))
950 {
951 gsi_next (&gsi_from);
952 continue;
953 }
954 if (virtual_operand_p (to_arg))
955 {
956 gsi_next (&gsi_to);
957 continue;
958 }
959 if (TREE_CODE (from_arg) != SSA_NAME)
960 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
961 else if (TREE_CODE (to_arg) == SSA_NAME
962 && from_arg != to_arg)
963 {
964 if (get_current_def (to_arg) == NULL_TREE)
965 {
966 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
967 TREE_TYPE (get_current_def
968 (from_arg))));
969 set_current_def (to_arg, get_current_def (from_arg));
970 }
971 }
972 gsi_next (&gsi_from);
973 gsi_next (&gsi_to);
974 }
975
976 gphi *from_phi = get_virtual_phi (from->dest);
977 gphi *to_phi = get_virtual_phi (to->dest);
978 if (from_phi)
979 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
980 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
981 }
982
983
984 /* Given LOOP this function generates a new copy of it and puts it
985 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
986 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
987 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
988 entry or exit of LOOP. */
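/* Informally: when E is LOOP's single exit edge the copy is placed after
   LOOP (e.g. for an epilogue copy), and when E is the preheader edge the
   copy is placed before LOOP (e.g. for a prologue copy).  */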
989
990 class loop *
991 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
992 class loop *scalar_loop, edge e)
993 {
994 class loop *new_loop;
995 basic_block *new_bbs, *bbs, *pbbs;
996 bool at_exit;
997 bool was_imm_dom;
998 basic_block exit_dest;
999 edge exit, new_exit;
1000 bool duplicate_outer_loop = false;
1001
1002 exit = single_exit (loop);
1003 at_exit = (e == exit);
1004 if (!at_exit && e != loop_preheader_edge (loop))
1005 return NULL;
1006
1007 if (scalar_loop == NULL)
1008 scalar_loop = loop;
1009
1010 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1011 pbbs = bbs + 1;
1012 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1013 /* Allow duplication of outer loops. */
1014 if (scalar_loop->inner)
1015 duplicate_outer_loop = true;
1016 /* Check whether duplication is possible. */
1017 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1018 {
1019 free (bbs);
1020 return NULL;
1021 }
1022
1023 /* Generate new loop structure. */
1024 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1025 duplicate_subloops (scalar_loop, new_loop);
1026
1027 exit_dest = exit->dest;
1028 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1029 exit_dest) == loop->header ?
1030 true : false);
1031
1032 /* Also copy the pre-header, this avoids jumping through hoops to
1033 duplicate the loop entry PHI arguments. Create an empty
1034 pre-header unconditionally for this. */
1035 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1036 edge entry_e = single_pred_edge (preheader);
1037 bbs[0] = preheader;
1038 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1039
1040 exit = single_exit (scalar_loop);
1041 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1042 &exit, 1, &new_exit, NULL,
1043 at_exit ? loop->latch : e->src, true);
1044 exit = single_exit (loop);
1045 basic_block new_preheader = new_bbs[0];
1046
1047 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1048
1049 if (scalar_loop != loop)
1050 {
1051 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1052 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1053 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1054 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1055 header) to have current_def set, so copy them over. */
1056 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1057 exit);
1058 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1059 0),
1060 EDGE_SUCC (loop->latch, 0));
1061 }
1062
1063 if (at_exit) /* Add the loop copy at exit. */
1064 {
1065 if (scalar_loop != loop)
1066 {
1067 gphi_iterator gsi;
1068 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1069
1070 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1071 gsi_next (&gsi))
1072 {
1073 gphi *phi = gsi.phi ();
1074 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1075 location_t orig_locus
1076 = gimple_phi_arg_location_from_edge (phi, e);
1077
1078 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1079 }
1080 }
1081 redirect_edge_and_branch_force (e, new_preheader);
1082 flush_pending_stmts (e);
1083 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1084 if (was_imm_dom || duplicate_outer_loop)
1085 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1086
1087 /* And remove the unnecessary forwarder again. Keep the other
1088 one so we have a proper pre-header for the loop at the exit edge. */
1089 redirect_edge_pred (single_succ_edge (preheader),
1090 single_pred (preheader));
1091 delete_basic_block (preheader);
1092 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1093 loop_preheader_edge (scalar_loop)->src);
1094 }
1095 else /* Add the copy at entry. */
1096 {
1097 if (scalar_loop != loop)
1098 {
1099 /* Remove the unnecessary forwarder of scalar_loop again. */
1100 redirect_edge_pred (single_succ_edge (preheader),
1101 single_pred (preheader));
1102 delete_basic_block (preheader);
1103 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104 loop_preheader_edge (scalar_loop)->src);
1105 preheader = split_edge (loop_preheader_edge (loop));
1106 entry_e = single_pred_edge (preheader);
1107 }
1108
1109 redirect_edge_and_branch_force (entry_e, new_preheader);
1110 flush_pending_stmts (entry_e);
1111 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1112
1113 redirect_edge_and_branch_force (new_exit, preheader);
1114 flush_pending_stmts (new_exit);
1115 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1116
1117 /* And remove the unnecessary forwarder again. Keep the other
1118 one so we have a proper pre-header for the loop at the exit edge. */
1119 redirect_edge_pred (single_succ_edge (new_preheader),
1120 single_pred (new_preheader));
1121 delete_basic_block (new_preheader);
1122 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1123 loop_preheader_edge (new_loop)->src);
1124 }
1125
1126 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1127 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1128 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1129
1130 if (scalar_loop != loop)
1131 {
1132 /* Update new_loop->header PHIs, so that on the preheader
1133 edge they are the ones from loop rather than scalar_loop. */
1134 gphi_iterator gsi_orig, gsi_new;
1135 edge orig_e = loop_preheader_edge (loop);
1136 edge new_e = loop_preheader_edge (new_loop);
1137
1138 for (gsi_orig = gsi_start_phis (loop->header),
1139 gsi_new = gsi_start_phis (new_loop->header);
1140 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1141 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1142 {
1143 gphi *orig_phi = gsi_orig.phi ();
1144 gphi *new_phi = gsi_new.phi ();
1145 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1146 location_t orig_locus
1147 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1148
1149 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1150 }
1151 }
1152
1153 free (new_bbs);
1154 free (bbs);
1155
1156 checking_verify_dominators (CDI_DOMINATORS);
1157
1158 return new_loop;
1159 }
1160
1161
1162 /* Given the condition expression COND, put it as the last statement of
1163 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1164 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1165 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1166 new edge as irreducible if IRREDUCIBLE_P is true. */
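/* A rough sketch of the result: GUARD_BB ends in "if (COND)"; the new
   true edge (with probability PROBABILITY) branches to GUARD_TO, skipping
   the guarded code, while GUARD_BB's original successor edge becomes the
   false edge and keeps the inverted probability.  */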
1167
1168 static edge
1169 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1170 basic_block guard_to, basic_block dom_bb,
1171 profile_probability probability, bool irreducible_p)
1172 {
1173 gimple_stmt_iterator gsi;
1174 edge new_e, enter_e;
1175 gcond *cond_stmt;
1176 gimple_seq gimplify_stmt_list = NULL;
1177
1178 enter_e = EDGE_SUCC (guard_bb, 0);
1179 enter_e->flags &= ~EDGE_FALLTHRU;
1180 enter_e->flags |= EDGE_FALSE_VALUE;
1181 gsi = gsi_last_bb (guard_bb);
1182
1183 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1184 NULL_TREE);
1185 if (gimplify_stmt_list)
1186 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1187
1188 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1189 gsi = gsi_last_bb (guard_bb);
1190 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1191
1192 /* Add new edge to connect guard block to the merge/loop-exit block. */
1193 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1194
1195 new_e->probability = probability;
1196 if (irreducible_p)
1197 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1198
1199 enter_e->probability = probability.invert ();
1200 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1201
1202 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1203 if (enter_e->dest->loop_father->header == enter_e->dest)
1204 split_edge (enter_e);
1205
1206 return new_e;
1207 }
1208
1209
1210 /* This function verifies that the following restrictions apply to LOOP:
1211 (1) it consists of exactly 2 basic blocks (a header and an empty latch)
1212 if it is an innermost loop, or 5 basic blocks if it is an outer loop;
1213 (2) it has a single entry and a single exit;
1214 (3) its exit condition is the last stmt in the header;
1215 (4) E is the entry or exit edge of LOOP.
1216 */
1217
1218 bool
1219 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1220 {
1221 edge exit_e = single_exit (loop);
1222 edge entry_e = loop_preheader_edge (loop);
1223 gcond *orig_cond = get_loop_exit_condition (loop);
1224 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1225 unsigned int num_bb = loop->inner? 5 : 2;
1226
1227 /* All loops have an outer scope; the only case in which loop_outer ()
1228 is NULL is for the function itself. */
1229 if (!loop_outer (loop)
1230 || loop->num_nodes != num_bb
1231 || !empty_block_p (loop->latch)
1232 || !single_exit (loop)
1233 /* Verify that new loop exit condition can be trivially modified. */
1234 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1235 || (e != exit_e && e != entry_e))
1236 return false;
1237
1238 return true;
1239 }
1240
1241 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1242 in the exit bb and rename all the uses after the loop. This simplifies
1243 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1244 (but normally loop closed SSA form doesn't require virtual PHIs to be
1245 in the same form). Doing this early simplifies checking which
1246 uses should be renamed. */
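/* An illustrative sketch (SSA names hypothetical): if the loop header has
   .MEM_1 = PHI <.MEM_0 (preheader), .MEM_2 (latch)>, this creates
   .MEM_3 = PHI <.MEM_2 (exit edge)> in the exit block and rewrites uses of
   .MEM_2 that occur after the loop to use .MEM_3 instead.  */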
1247
1248 static void
1249 create_lcssa_for_virtual_phi (class loop *loop)
1250 {
1251 gphi_iterator gsi;
1252 edge exit_e = single_exit (loop);
1253
1254 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1255 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1256 {
1257 gphi *phi = gsi.phi ();
1258 for (gsi = gsi_start_phis (exit_e->dest);
1259 !gsi_end_p (gsi); gsi_next (&gsi))
1260 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1261 break;
1262 if (gsi_end_p (gsi))
1263 {
1264 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1265 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1266 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1267 imm_use_iterator imm_iter;
1268 gimple *stmt;
1269 use_operand_p use_p;
1270
1271 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1272 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1273 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1274 gimple_phi_set_result (new_phi, new_vop);
1275 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1276 if (stmt != new_phi
1277 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1278 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1279 SET_USE (use_p, new_vop);
1280 }
1281 break;
1282 }
1283
1284 }
1285
1286 /* Function vect_get_loop_location.
1287
1288 Extract the location of the loop in the source code.
1289 If the loop is not well formed for vectorization, an estimated
1290 location is calculated.
1291 Return the loop location on success and an unknown location otherwise. */
1292
1293 dump_user_location_t
1294 find_loop_location (class loop *loop)
1295 {
1296 gimple *stmt = NULL;
1297 basic_block bb;
1298 gimple_stmt_iterator si;
1299
1300 if (!loop)
1301 return dump_user_location_t ();
1302
1303 stmt = get_loop_exit_condition (loop);
1304
1305 if (stmt
1306 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1307 return stmt;
1308
1309 /* If we got here the loop is probably not "well formed";
1310 try to estimate the loop location. */
1311
1312 if (!loop->header)
1313 return dump_user_location_t ();
1314
1315 bb = loop->header;
1316
1317 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1318 {
1319 stmt = gsi_stmt (si);
1320 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1321 return stmt;
1322 }
1323
1324 return dump_user_location_t ();
1325 }
1326
1327 /* Return true if the phi described by STMT_INFO defines an IV of the
1328 loop to be vectorized. */
1329
1330 static bool
1331 iv_phi_p (stmt_vec_info stmt_info)
1332 {
1333 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1334 if (virtual_operand_p (PHI_RESULT (phi)))
1335 return false;
1336
1337 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1338 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1339 return false;
1340
1341 return true;
1342 }
1343
1344 /* Function vect_can_advance_ivs_p
1345
1346 In case the number of iterations that LOOP iterates is unknown at compile
1347 time, an epilog loop will be generated, and the loop induction variables
1348 (IVs) will be "advanced" to the value they are supposed to take just before
1349 the epilog loop. Here we check that the access function of the loop IVs
1350 and the expression that represents the loop bound are simple enough.
1351 These restrictions will be relaxed in the future. */
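/* For example, a simple IV such as i_1 = PHI <0, i_2> with i_2 = i_1 + 4
   passes these checks, whereas an IV whose step is itself variant in the
   loop, or whose evolution is a chrec of degree >= 2, is rejected.  */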
1352
1353 bool
1354 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1355 {
1356 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1357 basic_block bb = loop->header;
1358 gphi_iterator gsi;
1359
1360 /* Analyze phi functions of the loop header. */
1361
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1364 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1365 {
1366 tree evolution_part;
1367
1368 gphi *phi = gsi.phi ();
1369 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1372 phi_info->stmt);
1373
1374 /* Skip virtual phi's. The data dependences that are associated with
1375 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1376
1377 Skip reduction phis. */
1378 if (!iv_phi_p (phi_info))
1379 {
1380 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_NOTE, vect_location,
1382 "reduc or virtual phi. skip.\n");
1383 continue;
1384 }
1385
1386 /* Analyze the evolution function. */
1387
1388 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1389 if (evolution_part == NULL_TREE)
1390 {
1391 if (dump_enabled_p ())
1392 dump_printf (MSG_MISSED_OPTIMIZATION,
1393 "No access function or evolution.\n");
1394 return false;
1395 }
1396
1397 /* FORNOW: We do not transform initial conditions of IVs
1398 whose evolution functions are not invariant in the loop. */
1399
1400 if (!expr_invariant_in_loop_p (loop, evolution_part))
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1404 "evolution not invariant in loop.\n");
1405 return false;
1406 }
1407
1408 /* FORNOW: We do not transform initial conditions of IVs
1409 whose evolution functions are a polynomial of degree >= 2. */
1410
1411 if (tree_is_chrec (evolution_part))
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution is chrec.\n");
1416 return false;
1417 }
1418 }
1419
1420 return true;
1421 }
1422
1423
1424 /* Function vect_update_ivs_after_vectorizer.
1425
1426 "Advance" the induction variables of LOOP to the value they should take
1427 after the execution of LOOP. This is currently necessary because the
1428 vectorizer does not handle induction variables that are used after the
1429 loop. Such a situation occurs when the last iterations of LOOP are
1430 peeled, because:
1431 1. We introduced new uses after LOOP for IVs that were not originally used
1432 after LOOP: the IVs of LOOP are now used by an epilog loop.
1433 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1434 times, whereas the loop IVs should be bumped N times.
1435
1436 Input:
1437 - LOOP - a loop that is going to be vectorized. The last few iterations
1438 of LOOP were peeled.
1439 - NITERS - the number of iterations that LOOP executes (before it is
1440 vectorized). i.e, the number of times the ivs should be bumped.
1441 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1442 coming out from LOOP on which there are uses of the LOOP ivs
1443 (this is the path from LOOP->exit to epilog_loop->preheader).
1444
1445 The new definitions of the ivs are placed in LOOP->exit.
1446 The phi args associated with the edge UPDATE_E in the bb
1447 UPDATE_E->dest are updated accordingly.
1448
1449 Assumption 1: Like the rest of the vectorizer, this function assumes
1450 a single loop exit that has a single predecessor.
1451
1452 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1453 organized in the same order.
1454
1455 Assumption 3: The access function of the ivs is simple enough (see
1456 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1457
1458 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1459 coming out of LOOP on which the ivs of LOOP are used (this is the path
1460 that leads to the epilog loop; other paths skip the epilog loop). This
1461 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1462 needs to have its phis updated.
1463 */
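/* A hedged sketch of the update: for an IV with initial value INIT and
   step STEP, the new definition inserted in the exit block is simply
   ni = INIT + NITERS * STEP (a POINTER_PLUS_EXPR for pointer IVs), and
   the PHI argument on UPDATE_E in UPDATE_E->dest is replaced by ni.  */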
1464
1465 static void
1466 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1467 tree niters, edge update_e)
1468 {
1469 gphi_iterator gsi, gsi1;
1470 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1471 basic_block update_bb = update_e->dest;
1472 basic_block exit_bb = single_exit (loop)->dest;
1473
1474 /* Make sure there exists a single-predecessor exit bb: */
1475 gcc_assert (single_pred_p (exit_bb));
1476 gcc_assert (single_succ_edge (exit_bb) == update_e);
1477
1478 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1479 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1480 gsi_next (&gsi), gsi_next (&gsi1))
1481 {
1482 tree init_expr;
1483 tree step_expr, off;
1484 tree type;
1485 tree var, ni, ni_name;
1486 gimple_stmt_iterator last_gsi;
1487
1488 gphi *phi = gsi.phi ();
1489 gphi *phi1 = gsi1.phi ();
1490 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_NOTE, vect_location,
1493 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1494
1495 /* Skip reduction and virtual phis. */
1496 if (!iv_phi_p (phi_info))
1497 {
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "reduc or virtual phi. skip.\n");
1501 continue;
1502 }
1503
1504 type = TREE_TYPE (gimple_phi_result (phi));
1505 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1506 step_expr = unshare_expr (step_expr);
1507
1508 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1509 of degree >= 2 or exponential. */
1510 gcc_assert (!tree_is_chrec (step_expr));
1511
1512 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1513
1514 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1515 fold_convert (TREE_TYPE (step_expr), niters),
1516 step_expr);
1517 if (POINTER_TYPE_P (type))
1518 ni = fold_build_pointer_plus (init_expr, off);
1519 else
1520 ni = fold_build2 (PLUS_EXPR, type,
1521 init_expr, fold_convert (type, off));
1522
1523 var = create_tmp_var (type, "tmp");
1524
1525 last_gsi = gsi_last_bb (exit_bb);
1526 gimple_seq new_stmts = NULL;
1527 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1528 /* Exit_bb shouldn't be empty. */
1529 if (!gsi_end_p (last_gsi))
1530 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1531 else
1532 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1533
1534 /* Fix phi expressions in the successor bb. */
1535 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1536 }
1537 }
1538
1539 /* Return a gimple value containing the misalignment (measured in vector
1540 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1541 it is away from a perfectly aligned address. Add any new statements
1542 to SEQ. */
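/* For instance (numbers illustrative only): with a 16-byte target
   alignment, 4-byte elements and a start address ending in 0xc,
   misalign_in_bytes is addr & 15 == 12 and the result is 12 >> 2 == 3
   elements.  */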
1543
1544 static tree
1545 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1546 {
1547 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1548 stmt_vec_info stmt_info = dr_info->stmt;
1549 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1550
1551 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1552 unsigned HOST_WIDE_INT target_align_c;
1553 tree target_align_minus_1;
1554
1555 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1556 size_zero_node) < 0;
1557 tree offset = (negative
1558 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1559 : size_zero_node);
1560 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1561 offset);
1562 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1563 if (target_align.is_constant (&target_align_c))
1564 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1565 else
1566 {
1567 tree vla = build_int_cst (type, target_align);
1568 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1569 fold_build2 (MINUS_EXPR, type,
1570 build_int_cst (type, 0), vla));
1571 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1572 build_int_cst (type, 1));
1573 }
1574
1575 HOST_WIDE_INT elem_size
1576 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1577 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1578
1579 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1580 tree int_start_addr = fold_convert (type, start_addr);
1581 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1582 target_align_minus_1);
1583
1584 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1585 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1586 elem_size_log);
1587
1588 return misalign_in_elems;
1589 }
1590
1591 /* Function vect_gen_prolog_loop_niters
1592
1593 Generate the number of iterations which should be peeled as prolog for the
1594 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1595 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1596 As a result, after the execution of this loop, the data reference DR will
1597 refer to an aligned location. The following computation is generated:
1598
1599 If the misalignment of DR is known at compile time:
1600 addr_mis = int mis = DR_MISALIGNMENT (dr);
1601 Else, compute address misalignment in bytes:
1602 addr_mis = addr & (target_align - 1)
1603
1604 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1605
1606 (elem_size = element type size; an element is the scalar element whose type
1607 is the inner type of the vectype)
1608
1609 The computations will be emitted at the end of BB. We also compute and
1610 store upper bound (included) of the result in BOUND.
1611
1612 When the step of the data-ref in the loop is not 1 (as in interleaved data
1613 and SLP), the number of iterations of the prolog must be divided by the step
1614 (which is equal to the size of interleaved group).
1615
1616 The above formulas assume that VF == number of elements in the vector. This
1617 may not hold when there are multiple-types in the loop.
1618 In this case, for some data-references in the loop the VF does not represent
1619 the number of elements that fit in the vector. Therefore, instead of VF we
1620 use TYPE_VECTOR_SUBPARTS. */
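/* A worked example (illustrative only): with a 16-byte target alignment
   and 4-byte elements, a misalignment of 3 elements gives
   prolog_niters = (4 - 3) & (4 - 1) == 1, after which DR is aligned.  */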
1621
1622 static tree
1623 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1624 basic_block bb, int *bound)
1625 {
1626 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1627 tree var;
1628 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1629 gimple_seq stmts = NULL, new_stmts = NULL;
1630 tree iters, iters_name;
1631 stmt_vec_info stmt_info = dr_info->stmt;
1632 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1633 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1634
1635 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1636 {
1637 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1638
1639 if (dump_enabled_p ())
1640 dump_printf_loc (MSG_NOTE, vect_location,
1641 "known peeling = %d.\n", npeel);
1642
1643 iters = build_int_cst (niters_type, npeel);
1644 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1645 }
1646 else
1647 {
1648 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1649 tree type = TREE_TYPE (misalign_in_elems);
1650 HOST_WIDE_INT elem_size
1651 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1652 /* We only do prolog peeling if the target alignment is known at compile
1653 time. */
1654 poly_uint64 align_in_elems =
1655 exact_div (target_align, elem_size);
1656 tree align_in_elems_minus_1 =
1657 build_int_cst (type, align_in_elems - 1);
1658 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1659
1660 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1661 & (align_in_elems - 1)). */
1662 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1663 size_zero_node) < 0;
1664 if (negative)
1665 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1666 align_in_elems_tree);
1667 else
1668 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1669 misalign_in_elems);
1670 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1671 iters = fold_convert (niters_type, iters);
1672 unsigned HOST_WIDE_INT align_in_elems_c;
1673 if (align_in_elems.is_constant (&align_in_elems_c))
1674 *bound = align_in_elems_c - 1;
1675 else
1676 *bound = -1;
1677 }
1678
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "niters for prolog loop: %T\n", iters);
1682
1683 var = create_tmp_var (niters_type, "prolog_loop_niters");
1684 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1685
1686 if (new_stmts)
1687 gimple_seq_add_seq (&stmts, new_stmts);
1688 if (stmts)
1689 {
1690 gcc_assert (single_succ_p (bb));
1691 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1692 if (gsi_end_p (gsi))
1693 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1694 else
1695 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1696 }
1697 return iters_name;
1698 }
1699
1700
1701 /* Function vect_update_init_of_dr
1702
1703 If CODE is PLUS, the vector loop starts NITERS iterations after the
1704 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1705 iterations before the scalar one (using masking to skip inactive
1706 elements). This function updates the information recorded in DR to
1707 account for the difference. Specifically, it updates the OFFSET
1708 field of DR. */
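
/* For instance (an illustrative addition): peeling NITERS = 3 iterations of
 a data reference with DR_STEP = 4 bytes under CODE = PLUS_EXPR adds
 3 * 4 = 12 to DR_OFFSET, so the vector loop's first access starts 12 bytes
 further into the object. */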
1709
1710 static void
1711 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1712 {
1713 tree offset = DR_OFFSET (dr);
1714
1715 niters = fold_build2 (MULT_EXPR, sizetype,
1716 fold_convert (sizetype, niters),
1717 fold_convert (sizetype, DR_STEP (dr)));
1718 offset = fold_build2 (code, sizetype,
1719 fold_convert (sizetype, offset), niters);
1720 DR_OFFSET (dr) = offset;
1721 }
1722
1723
1724 /* Function vect_update_inits_of_drs
1725
1726 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1727 CODE and NITERS are as for vect_update_inits_of_dr. */
1728
1729 void
1730 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1731 tree_code code)
1732 {
1733 unsigned int i;
1734 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1735 struct data_reference *dr;
1736
1737 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1738
1739 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1740 here, but since we might use these niters to update the epilogue's niters
1741 and data references, we can't insert them here as this definition might not
1742 always dominate its uses. */
1743 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1744 niters = fold_convert (sizetype, niters);
1745
1746 FOR_EACH_VEC_ELT (datarefs, i, dr)
1747 {
1748 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1749 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1750 vect_update_init_of_dr (dr, niters, code);
1751 }
1752 }
1753
1754 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1755 by masking. This involves calculating the number of iterations to
1756 be peeled and then aligning all memory references appropriately. */
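
/* Roughly, as an illustration (not from the original sources): with VF = 8
 and PEELING_FOR_ALIGNMENT = 3, MASK_SKIP_NITERS becomes 8 - 3 = 5, so the
 first 5 lanes of the first masked iteration are inactive and every data
 reference offset is moved back by 5 scalar iterations to compensate. */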
1757
1758 void
1759 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1760 {
1761 tree misalign_in_elems;
1762 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1763
1764 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1765
1766 /* From the information recorded in LOOP_VINFO get the number of iterations
1767 that need to be skipped via masking. */
1768 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1769 {
1770 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1771 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1772 misalign_in_elems = build_int_cst (type, misalign);
1773 }
1774 else
1775 {
1776 gimple_seq seq1 = NULL, seq2 = NULL;
1777 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1778 misalign_in_elems = fold_convert (type, misalign_in_elems);
1779 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1780 &seq2, true, NULL_TREE);
1781 gimple_seq_add_seq (&seq1, seq2);
1782 if (seq1)
1783 {
1784 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1785 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1786 gcc_assert (!new_bb);
1787 }
1788 }
1789
1790 if (dump_enabled_p ())
1791 dump_printf_loc (MSG_NOTE, vect_location,
1792 "misalignment for fully-masked loop: %T\n",
1793 misalign_in_elems);
1794
1795 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1796
1797 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1798 }
1799
1800 /* This function builds ni_name = number of iterations. Statements
1801 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1802 it to TRUE if a new ssa_var is generated. */
1803
1804 tree
1805 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1806 {
1807 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1808 if (TREE_CODE (ni) == INTEGER_CST)
1809 return ni;
1810 else
1811 {
1812 tree ni_name, var;
1813 gimple_seq stmts = NULL;
1814 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1815
1816 var = create_tmp_var (TREE_TYPE (ni), "niters");
1817 ni_name = force_gimple_operand (ni, &stmts, false, var);
1818 if (stmts)
1819 {
1820 gsi_insert_seq_on_edge_immediate (pe, stmts);
1821 if (new_var_p != NULL)
1822 *new_var_p = true;
1823 }
1824
1825 return ni_name;
1826 }
1827 }
1828
1829 /* Calculate the number of iterations above which the vectorized loop will be
1830 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1831 of the prolog loop. If it is an integer constant, the value is also passed
1832 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1833 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1834 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1835 threshold below which the scalar (rather than vectorized) loop will be
1836 executed. This function stores the upper bound (inclusive) of the result
1837 in BOUND_SCALAR. */
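
/* An illustrative example (an addition): with a compile-time prolog of
 INT_NITERS_PROLOG = 3, BOUND_EPILOG = 7 and a profitability threshold
 TH = 20, the scalar loop can run at most TH - 1 = 19 iterations, which
 exceeds 3 + 7, so *BOUND_SCALAR is set to 19 and the constant 19 is
 returned; without CHECK_PROFITABILITY the result is simply
 NITERS_PROLOG + BOUND_EPILOG. */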
1838
1839 static tree
1840 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1841 int bound_prolog, poly_int64 bound_epilog, int th,
1842 poly_uint64 *bound_scalar,
1843 bool check_profitability)
1844 {
1845 tree type = TREE_TYPE (niters_prolog);
1846 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1847 build_int_cst (type, bound_epilog));
1848
1849 *bound_scalar = bound_prolog + bound_epilog;
1850 if (check_profitability)
1851 {
1852 /* TH indicates the minimum niters of the vectorized loop, while we
1853 compute the maximum niters of the scalar loop. */
1854 th--;
1855 /* Peeling for constant times. */
1856 if (int_niters_prolog >= 0)
1857 {
1858 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1859 return build_int_cst (type, *bound_scalar);
1860 }
1861 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1862 and BOUND_EPILOG are inclusive upper bounds. */
1863 if (known_ge (th, bound_prolog + bound_epilog))
1864 {
1865 *bound_scalar = th;
1866 return build_int_cst (type, th);
1867 }
1868 /* Need to do runtime comparison. */
1869 else if (maybe_gt (th, bound_epilog))
1870 {
1871 *bound_scalar = upper_bound (*bound_scalar, th);
1872 return fold_build2 (MAX_EXPR, type,
1873 build_int_cst (type, th), niters);
1874 }
1875 }
1876 return niters;
1877 }
1878
1879 /* NITERS is the number of times that the original scalar loop executes
1880 after peeling. Work out the maximum number of iterations N that can
1881 be handled by the vectorized form of the loop and then either:
1882
1883 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1884
1885 niters_vector = N
1886
1887 b) set *STEP_VECTOR_PTR to one and generate:
1888
1889 niters_vector = N / vf
1890
1891 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1892 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1893 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
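
/* A small worked example (an addition, not from the original comment): for
 NITERS = 17 and a constant VF = 4 without full masking, case b) applies;
 if NITERS cannot overflow the code emits niters_vector = 17 >> 2 = 4,
 otherwise it emits ((17 - 4) >> 2) + 1 = 4, and *STEP_VECTOR_PTR is set
 to one. */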
1894
1895 void
1896 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1897 tree *niters_vector_ptr, tree *step_vector_ptr,
1898 bool niters_no_overflow)
1899 {
1900 tree ni_minus_gap, var;
1901 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1902 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1903 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1904 tree log_vf = NULL_TREE;
1905
1906 /* If an epilogue loop is required because of data accesses with gaps, we
1907 subtract one iteration from the total number of iterations here for
1908 the correct calculation of RATIO. */
1909 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1910 {
1911 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1912 build_one_cst (type));
1913 if (!is_gimple_val (ni_minus_gap))
1914 {
1915 var = create_tmp_var (type, "ni_gap");
1916 gimple *stmts = NULL;
1917 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1918 true, var);
1919 gsi_insert_seq_on_edge_immediate (pe, stmts);
1920 }
1921 }
1922 else
1923 ni_minus_gap = niters;
1924
1925 unsigned HOST_WIDE_INT const_vf;
1926 if (vf.is_constant (&const_vf)
1927 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1928 {
1929 /* Create: niters >> log2(vf) */
1930 /* If it's known that niters == number of latch executions + 1 doesn't
1931 overflow, we can generate niters >> log2(vf); otherwise we generate
1932 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1933 will be at least one. */
1934 log_vf = build_int_cst (type, exact_log2 (const_vf));
1935 if (niters_no_overflow)
1936 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1937 else
1938 niters_vector
1939 = fold_build2 (PLUS_EXPR, type,
1940 fold_build2 (RSHIFT_EXPR, type,
1941 fold_build2 (MINUS_EXPR, type,
1942 ni_minus_gap,
1943 build_int_cst (type, vf)),
1944 log_vf),
1945 build_int_cst (type, 1));
1946 step_vector = build_one_cst (type);
1947 }
1948 else
1949 {
1950 niters_vector = ni_minus_gap;
1951 step_vector = build_int_cst (type, vf);
1952 }
1953
1954 if (!is_gimple_val (niters_vector))
1955 {
1956 var = create_tmp_var (type, "bnd");
1957 gimple_seq stmts = NULL;
1958 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1959 gsi_insert_seq_on_edge_immediate (pe, stmts);
1960 /* The peeling algorithm guarantees that the vector loop bound is at least
1961 ONE; we set range information to make the niters analyzer's life easier. */
1962 if (stmts != NULL && log_vf)
1963 set_range_info (niters_vector, VR_RANGE,
1964 wi::to_wide (build_int_cst (type, 1)),
1965 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1966 TYPE_MAX_VALUE (type),
1967 log_vf)));
1968 }
1969 *niters_vector_ptr = niters_vector;
1970 *step_vector_ptr = step_vector;
1971
1972 return;
1973 }
1974
1975 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1976 loop specified by LOOP_VINFO after vectorization, compute the number
1977 of iterations before vectorization (niters_vector * vf) and store it
1978 to NITERS_VECTOR_MULT_VF_PTR. */
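
/* E.g. (illustrative): with niters_vector = 4 and a constant VF = 8, this
 computes niters_vector_mult_vf = 4 << 3 = 32 and, if needed, emits the
 computation at the start of the loop's single exit block. */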
1979
1980 static void
1981 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1982 tree niters_vector,
1983 tree *niters_vector_mult_vf_ptr)
1984 {
1985 /* We should be using a step_vector of VF if VF is variable. */
1986 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1987 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1988 tree type = TREE_TYPE (niters_vector);
1989 tree log_vf = build_int_cst (type, exact_log2 (vf));
1990 basic_block exit_bb = single_exit (loop)->dest;
1991
1992 gcc_assert (niters_vector_mult_vf_ptr != NULL);
1993 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
1994 niters_vector, log_vf);
1995 if (!is_gimple_val (niters_vector_mult_vf))
1996 {
1997 tree var = create_tmp_var (type, "niters_vector_mult_vf");
1998 gimple_seq stmts = NULL;
1999 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2000 &stmts, true, var);
2001 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2002 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2003 }
2004 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2005 }
2006
2007 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2008 this function searches for the corresponding lcssa phi node in exit
2009 bb of LOOP. If it is found, return the phi result; otherwise return
2010 NULL. */
2011
2012 static tree
2013 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2014 gphi *lcssa_phi)
2015 {
2016 gphi_iterator gsi;
2017 edge e = single_exit (loop);
2018
2019 gcc_assert (single_pred_p (e->dest));
2020 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2021 {
2022 gphi *phi = gsi.phi ();
2023 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2024 PHI_ARG_DEF (lcssa_phi, 0), 0))
2025 return PHI_RESULT (phi);
2026 }
2027 return NULL_TREE;
2028 }
2029
2030 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2031 from SECOND/FIRST and puts it at the original loop's preheader/exit
2032 edge; the two loops are arranged as below:
2033
2034 preheader_a:
2035 first_loop:
2036 header_a:
2037 i_1 = PHI<i_0, i_2>;
2038 ...
2039 i_2 = i_1 + 1;
2040 if (cond_a)
2041 goto latch_a;
2042 else
2043 goto between_bb;
2044 latch_a:
2045 goto header_a;
2046
2047 between_bb:
2048 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2049
2050 second_loop:
2051 header_b:
2052 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2053 or with i_2 if no LCSSA phi is created
2054 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2055 ...
2056 i_4 = i_3 + 1;
2057 if (cond_b)
2058 goto latch_b;
2059 else
2060 goto exit_bb;
2061 latch_b:
2062 goto header_b;
2063
2064 exit_bb:
2065
2066 This function creates loop-closed SSA for the first loop and updates the
2067 second loop's PHI nodes by replacing the argument on the incoming edge with
2068 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2069 is false, loop-closed SSA PHIs will only be created for non-IV PHIs of
2070 the first loop.
2071
2072 This function assumes the exit bb of the first loop is the preheader bb of
2073 the second loop, i.e., between_bb in the example code. With PHIs updated,
2074 the second loop will execute the remaining iterations of the first. */
2075
2076 static void
2077 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2078 class loop *first, class loop *second,
2079 bool create_lcssa_for_iv_phis)
2080 {
2081 gphi_iterator gsi_update, gsi_orig;
2082 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2083
2084 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2085 edge second_preheader_e = loop_preheader_edge (second);
2086 basic_block between_bb = single_exit (first)->dest;
2087
2088 gcc_assert (between_bb == second_preheader_e->src);
2089 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2090 /* Either the first loop or the second is the loop to be vectorized. */
2091 gcc_assert (loop == first || loop == second);
2092
2093 for (gsi_orig = gsi_start_phis (first->header),
2094 gsi_update = gsi_start_phis (second->header);
2095 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2096 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2097 {
2098 gphi *orig_phi = gsi_orig.phi ();
2099 gphi *update_phi = gsi_update.phi ();
2100
2101 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2102 /* Generate lcssa PHI node for the first loop. */
2103 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2104 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2105 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2106 {
2107 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2108 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2109 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2110 arg = new_res;
2111 }
2112
2113 /* Update PHI node in the second loop by replacing arg on the loop's
2114 incoming edge. */
2115 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2116 }
2117
2118 /* For epilogue peeling we have to make sure to copy all LC PHIs
2119 for correct vectorization of live stmts. */
2120 if (loop == first)
2121 {
2122 basic_block orig_exit = single_exit (second)->dest;
2123 for (gsi_orig = gsi_start_phis (orig_exit);
2124 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2125 {
2126 gphi *orig_phi = gsi_orig.phi ();
2127 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2128 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2129 continue;
2130
2131 /* Already created in the above loop. */
2132 if (find_guard_arg (first, second, orig_phi))
2133 continue;
2134
2135 tree new_res = copy_ssa_name (orig_arg);
2136 gphi *lcphi = create_phi_node (new_res, between_bb);
2137 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2138 }
2139 }
2140 }
2141
2142 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2143 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2144 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2145 appear like below:
2146
2147 guard_bb:
2148 if (cond)
2149 goto merge_bb;
2150 else
2151 goto skip_loop;
2152
2153 skip_loop:
2154 header_a:
2155 i_1 = PHI<i_0, i_2>;
2156 ...
2157 i_2 = i_1 + 1;
2158 if (cond_a)
2159 goto latch_a;
2160 else
2161 goto exit_a;
2162 latch_a:
2163 goto header_a;
2164
2165 exit_a:
2166 i_5 = PHI<i_2>;
2167
2168 merge_bb:
2169 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2170
2171 update_loop:
2172 header_b:
2173 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2174 ...
2175 i_4 = i_3 + 1;
2176 if (cond_b)
2177 goto latch_b;
2178 else
2179 goto exit_bb;
2180 latch_b:
2181 goto header_b;
2182
2183 exit_bb:
2184
2185 This function creates PHI nodes at merge_bb and replaces the use of i_5
2186 in the update_loop's PHI node with the newly created PHI result. */
2187
2188 static void
2189 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2190 class loop *update_loop,
2191 edge guard_edge, edge merge_edge)
2192 {
2193 location_t merge_loc, guard_loc;
2194 edge orig_e = loop_preheader_edge (skip_loop);
2195 edge update_e = loop_preheader_edge (update_loop);
2196 gphi_iterator gsi_orig, gsi_update;
2197
2198 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2199 gsi_update = gsi_start_phis (update_loop->header));
2200 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2201 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2202 {
2203 gphi *orig_phi = gsi_orig.phi ();
2204 gphi *update_phi = gsi_update.phi ();
2205
2206 /* Generate new phi node at merge bb of the guard. */
2207 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2208 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2209
2210 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2211 args in NEW_PHI for these edges. */
2212 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2213 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2214 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2215 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2216 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2217 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2218
2219 /* Update phi in UPDATE_PHI. */
2220 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2221 }
2222 }
2223
2224 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2225 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2226 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2227 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2228 The CFG looks like:
2229
2230 loop:
2231 header_a:
2232 i_1 = PHI<i_0, i_2>;
2233 ...
2234 i_2 = i_1 + 1;
2235 if (cond_a)
2236 goto latch_a;
2237 else
2238 goto exit_a;
2239 latch_a:
2240 goto header_a;
2241
2242 exit_a:
2243
2244 guard_bb:
2245 if (cond)
2246 goto merge_bb;
2247 else
2248 goto epilog_loop;
2249
2250 ;; fall_through_bb
2251
2252 epilog_loop:
2253 header_b:
2254 i_3 = PHI<i_2, i_4>;
2255 ...
2256 i_4 = i_3 + 1;
2257 if (cond_b)
2258 goto latch_b;
2259 else
2260 goto merge_bb;
2261 latch_b:
2262 goto header_b;
2263
2264 merge_bb:
2265 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2266
2267 exit_bb:
2268 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2269
2270 For each name used outside EPILOG (i.e. for each name that has a lcssa
2271 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2272 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2273 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2274 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2275 in exit_bb will also be updated. */
2276
2277 static void
2278 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2279 edge guard_edge, edge merge_edge)
2280 {
2281 gphi_iterator gsi;
2282 basic_block merge_bb = guard_edge->dest;
2283
2284 gcc_assert (single_succ_p (merge_bb));
2285 edge e = single_succ_edge (merge_bb);
2286 basic_block exit_bb = e->dest;
2287 gcc_assert (single_pred_p (exit_bb));
2288 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2289
2290 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2291 {
2292 gphi *update_phi = gsi.phi ();
2293 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2294
2295 tree merge_arg = NULL_TREE;
2296
2297 /* If the old argument is a SSA_NAME use its current_def. */
2298 if (TREE_CODE (old_arg) == SSA_NAME)
2299 merge_arg = get_current_def (old_arg);
2300 /* If it's a constant or doesn't have a current_def, just use the old
2301 argument. */
2302 if (!merge_arg)
2303 merge_arg = old_arg;
2304
2305 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2306 /* If the var is live after loop but not a reduction, we simply
2307 use the old arg. */
2308 if (!guard_arg)
2309 guard_arg = old_arg;
2310
2311 /* Create new phi node in MERGE_BB: */
2312 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2313 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2314
2315 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2316 the two PHI args in merge_phi for these edges. */
2317 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2318 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2319
2320 /* Update the original phi in exit_bb. */
2321 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2322 }
2323 }
2324
2325 /* EPILOG loop is duplicated from the original loop for vectorizing,
2326 the arg of its loop closed ssa PHI needs to be updated. */
2327
2328 static void
2329 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2330 {
2331 gphi_iterator gsi;
2332 basic_block exit_bb = single_exit (epilog)->dest;
2333
2334 gcc_assert (single_pred_p (exit_bb));
2335 edge e = EDGE_PRED (exit_bb, 0);
2336 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2337 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2338 }
2339
2340 /* Function vect_do_peeling.
2341
2342 Input:
2343 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2344
2345 preheader:
2346 LOOP:
2347 header_bb:
2348 loop_body
2349 if (exit_loop_cond) goto exit_bb
2350 else goto header_bb
2351 exit_bb:
2352
2353 - NITERS: The number of iterations of the loop.
2354 - NITERSM1: The number of iterations of the loop's latch.
2355 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2356 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2357 CHECK_PROFITABILITY is true.
2358 Output:
2359 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2360 iterate after vectorization; see vect_set_loop_condition for details.
2361 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2362 should be set to the number of scalar iterations handled by the
2363 vector loop. The SSA name is only used on exit from the loop.
2364
2365 This function peels prolog and epilog from the loop, adds guards skipping
2366 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2367 would look like:
2368
2369 guard_bb_1:
2370 if (prefer_scalar_loop) goto merge_bb_1
2371 else goto guard_bb_2
2372
2373 guard_bb_2:
2374 if (skip_prolog) goto merge_bb_2
2375 else goto prolog_preheader
2376
2377 prolog_preheader:
2378 PROLOG:
2379 prolog_header_bb:
2380 prolog_body
2381 if (exit_prolog_cond) goto prolog_exit_bb
2382 else goto prolog_header_bb
2383 prolog_exit_bb:
2384
2385 merge_bb_2:
2386
2387 vector_preheader:
2388 VECTOR LOOP:
2389 vector_header_bb:
2390 vector_body
2391 if (exit_vector_cond) goto vector_exit_bb
2392 else goto vector_header_bb
2393 vector_exit_bb:
2394
2395 guard_bb_3:
2396 if (skip_epilog) goto merge_bb_3
2397 else goto epilog_preheader
2398
2399 merge_bb_1:
2400
2401 epilog_preheader:
2402 EPILOG:
2403 epilog_header_bb:
2404 epilog_body
2405 if (exit_epilog_cond) goto merge_bb_3
2406 else goto epilog_header_bb
2407
2408 merge_bb_3:
2409
2410 Note this function peels the prolog and epilog, and adds the guards,
2411 only when necessary.
2412 This function returns the epilogue loop if a decision was made to vectorize
2413 it, otherwise NULL.
2414
2415 The analysis resulting in this epilogue loop's loop_vec_info was performed
2416 in the same vect_analyze_loop call as the main loop's. At that time
2417 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2418 vectorization factors than the main loop. This list is stored in the main
2419 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2420 vectorize the epilogue loop for a lower vectorization factor, the
2421 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2422 updated and linked to the epilogue loop. This is later used to vectorize
2423 the epilogue. The reason the loop_vec_info needs updating is that it was
2424 constructed based on the original main loop, and the epilogue loop is a
2425 copy of this loop, so all links pointing to statements in the original loop
2426 need updating. Furthermore, these loop_vec_infos share the
2427 data_reference's records, which will also need to be updated.
2428
2429 TODO: Guard for prefer_scalar_loop should be emitted along with
2430 versioning conditions if loop versioning is needed. */
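
/* An illustrative peeling example (an addition, not in the original
 comment): with NITERS = 100, a prolog of 3 iterations for alignment and
 VF = 8, the prolog executes 3 scalar iterations, the vector loop covers
 floor (97 / 8) * 8 = 96 iterations and the epilog handles the remaining
 1 iteration; guard_bb_2 skips the prolog when its iteration count is zero
 and guard_bb_3 skips the epilog when no iterations remain. */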
2431
2432
2433 class loop *
2434 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2435 tree *niters_vector, tree *step_vector,
2436 tree *niters_vector_mult_vf_var, int th,
2437 bool check_profitability, bool niters_no_overflow,
2438 tree *advance, drs_init_vec &orig_drs_init)
2439 {
2440 edge e, guard_e;
2441 tree type = TREE_TYPE (niters), guard_cond;
2442 basic_block guard_bb, guard_to;
2443 profile_probability prob_prolog, prob_vector, prob_epilog;
2444 int estimated_vf;
2445 int prolog_peeling = 0;
2446 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2447 /* We currently do not support prolog peeling if the target alignment is not
2448 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2449 target alignment being constant. */
2450 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2451 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2452 return NULL;
2453
2454 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2455 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2456
2457 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2458 poly_uint64 bound_epilog = 0;
2459 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2460 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2461 bound_epilog += vf - 1;
2462 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2463 bound_epilog += 1;
2464 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2465 poly_uint64 bound_scalar = bound_epilog;
2466
2467 if (!prolog_peeling && !epilog_peeling)
2468 return NULL;
2469
2470 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2471 estimated_vf = vect_vf_for_cost (loop_vinfo);
2472 if (estimated_vf == 2)
2473 estimated_vf = 3;
2474 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2475 .apply_scale (estimated_vf - 1, estimated_vf);
2476
2477 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2478 class loop *first_loop = loop;
2479 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2480 create_lcssa_for_virtual_phi (loop);
2481 update_ssa (TODO_update_ssa_only_virtuals);
2482
2483 if (MAY_HAVE_DEBUG_BIND_STMTS)
2484 {
2485 gcc_assert (!adjust_vec.exists ());
2486 adjust_vec.create (32);
2487 }
2488 initialize_original_copy_tables ();
2489
2490 /* Record the anchor bb at which the guard should be placed if the scalar
2491 loop might be preferred. */
2492 basic_block anchor = loop_preheader_edge (loop)->src;
2493
2494 /* Generate the number of iterations for the prolog loop. We do this here
2495 so that we can also get the upper bound on the number of iterations. */
2496 tree niters_prolog;
2497 int bound_prolog = 0;
2498 if (prolog_peeling)
2499 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2500 &bound_prolog);
2501 else
2502 niters_prolog = build_int_cst (type, 0);
2503
2504 loop_vec_info epilogue_vinfo = NULL;
2505 if (vect_epilogues)
2506 {
2507 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2508 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2509 }
2510
2511 tree niters_vector_mult_vf = NULL_TREE;
2512 /* Save NITERS before the loop, as it may be changed by the prologue. */
2513 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2514 edge update_e = NULL, skip_e = NULL;
2515 unsigned int lowest_vf = constant_lower_bound (vf);
2516 /* If we know the number of scalar iterations for the main loop we should
2517 check whether after the main loop there are enough iterations left over
2518 for the epilogue. */
2519 if (vect_epilogues
2520 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2521 && prolog_peeling >= 0
2522 && known_eq (vf, lowest_vf))
2523 {
2524 unsigned HOST_WIDE_INT eiters
2525 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2526 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2527
2528 eiters -= prolog_peeling;
2529 eiters
2530 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2531
2532 unsigned int ratio;
2533 unsigned int epilogue_gaps
2534 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2535 while (!(constant_multiple_p (loop_vinfo->vector_size,
2536 epilogue_vinfo->vector_size, &ratio)
2537 && eiters >= lowest_vf / ratio + epilogue_gaps))
2538 {
2539 delete epilogue_vinfo;
2540 epilogue_vinfo = NULL;
2541 if (loop_vinfo->epilogue_vinfos.length () == 0)
2542 {
2543 vect_epilogues = false;
2544 break;
2545 }
2546 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2547 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2548 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2549 }
2550 }
2551 /* Prolog loop may be skipped. */
2552 bool skip_prolog = (prolog_peeling != 0);
2553 /* The vectorized loop may be skipped, jumping straight to the epilog,
2554 when there are not enough iterations to enter it. If so, we perform
2555 runtime checks on NITERS to decide whether to skip the vectorized
2556 loop. If we know the number of scalar iterations, we add such a
2557 check only when that number may be smaller than the number of
2558 iterations required to enter the vectorized loop; for this we use
2559 the upper bounds on the prolog and epilog peeling. When we don't
2560 know the number of iterations and don't require versioning, it is
2561 because we have asserted that there are enough scalar iterations to
2562 enter the main loop, so this skip is not necessary. When we are
2563 versioning, we only add such a skip if we have chosen to vectorize
2564 the epilogue. */
2565 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2566 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2567 bound_prolog + bound_epilog)
2568 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2569 || vect_epilogues));
2570 /* Epilog loop must be executed if the number of iterations for epilog
2571 loop is known at compile time, otherwise we need to add a check at
2572 the end of vector loop and skip to the end of epilog loop. */
2573 bool skip_epilog = (prolog_peeling < 0
2574 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2575 || !vf.is_constant ());
2576 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2577 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2578 skip_epilog = false;
2579
2580 if (skip_vector)
2581 {
2582 split_edge (loop_preheader_edge (loop));
2583
2584 /* Due to the order in which we peel prolog and epilog, we first
2585 propagate probability to the whole loop. The purpose is to
2586 avoid adjusting probabilities of both prolog and vector loops
2587 separately. Note in this case, the probability of epilog loop
2588 needs to be scaled back later. */
2589 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2590 if (prob_vector.initialized_p ())
2591 {
2592 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2593 scale_loop_profile (loop, prob_vector, 0);
2594 }
2595 }
2596
2597 dump_user_location_t loop_loc = find_loop_location (loop);
2598 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2599 if (vect_epilogues)
2600 /* Make sure to set the epilogue's scalar loop, such that we can use the
2601 original scalar loop as the remaining epilogue if necessary. */
2602 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2603 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2604
2605 if (prolog_peeling)
2606 {
2607 e = loop_preheader_edge (loop);
2608 if (!slpeel_can_duplicate_loop_p (loop, e))
2609 {
2610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2611 "loop can't be duplicated to preheader edge.\n");
2612 gcc_unreachable ();
2613 }
2614 /* Peel prolog and put it on preheader edge of loop. */
2615 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2616 if (!prolog)
2617 {
2618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2619 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2620 gcc_unreachable ();
2621 }
2622 prolog->force_vectorize = false;
2623 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2624 first_loop = prolog;
2625 reset_original_copy_tables ();
2626
2627 /* Update the number of iterations for prolog loop. */
2628 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2629 vect_set_loop_condition (prolog, NULL, niters_prolog,
2630 step_prolog, NULL_TREE, false);
2631
2632 /* Skip the prolog loop. */
2633 if (skip_prolog)
2634 {
2635 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2636 niters_prolog, build_int_cst (type, 0));
2637 guard_bb = loop_preheader_edge (prolog)->src;
2638 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2639 guard_to = split_edge (loop_preheader_edge (loop));
2640 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2641 guard_to, guard_bb,
2642 prob_prolog.invert (),
2643 irred_flag);
2644 e = EDGE_PRED (guard_to, 0);
2645 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2646 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2647
2648 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2649 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2650 }
2651
2652 /* Save original inits for each data_reference before advancing them with
2653 NITERS_PROLOG. */
2654 unsigned int i;
2655 struct data_reference *dr;
2656 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
2657 FOR_EACH_VEC_ELT (datarefs, i, dr)
2658 orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr)));
2659
2660 /* Update init address of DRs. */
2661 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2662 /* Update niters for vector loop. */
2663 LOOP_VINFO_NITERS (loop_vinfo)
2664 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2665 LOOP_VINFO_NITERSM1 (loop_vinfo)
2666 = fold_build2 (MINUS_EXPR, type,
2667 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2668 bool new_var_p = false;
2669 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2670 /* It's guaranteed that vector loop bound before vectorization is at
2671 least VF, so set range information for newly generated var. */
2672 if (new_var_p)
2673 set_range_info (niters, VR_RANGE,
2674 wi::to_wide (build_int_cst (type, vf)),
2675 wi::to_wide (TYPE_MAX_VALUE (type)));
2676
2677 /* Prolog iterates at most bound_prolog times, latch iterates at
2678 most bound_prolog - 1 times. */
2679 record_niter_bound (prolog, bound_prolog - 1, false, true);
2680 delete_update_ssa ();
2681 adjust_vec_debug_stmts ();
2682 scev_reset ();
2683 }
2684
2685 if (epilog_peeling)
2686 {
2687 e = single_exit (loop);
2688 if (!slpeel_can_duplicate_loop_p (loop, e))
2689 {
2690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2691 "loop can't be duplicated to exit edge.\n");
2692 gcc_unreachable ();
2693 }
2694 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2695 said epilog then we should use a copy of the main loop as a starting
2696 point. This loop may have already had some preliminary transformations
2697 to allow for more optimal vectorization, for example if-conversion.
2698 If we are not vectorizing the epilog then we should use the scalar loop
2699 as the transformations mentioned above make little or no sense when not
2700 vectorizing. */
2701 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2702 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2703 if (!epilog)
2704 {
2705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2706 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2707 gcc_unreachable ();
2708 }
2709 epilog->force_vectorize = false;
2710 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2711
2712 /* The scalar version of the loop may be preferred. In this case, add a
2713 guard and skip to the epilog. Note this only happens when the number of
2714 iterations of the loop is unknown at compile time, otherwise this
2715 won't be vectorized. */
2716 if (skip_vector)
2717 {
2718 /* Additional epilogue iteration is peeled if gap exists. */
2719 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2720 bound_prolog, bound_epilog,
2721 th, &bound_scalar,
2722 check_profitability);
2723 /* Build guard against NITERSM1 since NITERS may overflow. */
2724 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2725 guard_bb = anchor;
2726 guard_to = split_edge (loop_preheader_edge (epilog));
2727 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2728 guard_to, guard_bb,
2729 prob_vector.invert (),
2730 irred_flag);
2731 skip_e = guard_e;
2732 e = EDGE_PRED (guard_to, 0);
2733 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2734 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2735
2736 /* Simply propagate profile info from guard_bb to guard_to which is
2737 a merge point of control flow. */
2738 guard_to->count = guard_bb->count;
2739
2740 /* Scale probability of epilog loop back.
2741 FIXME: We should avoid scaling down and back up. Profile may
2742 get lost if we scale down to 0. */
2743 basic_block *bbs = get_loop_body (epilog);
2744 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2745 bbs[i]->count = bbs[i]->count.apply_scale
2746 (bbs[i]->count,
2747 bbs[i]->count.apply_probability
2748 (prob_vector));
2749 free (bbs);
2750 }
2751
2752 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2753 /* If the loop is peeled a non-zero constant number of times, niters now
2754 refers to orig_niters - prolog_peeling; it won't overflow even if
2755 orig_niters overflows. */
2756 niters_no_overflow |= (prolog_peeling > 0);
2757 vect_gen_vector_loop_niters (loop_vinfo, niters,
2758 niters_vector, step_vector,
2759 niters_no_overflow);
2760 if (!integer_onep (*step_vector))
2761 {
2762 /* On exit from the loop we will have an easy way of calculating
2763 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2764 until then. */
2765 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2766 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2767 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2768 }
2769 else
2770 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2771 &niters_vector_mult_vf);
2772 /* Update IVs of original loop as if they were advanced by
2773 niters_vector_mult_vf steps. */
2774 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2775 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2776 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2777 update_e);
2778
2779 if (skip_epilog)
2780 {
2781 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2782 niters, niters_vector_mult_vf);
2783 guard_bb = single_exit (loop)->dest;
2784 guard_to = split_edge (single_exit (epilog));
2785 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2786 skip_vector ? anchor : guard_bb,
2787 prob_epilog.invert (),
2788 irred_flag);
2789 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2790 single_exit (epilog));
2791 /* Only need to handle basic block before epilog loop if it's not
2792 the guard_bb, which is the case when skip_vector is true. */
2793 if (guard_bb != bb_before_epilog)
2794 {
2795 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2796
2797 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2798 }
2799 scale_loop_profile (epilog, prob_epilog, 0);
2800 }
2801 else
2802 slpeel_update_phi_nodes_for_lcssa (epilog);
2803
2804 unsigned HOST_WIDE_INT bound;
2805 if (bound_scalar.is_constant (&bound))
2806 {
2807 gcc_assert (bound != 0);
2808 /* -1 to convert loop iterations to latch iterations. */
2809 record_niter_bound (epilog, bound - 1, false, true);
2810 }
2811
2812 delete_update_ssa ();
2813 adjust_vec_debug_stmts ();
2814 scev_reset ();
2815 }
2816
2817 if (vect_epilogues)
2818 {
2819 epilog->aux = epilogue_vinfo;
2820 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2821
2822 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2823
2824 /* We must now calculate the number of iterations NITERS performed by the
2825 previous loop and EPILOGUE_NITERS to be performed by the epilogue. */
2826 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2827 niters_prolog, niters_vector_mult_vf);
2828
2829 /* If skip_vector, the previous loop may have been skipped; insert a
2830 phi-node to determine whether we are coming from the previous vectorized
2831 loop via the update_e edge or from the skip_vector basic block via the
2832 skip_e edge. */
2833 if (skip_vector)
2834 {
2835 gcc_assert (update_e != NULL && skip_e != NULL);
2836 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2837 update_e->dest);
2838 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2839 gimple *stmt = gimple_build_assign (new_ssa, niters);
2840 gimple_stmt_iterator gsi;
2841 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2842 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2843 {
2844 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2845 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2846 }
2847 else
2848 {
2849 gsi = gsi_last_bb (update_e->src);
2850 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2851 }
2852
2853 niters = new_ssa;
2854 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2855 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2856 UNKNOWN_LOCATION);
2857 niters = PHI_RESULT (new_phi);
2858 }
2859
2860 /* Subtract the number of iterations performed by the vectorized loop
2861 from the number of total iterations. */
2862 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2863 before_loop_niters,
2864 niters);
2865
2866 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2867 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2868 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2869 epilogue_niters,
2870 build_one_cst (TREE_TYPE (epilogue_niters)));
2871
2872 /* Set ADVANCE to the number of iterations performed by the previous
2873 loop and its prologue. */
2874 *advance = niters;
2875
2876 /* Redo the peeling for niter analysis as the NITERs and alignment
2877 may have been updated to take the main loop into account. */
2878 determine_peel_for_niter (epilogue_vinfo);
2879 }
2880
2881 adjust_vec.release ();
2882 free_original_copy_tables ();
2883
2884 return vect_epilogues ? epilog : NULL;
2885 }
2886
2887 /* Function vect_create_cond_for_niters_checks.
2888
2889 Create a conditional expression that represents the run-time checks for
2890 loop's niter. The loop is guaranteed to terminate if the run-time
2891 checks hold.
2892
2893 Input:
2894 COND_EXPR - input conditional expression. New conditions will be chained
2895 with logical AND operation. If it is NULL, the niter assumptions
2896 become the condition on their own.
2897 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2898 assumptions to be checked.
2899
2900 Output:
2901 COND_EXPR - conditional expression.
2902
2903 The returned COND_EXPR is the conditional expression to be used in the
2904 if statement that controls which version of the loop gets executed at
2905 runtime. */
2906
2907 static void
2908 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2909 {
2910 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2911
2912 if (*cond_expr)
2913 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2914 *cond_expr, part_cond_expr);
2915 else
2916 *cond_expr = part_cond_expr;
2917 }
2918
2919 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2920 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2921
2922 static void
2923 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2924 {
2925 if (*cond_expr)
2926 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2927 *cond_expr, part_cond_expr);
2928 else
2929 *cond_expr = part_cond_expr;
2930 }
2931
2932 /* Function vect_create_cond_for_align_checks.
2933
2934 Create a conditional expression that represents the alignment checks for
2935 all data references (array element references) whose alignment must be
2936 checked at runtime.
2937
2938 Input:
2939 COND_EXPR - input conditional expression. New conditions will be chained
2940 with logical AND operation.
2941 LOOP_VINFO - two fields of the loop information are used.
2942 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2943 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2944
2945 Output:
2946 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2947 expression.
2948 The returned value is the conditional expression to be used in the if
2949 statement that controls which version of the loop gets executed at runtime.
2950
2951 The algorithm makes two assumptions:
2952 1) The number of bytes "n" in a vector is a power of 2.
2953 2) An address "a" is aligned if a%n is zero, and this
2954 test can be done as a&(n-1) == 0. For example, for 16
2955 byte vectors the test is a&0xf == 0. */
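
/* For instance (an illustrative addition): two references whose first
 vectors start at 0x1000 and 0x1008 OR together to 0x1008; with 16-byte
 vectors the mask is 0xf and 0x1008 & 0xf = 8 != 0, so the condition is
 false and the scalar version of the loop is executed. */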
2956
2957 static void
2958 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2959 tree *cond_expr,
2960 gimple_seq *cond_expr_stmt_list)
2961 {
2962 vec<stmt_vec_info> may_misalign_stmts
2963 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2964 stmt_vec_info stmt_info;
2965 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2966 tree mask_cst;
2967 unsigned int i;
2968 tree int_ptrsize_type;
2969 char tmp_name[20];
2970 tree or_tmp_name = NULL_TREE;
2971 tree and_tmp_name;
2972 gimple *and_stmt;
2973 tree ptrsize_zero;
2974 tree part_cond_expr;
2975
2976 /* Check that mask is one less than a power of 2, i.e., mask is
2977 all zeros followed by all ones. */
2978 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2979
2980 int_ptrsize_type = signed_type_for (ptr_type_node);
2981
2982 /* Create the expression (mask & (dr_1 | ... | dr_n)) where dr_i is the
2983 address of the first vector of the i'th data reference. */
2984
2985 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2986 {
2987 gimple_seq new_stmt_list = NULL;
2988 tree addr_base;
2989 tree addr_tmp_name;
2990 tree new_or_tmp_name;
2991 gimple *addr_stmt, *or_stmt;
2992 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2993 bool negative = tree_int_cst_compare
2994 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2995 tree offset = negative
2996 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2997
2998 /* create: addr_tmp = (int)(address_of_first_vector) */
2999 addr_base =
3000 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3001 offset);
3002 if (new_stmt_list != NULL)
3003 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3004
3005 sprintf (tmp_name, "addr2int%d", i);
3006 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3007 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3008 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3009
3010 /* The addresses are ORed together. */
3011
3012 if (or_tmp_name != NULL_TREE)
3013 {
3014 /* create: or_tmp = or_tmp | addr_tmp */
3015 sprintf (tmp_name, "orptrs%d", i);
3016 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3017 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3018 or_tmp_name, addr_tmp_name);
3019 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3020 or_tmp_name = new_or_tmp_name;
3021 }
3022 else
3023 or_tmp_name = addr_tmp_name;
3024
3025 } /* end for i */
3026
3027 mask_cst = build_int_cst (int_ptrsize_type, mask);
3028
3029 /* create: and_tmp = or_tmp & mask */
3030 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3031
3032 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3033 or_tmp_name, mask_cst);
3034 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3035
3036 /* Make and_tmp the left operand of the conditional test against zero.
3037 If and_tmp has a nonzero bit then some address is unaligned. */
3038 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3039 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3040 and_tmp_name, ptrsize_zero);
3041 chain_cond_expr (cond_expr, part_cond_expr);
3042 }
3043
3044 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3045 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3046 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3047 and this new condition are true. Treat a null *COND_EXPR as "true". */
3048
3049 static void
3050 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3051 {
3052 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3053 unsigned int i;
3054 vec_object_pair *pair;
3055 FOR_EACH_VEC_ELT (pairs, i, pair)
3056 {
3057 tree addr1 = build_fold_addr_expr (pair->first);
3058 tree addr2 = build_fold_addr_expr (pair->second);
3059 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3060 addr1, addr2);
3061 chain_cond_expr (cond_expr, part_cond_expr);
3062 }
3063 }
3064
3065 /* Create an expression that is true when all lower-bound conditions for
3066 the vectorized loop are met. Chain this condition with *COND_EXPR. */
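
/* A worked example (an addition): for a signed bound with min_value = 8 the
 emitted check is (unsigned) (expr + 7) >= 15, which holds exactly when
 expr >= 8 or expr <= -8, i.e. when the magnitude of expr reaches the
 required lower bound. */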
3067
3068 static void
3069 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3070 {
3071 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3072 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3073 {
3074 tree expr = lower_bounds[i].expr;
3075 tree type = unsigned_type_for (TREE_TYPE (expr));
3076 expr = fold_convert (type, expr);
3077 poly_uint64 bound = lower_bounds[i].min_value;
3078 if (!lower_bounds[i].unsigned_p)
3079 {
3080 expr = fold_build2 (PLUS_EXPR, type, expr,
3081 build_int_cstu (type, bound - 1));
3082 bound += bound - 1;
3083 }
3084 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3085 build_int_cstu (type, bound));
3086 chain_cond_expr (cond_expr, part_cond_expr);
3087 }
3088 }
3089
3090 /* Function vect_create_cond_for_alias_checks.
3091
3092 Create a conditional expression that represents the run-time checks for
3093 overlapping of address ranges represented by a list of data references
3094 relations passed as input.
3095
3096 Input:
3097 COND_EXPR - input conditional expression. New conditions will be chained
3098 with logical AND operation. If it is NULL, then the function
3099 is used to return the number of alias checks.
3100 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3101 to be checked.
3102
3103 Output:
3104 COND_EXPR - conditional expression.
3105
3106 The returned COND_EXPR is the conditional expression to be used in the if
3107 statement that controls which version of the loop gets executed at runtime.
3108 */
3109
3110 void
3111 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3112 {
3113 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3114 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3115
3116 if (comp_alias_ddrs.is_empty ())
3117 return;
3118
3119 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3120 &comp_alias_ddrs, cond_expr);
3121 if (dump_enabled_p ())
3122 dump_printf_loc (MSG_NOTE, vect_location,
3123 "created %u versioning for alias checks.\n",
3124 comp_alias_ddrs.length ());
3125 }
3126
3127
3128 /* Function vect_loop_versioning.
3129
3130 If the loop has data references that may or may not be aligned and/or
3131 has data reference relations whose independence was not proven, then
3132 two versions of the loop need to be generated, one which is vectorized
3133 and one which isn't. A test is then generated to control which of the
3134 loops is executed. The test checks for the alignment of all of the
3135 data references that may or may not be aligned. An additional
3136 sequence of runtime tests is generated for each pair of DDRs whose
3137 independence was not proven. The vectorized version of the loop is
3138 executed only if both alias and alignment tests are passed.
3139
3140 The test generated to check which version of loop is executed
3141 is modified to also check for profitability as indicated by the
3142 cost model threshold TH.
3143
3144 The versioning precondition(s) are placed in *COND_EXPR and
3145 *COND_EXPR_STMT_LIST. */
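
/* Schematically (an illustrative addition, not from the original comment),
 the versioned code has the shape:

     if (niters >= threshold && all_addresses_aligned && no_aliasing)
       vectorized loop;
     else
       scalar loop;

 where the individual conditions are only emitted when the corresponding
 versioning requirement is recorded in LOOP_VINFO. */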
3146
3147 class loop *
3148 vect_loop_versioning (loop_vec_info loop_vinfo)
3149 {
3150 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3151 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3152 basic_block condition_bb;
3153 gphi_iterator gsi;
3154 gimple_stmt_iterator cond_exp_gsi;
3155 basic_block merge_bb;
3156 basic_block new_exit_bb;
3157 edge new_exit_e, e;
3158 gphi *orig_phi, *new_phi;
3159 tree cond_expr = NULL_TREE;
3160 gimple_seq cond_expr_stmt_list = NULL;
3161 tree arg;
3162 profile_probability prob = profile_probability::likely ();
3163 gimple_seq gimplify_stmt_list = NULL;
3164 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3165 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3166 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3167 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3168 poly_uint64 versioning_threshold
3169 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3170 tree version_simd_if_cond
3171 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3172 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3173
3174 if (th >= vect_vf_for_cost (loop_vinfo)
3175 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3176 && !ordered_p (th, versioning_threshold))
3177 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3178 build_int_cst (TREE_TYPE (scalar_loop_iters),
3179 th - 1));
3180 if (maybe_ne (versioning_threshold, 0U))
3181 {
3182 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3183 build_int_cst (TREE_TYPE (scalar_loop_iters),
3184 versioning_threshold - 1));
3185 if (cond_expr)
3186 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3187 expr, cond_expr);
3188 else
3189 cond_expr = expr;
3190 }
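  /* Added note for clarity: SCALAR_LOOP_ITERS above is LOOP_VINFO_NITERSM1,
     i.e. the iteration count minus one, so comparing it against TH - 1 and
     VERSIONING_THRESHOLD - 1 with GE_EXPR is equivalent to requiring
     niters >= TH and niters >= VERSIONING_THRESHOLD respectively.  */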
3191
3192 if (version_niter)
3193 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3194
3195 if (cond_expr)
3196 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3197 &cond_expr_stmt_list,
3198 is_gimple_condexpr, NULL_TREE);
3199
3200 if (version_align)
3201 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3202 &cond_expr_stmt_list);
3203
3204 if (version_alias)
3205 {
3206 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3207 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3208 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3209 }
3210
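  /* Added descriptive note: when VERSION_SIMD_IF_COND is set, the block
     below folds a VERSION_SIMD_IF_COND != 0 test into the versioning
     condition, so the vectorized version is only entered when that
     condition evaluates to true at runtime.  */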
3211 if (version_simd_if_cond)
3212 {
3213 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3214 if (flag_checking)
3215 if (basic_block bb
3216 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3217 gcc_assert (bb != loop->header
3218 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3219 && (scalar_loop == NULL
3220 || (bb != scalar_loop->header
3221 && dominated_by_p (CDI_DOMINATORS,
3222 scalar_loop->header, bb))));
3223 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3224 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3225 version_simd_if_cond, zero);
3226 if (cond_expr)
3227 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3228 c, cond_expr);
3229 else
3230 cond_expr = c;
3231 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 "created versioning for simd if condition check.\n");
3234 }
3235
3236 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3237 &gimplify_stmt_list,
3238 is_gimple_condexpr, NULL_TREE);
3239 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3240
3241 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3242 invariant in. */
3243 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3244 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3245 !gsi_end_p (gsi); gsi_next (&gsi))
3246 {
3247 gimple *stmt = gsi_stmt (gsi);
3248 update_stmt (stmt);
3249 ssa_op_iter iter;
3250 use_operand_p use_p;
3251 basic_block def_bb;
3252 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3253 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3254 && flow_bb_inside_loop_p (outermost, def_bb))
3255 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3256 }
3257
3258 /* Search for the outermost loop we can version. Avoid versioning of
3259 non-perfect nests but allow if-conversion versioned loops inside. */
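  /* Added illustrative note (a sketch only): for a perfect nest such as

       for (i = 0; i < n; i++)     // outer loop with a single exit
         for (j = 0; j < m; j++)   // loop being vectorized
           a[i][j] = ...;

     the runtime checks can be hoisted by versioning the outer loop once
     rather than versioning the inner loop on every outer iteration.  */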
3260 class loop *loop_to_version = loop;
3261 if (flow_loop_nested_p (outermost, loop))
3262 {
3263 if (dump_enabled_p ())
3264 dump_printf_loc (MSG_NOTE, vect_location,
3265 "trying to apply versioning to outer loop %d\n",
3266 outermost->num);
3267 if (outermost->num == 0)
3268 outermost = superloop_at_depth (loop, 1);
3269 /* And avoid applying versioning on non-perfect nests. */
3270 while (loop_to_version != outermost
3271 && single_exit (loop_outer (loop_to_version))
3272 && (!loop_outer (loop_to_version)->inner->next
3273 || vect_loop_vectorized_call (loop_to_version))
3274 && (!loop_outer (loop_to_version)->inner->next
3275 || !loop_outer (loop_to_version)->inner->next->next))
3276 loop_to_version = loop_outer (loop_to_version);
3277 }
3278
3279 /* Apply versioning. If there is already a scalar version created by
3280 if-conversion, re-use that. Note we cannot re-use the copy of
3281 an if-converted outer loop when vectorizing the inner loop only. */
3282 gcond *cond;
3283 gimple *call;
3284 if ((!loop_to_version->inner || loop == loop_to_version)
3285 && (call = vect_loop_vectorized_call (loop_to_version, &cond)))
3286 {
3287 gcc_assert (scalar_loop);
3288 condition_bb = gimple_bb (cond);
3289 gimple_cond_set_condition_from_tree (cond, cond_expr);
3290 update_stmt (cond);
3291
3292 if (cond_expr_stmt_list)
3293 {
3294 cond_exp_gsi = gsi_for_stmt (call);
3295 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3296 GSI_SAME_STMT);
3297 }
3298
3299 /* if-conversion uses profile_probability::always () for both paths,
3300 so reset the probabilities of both paths appropriately. */
3301 edge te, fe;
3302 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3303 te->probability = prob;
3304 fe->probability = prob.invert ();
3305 /* We can scale the loop's counts immediately but have to postpone
3306 scaling the scalar loop because we re-use it during peeling. */
3307 scale_loop_frequencies (loop_to_version, te->probability);
3308 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3309
3310 nloop = scalar_loop;
3311 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_NOTE, vect_location,
3313 "reusing %sloop version created by if conversion\n",
3314 loop_to_version != loop ? "outer " : "");
3315 }
3316 else
3317 {
3318 if (loop_to_version != loop
3319 && dump_enabled_p ())
3320 dump_printf_loc (MSG_NOTE, vect_location,
3321 "applying loop versioning to outer loop %d\n",
3322 loop_to_version->num);
3323
3324 initialize_original_copy_tables ();
3325 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3326 prob, prob.invert (), prob, prob.invert (), true);
3327 gcc_assert (nloop);
3328 nloop = get_loop_copy (loop);
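      /* Added descriptive note: loop_version above duplicated LOOP_TO_VERSION
	 together with any loops nested in it; get_loop_copy then retrieves
	 the copy corresponding to LOOP itself, i.e. the scalar version that
	 the vectorizer keeps track of.  */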
3329
3330 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3331 reap those otherwise, and they also refer to the original
3332 loops. */
3333 class loop *l = loop;
3334 while (gimple *call = vect_loop_vectorized_call (l))
3335 {
3336 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3337 fold_loop_internal_call (call, boolean_false_node);
3338 l = loop_outer (l);
3339 }
3340 free_original_copy_tables ();
3341
3342 if (cond_expr_stmt_list)
3343 {
3344 cond_exp_gsi = gsi_last_bb (condition_bb);
3345 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3346 GSI_SAME_STMT);
3347 }
3348
3349 /* Loop versioning violates an assumption we try to maintain during
3350 vectorization - that the loop exit block has a single predecessor.
3351 After versioning, the exit block of both loop versions is the same
3352 basic block (i.e. it has two predecessors). To simplify the
3353 following transformations in the vectorizer, we fix this situation
3354 here by adding a new (empty) block on the exit-edge of the loop,
3355 with the proper loop-exit phis to maintain loop-closed form.
3356 If loop versioning wasn't done from loop but from scalar_loop instead,
3357 merge_bb will already have just a single predecessor. */
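      /* Added descriptive note: after split_edge below, the vectorized
	 version exits into the new empty NEW_EXIT_BB whose single successor
	 is MERGE_BB, while the scalar version still branches to MERGE_BB
	 directly; each PHI in MERGE_BB then receives the vector-side value
	 via a fresh PHI created in NEW_EXIT_BB.  */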
3358
3359 merge_bb = single_exit (loop_to_version)->dest;
3360 if (EDGE_COUNT (merge_bb->preds) >= 2)
3361 {
3362 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3363 new_exit_bb = split_edge (single_exit (loop_to_version));
3364 new_exit_e = single_exit (loop_to_version);
3365 e = EDGE_SUCC (new_exit_bb, 0);
3366
3367 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3368 gsi_next (&gsi))
3369 {
3370 tree new_res;
3371 orig_phi = gsi.phi ();
3372 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3373 new_phi = create_phi_node (new_res, new_exit_bb);
3374 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3375 add_phi_arg (new_phi, arg, new_exit_e,
3376 gimple_phi_arg_location_from_edge (orig_phi, e));
3377 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3378 }
3379 }
3380
3381 update_ssa (TODO_update_ssa);
3382 }
3383
3384 if (version_niter)
3385 {
3386 /* The versioned loop could be infinite; we need to clear the existing
3387 niter information, which is copied from the original loop. */
3388 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3389 vect_free_loop_info_assumptions (nloop);
3390 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3391 loop_constraint_set (loop, LOOP_C_INFINITE);
3392 }
3393
3394 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3395 && dump_enabled_p ())
3396 {
3397 if (version_alias)
3398 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3399 vect_location,
3400 "loop versioned for vectorization because of "
3401 "possible aliasing\n");
3402 if (version_align)
3403 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3404 vect_location,
3405 "loop versioned for vectorization to enhance "
3406 "alignment\n");
3407
3408 }
3409
3410 return nloop;
3411 }