Fix ICE on unsupported FP comparison
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53
54 /*************************************************************************
55 Simple Loop Peeling Utilities
56
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61 /* Renames the use *OP_P. */
62
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66 tree new_name;
67
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
70
71 new_name = get_current_def (USE_FROM_PTR (op_p));
72
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
76
77 /* An ordinary ssa name defined in the loop. */
78
79 SET_USE (op_p, new_name);
80 }
81
82
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
 84    on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
85 true. */
86
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
97
98 if (rename_from_outer_loop)
99 {
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
102 }
103
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
106 {
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
110 }
111
112 FOR_EACH_EDGE (e, ei, bb->preds)
113 {
114 if (!flow_bb_inside_loop_p (loop, e->src))
115 {
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
119 {
120 if (outer_loop->inner->next)
121 {
122		  /* If outer_loop has 2 inner loops, allow there to
123		     be an extra basic block that uses LOOP_VECTORIZED to
124		     decide which of the two loops to run.  */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
128 }
129 }
130 }
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134 }
135 }
136
137
138 struct adjust_info
139 {
140 tree from, to;
141 basic_block bb;
142 };
143
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
151
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
155
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171 {
172 use_operand_p use_p;
173 basic_block bbuse;
174
175 if (!is_gimple_debug (stmt))
176 continue;
177
178 gcc_assert (gimple_debug_bind_p (stmt));
179
180 bbuse = gimple_bb (stmt);
181
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 {
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
191 {
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
194 }
195 }
196 }
197 }
198
199 /* Adjust debug stmts as scheduled before. */
200
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
206
207 gcc_assert (adjust_vec.exists ());
208
209 while (!adjust_vec.is_empty ())
210 {
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
213 }
214 }
215
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
220
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224 adjust_info ai;
225
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
230 {
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
234
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
239 }
240 }
241
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
246
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
257 }
258
259 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
260 the mask should have during the first iteration and NEXT_MASK is the
261 value that it should have on subsequent iterations. */
262
263 static void
264 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265 tree next_mask)
266 {
267 gphi *phi = create_phi_node (mask, loop->header);
268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271
272 /* Add SEQ to the end of LOOP's preheader block. */
273
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277 if (seq)
278 {
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
282 }
283 }
284
285 /* Add SEQ to the beginning of LOOP's header block. */
286
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290 if (seq)
291 {
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 }
295 }
296
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
301
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
305 {
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
313 }
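  /* For example (illustrative): with 4-element vectors and OFFSET == 0,
     the encoding above describes the selector { 0, 4, 1, 5 }, i.e. an
     interleave of the first halves of the two input vectors.  */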
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
321
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
324 rgroup_masks *src_rgm)
325 {
326 tree src_masktype = src_rgm->mask_type;
327 tree dest_masktype = dest_rgm->mask_type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
335 {
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
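      /* Each VEC_UNPACK_{HI,LO}_EXPR widens one half of SRC's lanes into
	 DEST_MODE's wider lanes; which half feeds the even-numbered
	 destination masks depends on endianness, hence the
	 BYTES_BIG_ENDIAN test below.  */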
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
342 {
343 tree src = src_rgm->masks[i / 2];
344 tree dest = dest_rgm->masks[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
352 {
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
359 }
360 gimple_seq_add_stmt (seq, stmt);
361 }
362 return true;
363 }
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
368 {
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
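      /* Interleaving a mask with itself duplicates each lane, so the
	 lo/hi permutes of SRC_RGM->masks[i / 2] spread its lanes, each
	 repeated twice, across two destination masks.  */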
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
375 {
376 tree src = src_rgm->masks[i / 2];
377 tree dest = dest_rgm->masks[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
381 }
382 return true;
383 }
384 return false;
385 }
386
387 /* Helper for vect_set_loop_condition_masked. Generate definitions for
388 all the masks in RGM and return a mask that is nonzero when the loop
389 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
390 Use LOOP_COND_GSI to insert code before the exit gcond.
391
392 RGM belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
394
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
399
400 It is known that:
401
402 NITERS * RGM->max_nscalars_per_iter
403
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
406
407 VF * RGM->max_nscalars_per_iter
408
409 might overflow before hitting a value above:
410
411 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
412
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGM. */
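/* Illustrative example: with a 16-bit COMPARE_TYPE and
   VF * RGM->max_nscalars_per_iter == 64, the IV wraps after 1024 vector
   iterations, so for large enough NITERS the latch test must use the
   value of the IV from before the increment; that is the MIGHT_WRAP_P
   case handled below.  */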
415
416 static tree
417 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_masks *rgm, tree niters, tree niters_skip,
421 bool might_wrap_p)
422 {
423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
425 tree mask_type = rgm->mask_type;
426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
429
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
438 {
439 /* We checked before choosing to use a fully-masked loop that these
440 multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
450 }
451
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
460
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
465 {
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
468
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
470
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
475
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
478
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
482
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
485
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
487
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
489
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
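      /* The MAX/MINUS pair below implements that saturating subtraction;
	 e.g. (illustrative) with NSCALARS_STEP == 16 and NSCALARS_SKIP == 3
	 the limit becomes NSCALARS_TOTAL -[sat] 13.  */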
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
503
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
506 {
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
517 }
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
523 }
524 else
525 {
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
534
535 first_limit = test_limit;
536 }
537
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
543
544 /* Provide a definition of each mask in the group. */
545 tree next_mask = NULL_TREE;
546 tree mask;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
549 {
550 /* Previous masks will cover BIAS scalars. This mask covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_mask * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
555
556 /* See whether the first iteration of the vector loop is known
557 to have a full mask. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
562
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each mask
565 and adjust the bound down by BIAS. */
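      /* MAX followed by MINUS below gives the saturating form
	 TEST_LIMIT -[sat] BIAS; e.g. (illustrative) the second of two
	 8-lane masks has BIAS == 8 and bound TEST_LIMIT -[sat] 8.  */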
566 tree this_test_limit = test_limit;
567 if (i != 0)
568 {
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
575 }
576
577 /* Create the initial mask. First include all scalars that
578 are within the loop limit. */
579 tree init_mask = NULL_TREE;
580 if (!first_iteration_full)
581 {
582 tree start, end;
583 if (first_limit == test_limit)
584 {
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
591 }
592 else
593 {
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this mask. */
597 start = bias_tree;
598 end = first_limit;
599 }
600
601 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_mask, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
604 }
605
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
612 {
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
614 bias_tree, nscalars_skip);
615 if (init_mask)
616 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
617 init_mask, unskipped_mask);
618 else
619 init_mask = unskipped_mask;
620 }
621
622 if (!init_mask)
623 /* First iteration is full. */
624 init_mask = build_minus_one_cst (mask_type);
625
626 /* Get the mask value for the next iteration of the loop. */
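      /* vect_gen_while emits an IFN_WHILE_ULT call whose lane J is true
	 iff TEST_INDEX + J < THIS_TEST_LIMIT.  */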
627 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
630
631 vect_set_loop_mask (loop, mask, init_mask, next_mask);
632 }
633 return next_mask;
634 }
635
636 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
638 number of iterations of the original scalar loop that should be
639 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
640 as for vect_set_loop_condition.
641
642 Insert the branch-back condition before LOOP_COND_GSI and return the
643 final gcond. */
644
645 static gcond *
646 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647 tree niters, tree final_iv,
648 bool niters_maybe_zero,
649 gimple_stmt_iterator loop_cond_gsi)
650 {
651 gimple_seq preheader_seq = NULL;
652 gimple_seq header_seq = NULL;
653
654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655 unsigned int compare_precision = TYPE_PRECISION (compare_type);
656 tree orig_niters = niters;
657
658 /* Type of the initial value of NITERS. */
659 tree ni_actual_type = TREE_TYPE (niters);
660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
662
663 /* Convert NITERS to the same size as the compare. */
664 if (compare_precision > ni_actual_precision
665 && niters_maybe_zero)
666 {
667 /* We know that there is always at least one iteration, so if the
668 count is zero then it must have wrapped. Cope with this by
669 subtracting 1 before the conversion and adding 1 to the result. */
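      /* For example (illustrative): an 8-bit NITERS of 0 stands for 256
	 iterations; 0 - 1 gives 255, which converts to 255 in the wider
	 COMPARE_TYPE, and adding 1 back yields 256.  */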
670 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672 niters, build_minus_one_cst (ni_actual_type));
673 niters = gimple_convert (&preheader_seq, compare_type, niters);
674 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675 niters, build_one_cst (compare_type));
676 }
677 else
678 niters = gimple_convert (&preheader_seq, compare_type, niters);
679
680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
681
682 /* Iterate over all the rgroups and fill in their masks. We could use
683 the first mask from any rgroup for the loop condition; here we
684 arbitrarily pick the last. */
685 tree test_mask = NULL_TREE;
686 rgroup_masks *rgm;
687 unsigned int i;
688 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689 FOR_EACH_VEC_ELT (*masks, i, rgm)
690 if (!rgm->masks.is_empty ())
691 {
692 /* First try using permutes. This adds a single vector
693 instruction to the loop for each mask, but needs no extra
694 loop invariants or IVs. */
695 unsigned int nmasks = i + 1;
696 if ((nmasks & 1) == 0)
697 {
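	  /* (*masks)[J] describes the rgroup that needs J + 1 mask vectors
	     per iteration, so the rgroup with half as many masks lives at
	     index NMASKS / 2 - 1.  */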
698 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699 if (!half_rgm->masks.is_empty ()
700 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701 continue;
702 }
703
704 /* See whether zero-based IV would ever generate all-false masks
705 before wrapping around. */
706 bool might_wrap_p
707 = (iv_limit == -1
708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709 UNSIGNED)
710 > compare_precision));
711
712 /* Set up all masks for this group. */
713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714 &preheader_seq,
715 loop_cond_gsi, rgm,
716 niters, niters_skip,
717 might_wrap_p);
718 }
719
720 /* Emit all accumulated statements. */
721 add_preheader_seq (loop, preheader_seq);
722 add_header_seq (loop, header_seq);
723
724 /* Get a boolean result that tells us whether to iterate. */
725 edge exit_edge = single_exit (loop);
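  /* If the exit edge is taken when the condition is true, we exit once
     every lane of TEST_MASK is false (EQ against zero); otherwise we
     keep looping while some lane is still true (NE).  */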
726 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729 NULL_TREE, NULL_TREE);
730 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
731
732 /* The loop iterates (NITERS - 1) / VF + 1 times.
733 Subtract one from this to get the latch count. */
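  /* E.g. (illustrative) NITERS == 10 and VF == 4: the vector loop runs
     three times and the latch twice, (10 - 1) / 4 == 2.  */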
734 tree step = build_int_cst (compare_type,
735 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737 build_minus_one_cst (compare_type));
738 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739 niters_minus_one, step);
740
741 if (final_iv)
742 {
743 gassign *assign = gimple_build_assign (final_iv, orig_niters);
744 gsi_insert_on_edge_immediate (single_exit (loop), assign);
745 }
746
747 return cond_stmt;
748 }
749
750 /* Like vect_set_loop_condition, but handle the case in which there
751 are no loop masks. */
752
753 static gcond *
754 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755 tree step, tree final_iv,
756 bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
758 {
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
768
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
772
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
775 {
776 /* In this case we can use a simple 0-based IV:
777
778 A:
779 x = 0;
780 do
781 {
782 ...
783 x += 1;
784 }
785 while (x < NITERS); */
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
789 }
790 else
791 {
792 /* The following works for all values of NITERS except 0:
793
794 B:
795 x = 0;
796 do
797 {
798 ...
799 x += STEP;
800 }
801 while (x <= NITERS - STEP);
802
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
805
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
809
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
812
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
815
816 C:
817 x = -1;
818 do
819 {
820 ...
821 x += STEP;
822 }
823 while (x < NITERS - STEP);
824
825 In both cases the loop limit is NITERS - STEP. */
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
830 {
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
833 }
834 if (niters_maybe_zero)
835 {
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
839 }
840 else
841 {
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
845 }
846 }
847
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
856
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
859
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
861
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
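    /* E.g. (illustrative) case B with NITERS == 8 and STEP == 4:
       LIMIT == 4, the loop runs twice and the latch once, 4 / 4 == 1.  */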
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
873
874 if (final_iv)
875 {
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
879 }
880
881 return cond_stmt;
882 }
883
884 /* If we're using fully-masked loops, make LOOP iterate:
885
886 N == (NITERS - 1) / STEP + 1
887
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
891
892 If we're not using fully-masked loops, make LOOP iterate:
893
894 N == (NITERS - STEP) / STEP + 1
895
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
900
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
903
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
905
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
910 {
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
914
915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917 final_iv, niters_maybe_zero,
918 loop_cond_gsi);
919 else
920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921 final_iv, niters_maybe_zero,
922 loop_cond_gsi);
923
924 /* Remove old loop exit test. */
925 stmt_vec_info orig_cond_info;
926 if (loop_vinfo
927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928 loop_vinfo->remove_stmt (orig_cond_info);
929 else
930 gsi_remove (&loop_cond_gsi, true);
931
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934 cond_stmt);
935 }
936
937 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 938    For all PHI arguments in FROM->dest and TO->dest coming in on those
 939    edges, ensure that each TO->dest PHI argument has its current_def
 940    set to that of the corresponding argument in FROM->dest.  */
941
942 static void
943 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
944 {
945 gimple_stmt_iterator gsi_from, gsi_to;
946
947 for (gsi_from = gsi_start_phis (from->dest),
948 gsi_to = gsi_start_phis (to->dest);
949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
950 {
951 gimple *from_phi = gsi_stmt (gsi_from);
952 gimple *to_phi = gsi_stmt (gsi_to);
953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955 if (virtual_operand_p (from_arg))
956 {
957 gsi_next (&gsi_from);
958 continue;
959 }
960 if (virtual_operand_p (to_arg))
961 {
962 gsi_next (&gsi_to);
963 continue;
964 }
965 if (TREE_CODE (from_arg) != SSA_NAME)
966 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
967 else if (TREE_CODE (to_arg) == SSA_NAME
968 && from_arg != to_arg)
969 {
970 if (get_current_def (to_arg) == NULL_TREE)
971 {
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973 TREE_TYPE (get_current_def
974 (from_arg))));
975 set_current_def (to_arg, get_current_def (from_arg));
976 }
977 }
978 gsi_next (&gsi_from);
979 gsi_next (&gsi_to);
980 }
981
982 gphi *from_phi = get_virtual_phi (from->dest);
983 gphi *to_phi = get_virtual_phi (to->dest);
984 if (from_phi)
985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
987 }
988
989
990 /* Given LOOP this function generates a new copy of it and puts it
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994 entry or exit of LOOP. */
995
996 class loop *
997 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998 class loop *scalar_loop, edge e)
999 {
1000 class loop *new_loop;
1001 basic_block *new_bbs, *bbs, *pbbs;
1002 bool at_exit;
1003 bool was_imm_dom;
1004 basic_block exit_dest;
1005 edge exit, new_exit;
1006 bool duplicate_outer_loop = false;
1007
1008 exit = single_exit (loop);
1009 at_exit = (e == exit);
1010 if (!at_exit && e != loop_preheader_edge (loop))
1011 return NULL;
1012
1013 if (scalar_loop == NULL)
1014 scalar_loop = loop;
1015
1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1017 pbbs = bbs + 1;
1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1019 /* Allow duplication of outer loops. */
1020 if (scalar_loop->inner)
1021 duplicate_outer_loop = true;
1022 /* Check whether duplication is possible. */
1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1024 {
1025 free (bbs);
1026 return NULL;
1027 }
1028
1029 /* Generate new loop structure. */
1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031 duplicate_subloops (scalar_loop, new_loop);
1032
1033 exit_dest = exit->dest;
1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035 exit_dest) == loop->header ?
1036 true : false);
1037
1038 /* Also copy the pre-header, this avoids jumping through hoops to
1039 duplicate the loop entry PHI arguments. Create an empty
1040 pre-header unconditionally for this. */
1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1042 edge entry_e = single_pred_edge (preheader);
1043 bbs[0] = preheader;
1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1045
1046 exit = single_exit (scalar_loop);
1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1048 &exit, 1, &new_exit, NULL,
1049 at_exit ? loop->latch : e->src, true);
1050 exit = single_exit (loop);
1051 basic_block new_preheader = new_bbs[0];
1052
1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1054
1055 if (scalar_loop != loop)
1056 {
1057 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1058 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1059 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1060 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1061 header) to have current_def set, so copy them over. */
1062 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1063 exit);
1064 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1065 0),
1066 EDGE_SUCC (loop->latch, 0));
1067 }
1068
1069 if (at_exit) /* Add the loop copy at exit. */
1070 {
1071 if (scalar_loop != loop)
1072 {
1073 gphi_iterator gsi;
1074 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1075
1076 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1077 gsi_next (&gsi))
1078 {
1079 gphi *phi = gsi.phi ();
1080 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1081 location_t orig_locus
1082 = gimple_phi_arg_location_from_edge (phi, e);
1083
1084 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1085 }
1086 }
1087 redirect_edge_and_branch_force (e, new_preheader);
1088 flush_pending_stmts (e);
1089 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1090 if (was_imm_dom || duplicate_outer_loop)
1091 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1092
1093       /* And remove the unnecessary forwarder again.  Keep the other
1094	  one so we have a proper pre-header for the loop at the exit edge.  */
1095 redirect_edge_pred (single_succ_edge (preheader),
1096 single_pred (preheader));
1097 delete_basic_block (preheader);
1098 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1099 loop_preheader_edge (scalar_loop)->src);
1100 }
1101 else /* Add the copy at entry. */
1102 {
1103 if (scalar_loop != loop)
1104 {
1105	  /* Remove the unnecessary forwarder of scalar_loop again.  */
1106 redirect_edge_pred (single_succ_edge (preheader),
1107 single_pred (preheader));
1108 delete_basic_block (preheader);
1109 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1110 loop_preheader_edge (scalar_loop)->src);
1111 preheader = split_edge (loop_preheader_edge (loop));
1112 entry_e = single_pred_edge (preheader);
1113 }
1114
1115 redirect_edge_and_branch_force (entry_e, new_preheader);
1116 flush_pending_stmts (entry_e);
1117 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1118
1119 redirect_edge_and_branch_force (new_exit, preheader);
1120 flush_pending_stmts (new_exit);
1121 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1122
1123       /* And remove the unnecessary forwarder again.  Keep the other
1124	  one so we have a proper pre-header for the loop at the exit edge.  */
1125 redirect_edge_pred (single_succ_edge (new_preheader),
1126 single_pred (new_preheader));
1127 delete_basic_block (new_preheader);
1128 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1129 loop_preheader_edge (new_loop)->src);
1130 }
1131
1132 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1133 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1134 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1135
1136 if (scalar_loop != loop)
1137 {
1138 /* Update new_loop->header PHIs, so that on the preheader
1139 edge they are the ones from loop rather than scalar_loop. */
1140 gphi_iterator gsi_orig, gsi_new;
1141 edge orig_e = loop_preheader_edge (loop);
1142 edge new_e = loop_preheader_edge (new_loop);
1143
1144 for (gsi_orig = gsi_start_phis (loop->header),
1145 gsi_new = gsi_start_phis (new_loop->header);
1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1148 {
1149 gphi *orig_phi = gsi_orig.phi ();
1150 gphi *new_phi = gsi_new.phi ();
1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152 location_t orig_locus
1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1154
1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1156 }
1157 }
1158
1159 free (new_bbs);
1160 free (bbs);
1161
1162 checking_verify_dominators (CDI_DOMINATORS);
1163
1164 return new_loop;
1165 }
1166
1167
1168 /* Given the condition expression COND, put it as the last statement of
1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1172 new edge as irreducible if IRREDUCIBLE_P is true. */
1173
1174 static edge
1175 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1176 basic_block guard_to, basic_block dom_bb,
1177 profile_probability probability, bool irreducible_p)
1178 {
1179 gimple_stmt_iterator gsi;
1180 edge new_e, enter_e;
1181 gcond *cond_stmt;
1182 gimple_seq gimplify_stmt_list = NULL;
1183
1184 enter_e = EDGE_SUCC (guard_bb, 0);
1185 enter_e->flags &= ~EDGE_FALLTHRU;
1186 enter_e->flags |= EDGE_FALSE_VALUE;
1187 gsi = gsi_last_bb (guard_bb);
1188
1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190 NULL_TREE);
1191 if (gimplify_stmt_list)
1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1193
1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1195 gsi = gsi_last_bb (guard_bb);
1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1197
1198 /* Add new edge to connect guard block to the merge/loop-exit block. */
1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1200
1201 new_e->probability = probability;
1202 if (irreducible_p)
1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1204
1205 enter_e->probability = probability.invert ();
1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1207
1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1209 if (enter_e->dest->loop_father->header == enter_e->dest)
1210 split_edge (enter_e);
1211
1212 return new_e;
1213 }
1214
1215
1216 /* This function verifies that the following restrictions apply to LOOP:
1217    (1) it consists of exactly 2 basic blocks (a header and an empty latch)
1218        for an innermost loop, or of 5 basic blocks for an outer loop.
1219 (2) it is single entry, single exit
1220 (3) its exit condition is the last stmt in the header
1221 (4) E is the entry/exit edge of LOOP.
1222 */
1223
1224 bool
1225 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1226 {
1227 edge exit_e = single_exit (loop);
1228 edge entry_e = loop_preheader_edge (loop);
1229 gcond *orig_cond = get_loop_exit_condition (loop);
1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1231 unsigned int num_bb = loop->inner? 5 : 2;
1232
1233   /* All loops have an outer scope; the only case in which loop->outer is
1234      NULL is the function itself.  */
1235 if (!loop_outer (loop)
1236 || loop->num_nodes != num_bb
1237 || !empty_block_p (loop->latch)
1238 || !single_exit (loop)
1239 /* Verify that new loop exit condition can be trivially modified. */
1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241 || (e != exit_e && e != entry_e))
1242 return false;
1243
1244 return true;
1245 }
1246
1247 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248 in the exit bb and rename all the uses after the loop. This simplifies
1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250 (but normally loop closed SSA form doesn't require virtual PHIs to be
1251    in the same form).  Doing this early simplifies checking which
1252    uses should be renamed.
1253
1254 If we create a new phi after the loop, return the definition that
1255 applies on entry to the loop, otherwise return null. */
1256
1257 static tree
1258 create_lcssa_for_virtual_phi (class loop *loop)
1259 {
1260 gphi_iterator gsi;
1261 edge exit_e = single_exit (loop);
1262
1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1265 {
1266 gphi *phi = gsi.phi ();
1267 for (gsi = gsi_start_phis (exit_e->dest);
1268 !gsi_end_p (gsi); gsi_next (&gsi))
1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1270 break;
1271 if (gsi_end_p (gsi))
1272 {
1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276 imm_use_iterator imm_iter;
1277 gimple *stmt;
1278 use_operand_p use_p;
1279
1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1283 gimple_phi_set_result (new_phi, new_vop);
1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1285 if (stmt != new_phi
1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288 SET_USE (use_p, new_vop);
1289
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1291 }
1292 break;
1293 }
1294 return NULL_TREE;
1295 }
1296
1297 /* Function vect_get_loop_location.
1298
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
1302    Return the loop location if successful and NULL if not.  */
1303
1304 dump_user_location_t
1305 find_loop_location (class loop *loop)
1306 {
1307 gimple *stmt = NULL;
1308 basic_block bb;
1309 gimple_stmt_iterator si;
1310
1311 if (!loop)
1312 return dump_user_location_t ();
1313
1314 stmt = get_loop_exit_condition (loop);
1315
1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318 return stmt;
1319
1320   /* If we got here, the loop is probably not "well formed";
1321      try to estimate the loop location.  */
1322
1323 if (!loop->header)
1324 return dump_user_location_t ();
1325
1326 bb = loop->header;
1327
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 stmt = gsi_stmt (si);
1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332 return stmt;
1333 }
1334
1335 return dump_user_location_t ();
1336 }
1337
1338 /* Return true if the phi described by STMT_INFO defines an IV of the
1339 loop to be vectorized. */
1340
1341 static bool
1342 iv_phi_p (stmt_vec_info stmt_info)
1343 {
1344 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345 if (virtual_operand_p (PHI_RESULT (phi)))
1346 return false;
1347
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1351
1352 return true;
1353 }
1354
1355 /* Function vect_can_advance_ivs_p
1356
1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1363
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366 {
1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block bb = loop->header;
1369 gphi_iterator gsi;
1370
1371 /* Analyze phi functions of the loop header. */
1372
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376 {
1377 tree evolution_part;
1378
1379 gphi *phi = gsi.phi ();
1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383 phi_info->stmt);
1384
1385 /* Skip virtual phi's. The data dependences that are associated with
1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1387
1388 Skip reduction phis. */
1389 if (!iv_phi_p (phi_info))
1390 {
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
1394 continue;
1395 }
1396
1397 /* Analyze the evolution function. */
1398
1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1400 if (evolution_part == NULL_TREE)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf (MSG_MISSED_OPTIMIZATION,
1404 "No access function or evolution.\n");
1405 return false;
1406 }
1407
1408 /* FORNOW: We do not transform initial conditions of IVs
1409    whose evolution functions are not invariant in the loop.  */
1410
1411 if (!expr_invariant_in_loop_p (loop, evolution_part))
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution not invariant in loop.\n");
1416 return false;
1417 }
1418
1419 /* FORNOW: We do not transform initial conditions of IVs
1420    whose evolution functions are polynomials of degree >= 2.  */
1421
1422 if (tree_is_chrec (evolution_part))
1423 {
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "evolution is chrec.\n");
1427 return false;
1428 }
1429 }
1430
1431 return true;
1432 }
1433
1434
1435 /* Function vect_update_ivs_after_vectorizer.
1436
1437 "Advance" the induction variables of LOOP to the value they should take
1438 after the execution of LOOP. This is currently necessary because the
1439 vectorizer does not handle induction variables that are used after the
1440 loop. Such a situation occurs when the last iterations of LOOP are
1441 peeled, because:
1442 1. We introduced new uses after LOOP for IVs that were not originally used
1443 after LOOP: the IVs of LOOP are now used by an epilog loop.
1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445 times, whereas the loop IVs should be bumped N times.
1446
1447 Input:
1448 - LOOP - a loop that is going to be vectorized. The last few iterations
1449 of LOOP were peeled.
1450 - NITERS - the number of iterations that LOOP executes (before it is
1451      vectorized), i.e., the number of times the ivs should be bumped.
1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453 coming out from LOOP on which there are uses of the LOOP ivs
1454 (this is the path from LOOP->exit to epilog_loop->preheader).
1455
1456 The new definitions of the ivs are placed in LOOP->exit.
1457 The phi args associated with the edge UPDATE_E in the bb
1458 UPDATE_E->dest are updated accordingly.
1459
1460 Assumption 1: Like the rest of the vectorizer, this function assumes
1461 a single loop exit that has a single predecessor.
1462
1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464 organized in the same order.
1465
1466 Assumption 3: The access function of the ivs is simple enough (see
1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1468
1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1470 coming out of LOOP on which the ivs of LOOP are used (this is the path
1471 that leads to the epilog loop; other paths skip the epilog loop). This
1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473 needs to have its phis updated.
1474 */
1475
1476 static void
1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 tree niters, edge update_e)
1479 {
1480 gphi_iterator gsi, gsi1;
1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482 basic_block update_bb = update_e->dest;
1483 basic_block exit_bb = single_exit (loop)->dest;
1484
1485 /* Make sure there exists a single-predecessor exit bb: */
1486 gcc_assert (single_pred_p (exit_bb));
1487 gcc_assert (single_succ_edge (exit_bb) == update_e);
1488
1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491 gsi_next (&gsi), gsi_next (&gsi1))
1492 {
1493 tree init_expr;
1494 tree step_expr, off;
1495 tree type;
1496 tree var, ni, ni_name;
1497 gimple_stmt_iterator last_gsi;
1498
1499 gphi *phi = gsi.phi ();
1500 gphi *phi1 = gsi1.phi ();
1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_NOTE, vect_location,
1504 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1505
1506 /* Skip reduction and virtual phis. */
1507 if (!iv_phi_p (phi_info))
1508 {
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "reduc or virtual phi. skip.\n");
1512 continue;
1513 }
1514
1515 type = TREE_TYPE (gimple_phi_result (phi));
1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517 step_expr = unshare_expr (step_expr);
1518
1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520 of degree >= 2 or exponential. */
1521 gcc_assert (!tree_is_chrec (step_expr));
1522
1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1524
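      /* Compute the value the IV has after NITERS scalar iterations:
	 INIT + NITERS * STEP, or the equivalent pointer increment for
	 pointer IVs.  */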
1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 fold_convert (TREE_TYPE (step_expr), niters),
1527 step_expr);
1528 if (POINTER_TYPE_P (type))
1529 ni = fold_build_pointer_plus (init_expr, off);
1530 else
1531 ni = fold_build2 (PLUS_EXPR, type,
1532 init_expr, fold_convert (type, off));
1533
1534 var = create_tmp_var (type, "tmp");
1535
1536 last_gsi = gsi_last_bb (exit_bb);
1537 gimple_seq new_stmts = NULL;
1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539 /* Exit_bb shouldn't be empty. */
1540 if (!gsi_end_p (last_gsi))
1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542 else
1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1544
1545 /* Fix phi expressions in the successor bb. */
1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1547 }
1548 }
1549
1550 /* Return a gimple value containing the misalignment (measured in vector
1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552 it is away from a perfectly aligned address. Add any new statements
1553 to SEQ. */
1554
1555 static tree
1556 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557 {
1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559 stmt_vec_info stmt_info = dr_info->stmt;
1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561
1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563 unsigned HOST_WIDE_INT target_align_c;
1564 tree target_align_minus_1;
1565
1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 size_zero_node) < 0;
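  /* A negative-step access covers DR's first element and the VF - 1
     elements below it, so the misalignment is measured from that lower
     starting address.  */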
1568 tree offset = (negative
1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 : size_zero_node);
1571 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1572 offset);
1573 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1574 if (target_align.is_constant (&target_align_c))
1575 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1576 else
1577 {
1578 tree vla = build_int_cst (type, target_align);
1579 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1580 fold_build2 (MINUS_EXPR, type,
1581 build_int_cst (type, 0), vla));
1582 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1583 build_int_cst (type, 1));
1584 }
1585
1586 HOST_WIDE_INT elem_size
1587 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1588 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589
1590 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1591 tree int_start_addr = fold_convert (type, start_addr);
1592 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593 target_align_minus_1);
1594
1595 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1596 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597 elem_size_log);
1598
1599 return misalign_in_elems;
1600 }
1601
1602 /* Function vect_gen_prolog_loop_niters
1603
1604 Generate the number of iterations which should be peeled as prolog for the
1605 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1606 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1607 As a result, after the execution of this loop, the data reference DR will
1608 refer to an aligned location. The following computation is generated:
1609
1610 If the misalignment of DR is known at compile time:
1611 addr_mis = int mis = DR_MISALIGNMENT (dr);
1612 Else, compute address misalignment in bytes:
1613 addr_mis = addr & (target_align - 1)
1614
1615 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1616
1617 (elem_size = element type size; an element is the scalar element whose type
1618 is the inner type of the vectype)
1619
1620    The computations will be emitted at the end of BB.  We also compute and
1621    store the upper bound (inclusive) of the result in BOUND.
1622
1623 When the step of the data-ref in the loop is not 1 (as in interleaved data
1624 and SLP), the number of iterations of the prolog must be divided by the step
1625    (which is equal to the size of the interleaved group).
1626
1627 The above formulas assume that VF == number of elements in the vector. This
1628 may not hold when there are multiple-types in the loop.
1629 In this case, for some data-references in the loop the VF does not represent
1630 the number of elements that fit in the vector. Therefore, instead of VF we
1631 use TYPE_VECTOR_SUBPARTS. */
1632
1633 static tree
1634 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1635 basic_block bb, int *bound)
1636 {
1637 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1638 tree var;
1639 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1640 gimple_seq stmts = NULL, new_stmts = NULL;
1641 tree iters, iters_name;
1642 stmt_vec_info stmt_info = dr_info->stmt;
1643 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1644 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1645
1646 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1647 {
1648 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_NOTE, vect_location,
1652 "known peeling = %d.\n", npeel);
1653
1654 iters = build_int_cst (niters_type, npeel);
1655 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1656 }
1657 else
1658 {
1659 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660 tree type = TREE_TYPE (misalign_in_elems);
1661 HOST_WIDE_INT elem_size
1662 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1663 /* We only do prolog peeling if the target alignment is known at compile
1664 time. */
1665 poly_uint64 align_in_elems =
1666 exact_div (target_align, elem_size);
1667 tree align_in_elems_minus_1 =
1668 build_int_cst (type, align_in_elems - 1);
1669 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1670
1671 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1672 & (align_in_elems - 1)). */
1673 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1674 size_zero_node) < 0;
1675 if (negative)
1676 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677 align_in_elems_tree);
1678 else
1679 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680 misalign_in_elems);
1681 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1682 iters = fold_convert (niters_type, iters);
1683 unsigned HOST_WIDE_INT align_in_elems_c;
1684 if (align_in_elems.is_constant (&align_in_elems_c))
1685 *bound = align_in_elems_c - 1;
1686 else
1687 *bound = -1;
1688 }
1689
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_NOTE, vect_location,
1692 "niters for prolog loop: %T\n", iters);
1693
1694 var = create_tmp_var (niters_type, "prolog_loop_niters");
1695 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1696
1697 if (new_stmts)
1698 gimple_seq_add_seq (&stmts, new_stmts);
1699 if (stmts)
1700 {
1701 gcc_assert (single_succ_p (bb));
1702 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1703 if (gsi_end_p (gsi))
1704 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1705 else
1706 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1707 }
1708 return iters_name;
1709 }
1710
1711
1712 /* Function vect_update_init_of_dr
1713
1714 If CODE is PLUS, the vector loop starts NITERS iterations after the
1715 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716 iterations before the scalar one (using masking to skip inactive
1717 elements). This function updates the information recorded in DR to
1718 account for the difference. Specifically, it updates the OFFSET
1719 field of DR_INFO. */
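/* For example (purely illustrative): with NITERS = 3 and DR_STEP = 8 bytes,
   a PLUS code adds 3 * 8 = 24 bytes to DR_INFO's offset, while a MINUS code
   subtracts 24 bytes; the data reference itself is left untouched and only
   the recorded offset changes. */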
1720
1721 static void
1722 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1723 {
1724 struct data_reference *dr = dr_info->dr;
1725 tree offset = dr_info->offset;
1726 if (!offset)
1727 offset = build_zero_cst (sizetype);
1728
1729 niters = fold_build2 (MULT_EXPR, sizetype,
1730 fold_convert (sizetype, niters),
1731 fold_convert (sizetype, DR_STEP (dr)));
1732 offset = fold_build2 (code, sizetype,
1733 fold_convert (sizetype, offset), niters);
1734 dr_info->offset = offset;
1735 }
1736
1737
1738 /* Function vect_update_inits_of_drs
1739
1740 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1741 CODE and NITERS are as for vect_update_inits_of_dr. */
1742
1743 void
1744 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1745 tree_code code)
1746 {
1747 unsigned int i;
1748 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1749 struct data_reference *dr;
1750
1751 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1752
1753 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1754 here, but since we might use these niters to update the epilogue's niters
1755 and data references we can't insert them here as this definition might not
1756 always dominate its uses. */
1757 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1758 niters = fold_convert (sizetype, niters);
1759
1760 FOR_EACH_VEC_ELT (datarefs, i, dr)
1761 {
1762 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1763 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1764 vect_update_init_of_dr (dr_info, niters, code);
1765 }
1766 }
1767
1768 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1769 by masking. This involves calculating the number of iterations to
1770 be peeled and then aligning all memory references appropriately. */
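/* Hypothetical illustration: with VF = 8, a target alignment of one vector
   and LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3, misalign_in_elems below is
   8 - 3 = 5, so LOOP_VINFO_MASK_SKIP_NITERS becomes 5, every data reference
   is moved back by 5 iterations' worth of bytes, and the first 5 lanes of
   the first vector iteration are left inactive by the masks. */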
1771
1772 void
1773 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1774 {
1775 tree misalign_in_elems;
1776 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1777
1778 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1779
1780 /* From the information recorded in LOOP_VINFO get the number of iterations
1781 that need to be skipped via masking. */
1782 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1783 {
1784 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1785 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1786 misalign_in_elems = build_int_cst (type, misalign);
1787 }
1788 else
1789 {
1790 gimple_seq seq1 = NULL, seq2 = NULL;
1791 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1792 misalign_in_elems = fold_convert (type, misalign_in_elems);
1793 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1794 &seq2, true, NULL_TREE);
1795 gimple_seq_add_seq (&seq1, seq2);
1796 if (seq1)
1797 {
1798 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1799 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1800 gcc_assert (!new_bb);
1801 }
1802 }
1803
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_NOTE, vect_location,
1806 "misalignment for fully-masked loop: %T\n",
1807 misalign_in_elems);
1808
1809 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1810
1811 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1812 }
1813
1814 /* This function builds ni_name = number of iterations. Statements
1815 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1816 it to TRUE if a new ssa_var is generated. */
1817
1818 tree
1819 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1820 {
1821 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1822 if (TREE_CODE (ni) == INTEGER_CST)
1823 return ni;
1824 else
1825 {
1826 tree ni_name, var;
1827 gimple_seq stmts = NULL;
1828 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1829
1830 var = create_tmp_var (TREE_TYPE (ni), "niters");
1831 ni_name = force_gimple_operand (ni, &stmts, false, var);
1832 if (stmts)
1833 {
1834 gsi_insert_seq_on_edge_immediate (pe, stmts);
1835 if (new_var_p != NULL)
1836 *new_var_p = true;
1837 }
1838
1839 return ni_name;
1840 }
1841 }
1842
1843 /* Calculate the number of iterations above which the vectorized loop will be
1844 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1845 of prolog loop. If it's integer const, the integer number is also passed
1846 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1847 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1848 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1849 threshold below which the scalar (rather than vectorized) loop will be
1850 executed. This function stores the upper bound (inclusive) of the result
1851 in BOUND_SCALAR. */
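/* A possible example (assumed values): with an unknown NITERS_PROLOG whose
   upper bound BOUND_PROLOG is 3, BOUND_EPILOG = 4 and a profitability
   threshold TH of 10, the maximum scalar niters is 10 - 1 = 9; since
   9 >= 3 + 4 the function records BOUND_SCALAR = 9 and simply returns the
   constant 9. */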
1852
1853 static tree
1854 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1855 int bound_prolog, poly_int64 bound_epilog, int th,
1856 poly_uint64 *bound_scalar,
1857 bool check_profitability)
1858 {
1859 tree type = TREE_TYPE (niters_prolog);
1860 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1861 build_int_cst (type, bound_epilog));
1862
1863 *bound_scalar = bound_prolog + bound_epilog;
1864 if (check_profitability)
1865 {
1866 /* TH indicates the minimum niters of the vectorized loop, while we
1867 compute the maximum niters of the scalar loop. */
1868 th--;
1869 /* Peeling a constant number of times. */
1870 if (int_niters_prolog >= 0)
1871 {
1872 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1873 return build_int_cst (type, *bound_scalar);
1874 }
1875 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1876 and BOUND_EPILOG are inclusive upper bounds. */
1877 if (known_ge (th, bound_prolog + bound_epilog))
1878 {
1879 *bound_scalar = th;
1880 return build_int_cst (type, th);
1881 }
1882 /* Need to do runtime comparison. */
1883 else if (maybe_gt (th, bound_epilog))
1884 {
1885 *bound_scalar = upper_bound (*bound_scalar, th);
1886 return fold_build2 (MAX_EXPR, type,
1887 build_int_cst (type, th), niters);
1888 }
1889 }
1890 return niters;
1891 }
1892
1893 /* NITERS is the number of times that the original scalar loop executes
1894 after peeling. Work out the maximum number of iterations N that can
1895 be handled by the vectorized form of the loop and then either:
1896
1897 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1898
1899 niters_vector = N
1900
1901 b) set *STEP_VECTOR_PTR to one and generate:
1902
1903 niters_vector = N / vf
1904
1905 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1906 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1907 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
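/* Sketch of case (b), with assumed numbers: for NITERS = 17 and a constant
   VF of 4 (not fully masked), niters_vector is 17 >> 2 = 4 when NITERS is
   known not to overflow, or ((17 - 4) >> 2) + 1 = 4 otherwise, and
   *STEP_VECTOR_PTR is 1; the vector loop thus covers 16 scalar iterations
   and leaves 1 for the epilogue. */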
1908
1909 void
1910 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1911 tree *niters_vector_ptr, tree *step_vector_ptr,
1912 bool niters_no_overflow)
1913 {
1914 tree ni_minus_gap, var;
1915 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1916 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1917 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1918 tree log_vf = NULL_TREE;
1919
1920 /* If epilogue loop is required because of data accesses with gaps, we
1921 subtract one iteration from the total number of iterations here for
1922 correct calculation of RATIO. */
1923 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1924 {
1925 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1926 build_one_cst (type));
1927 if (!is_gimple_val (ni_minus_gap))
1928 {
1929 var = create_tmp_var (type, "ni_gap");
1930 gimple *stmts = NULL;
1931 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1932 true, var);
1933 gsi_insert_seq_on_edge_immediate (pe, stmts);
1934 }
1935 }
1936 else
1937 ni_minus_gap = niters;
1938
1939 unsigned HOST_WIDE_INT const_vf;
1940 if (vf.is_constant (&const_vf)
1941 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1942 {
1943 /* Create: niters >> log2(vf) */
1944 /* If it's known that niters == number of latch executions + 1 doesn't
1945 overflow, we can generate niters >> log2(vf); otherwise we generate
1946 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1947 will be at least one. */
1948 log_vf = build_int_cst (type, exact_log2 (const_vf));
1949 if (niters_no_overflow)
1950 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1951 else
1952 niters_vector
1953 = fold_build2 (PLUS_EXPR, type,
1954 fold_build2 (RSHIFT_EXPR, type,
1955 fold_build2 (MINUS_EXPR, type,
1956 ni_minus_gap,
1957 build_int_cst (type, vf)),
1958 log_vf),
1959 build_int_cst (type, 1));
1960 step_vector = build_one_cst (type);
1961 }
1962 else
1963 {
1964 niters_vector = ni_minus_gap;
1965 step_vector = build_int_cst (type, vf);
1966 }
1967
1968 if (!is_gimple_val (niters_vector))
1969 {
1970 var = create_tmp_var (type, "bnd");
1971 gimple_seq stmts = NULL;
1972 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1973 gsi_insert_seq_on_edge_immediate (pe, stmts);
1974 /* The peeling algorithm guarantees that the vector loop bound is at least
1975 ONE, so we set range information to make the niters analyzer's life easier. */
1976 if (stmts != NULL && log_vf)
1977 set_range_info (niters_vector, VR_RANGE,
1978 wi::to_wide (build_int_cst (type, 1)),
1979 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1980 TYPE_MAX_VALUE (type),
1981 log_vf)));
1982 }
1983 *niters_vector_ptr = niters_vector;
1984 *step_vector_ptr = step_vector;
1985
1986 return;
1987 }
1988
1989 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1990 loop specified by LOOP_VINFO after vectorization, compute the number
1991 of iterations before vectorization (niters_vector * vf) and store it
1992 to NITERS_VECTOR_MULT_VF_PTR. */
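/* For instance (illustrative only): with niters_vector = 4 and a constant
   VF of 8 this computes 4 << 3 = 32, the number of scalar iterations the
   vector loop handled. */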
1993
1994 static void
1995 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1996 tree niters_vector,
1997 tree *niters_vector_mult_vf_ptr)
1998 {
1999 /* We should be using a step_vector of VF if VF is variable. */
2000 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2002 tree type = TREE_TYPE (niters_vector);
2003 tree log_vf = build_int_cst (type, exact_log2 (vf));
2004 basic_block exit_bb = single_exit (loop)->dest;
2005
2006 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2007 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2008 niters_vector, log_vf);
2009 if (!is_gimple_val (niters_vector_mult_vf))
2010 {
2011 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2012 gimple_seq stmts = NULL;
2013 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2014 &stmts, true, var);
2015 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2016 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2017 }
2018 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2019 }
2020
2021 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2022 this function searches for the corresponding lcssa phi node in exit
2023 bb of LOOP. If it is found, return the phi result; otherwise return
2024 NULL. */
2025
2026 static tree
2027 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2028 gphi *lcssa_phi)
2029 {
2030 gphi_iterator gsi;
2031 edge e = single_exit (loop);
2032
2033 gcc_assert (single_pred_p (e->dest));
2034 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2035 {
2036 gphi *phi = gsi.phi ();
2037 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2038 PHI_ARG_DEF (lcssa_phi, 0), 0))
2039 return PHI_RESULT (phi);
2040 }
2041 return NULL_TREE;
2042 }
2043
2044 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2045 from SECOND/FIRST and puts it at the original loop's preheader/exit
2046 edge, the two loops are arranged as below:
2047
2048 preheader_a:
2049 first_loop:
2050 header_a:
2051 i_1 = PHI<i_0, i_2>;
2052 ...
2053 i_2 = i_1 + 1;
2054 if (cond_a)
2055 goto latch_a;
2056 else
2057 goto between_bb;
2058 latch_a:
2059 goto header_a;
2060
2061 between_bb:
2062 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2063
2064 second_loop:
2065 header_b:
2066 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2067 or with i_2 if no LCSSA phi is created
2068 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2069 ...
2070 i_4 = i_3 + 1;
2071 if (cond_b)
2072 goto latch_b;
2073 else
2074 goto exit_bb;
2075 latch_b:
2076 goto header_b;
2077
2078 exit_bb:
2079
2080 This function creates loop closed SSA for the first loop and updates the
2081 second loop's PHI nodes by replacing the argument on the incoming edge with
2082 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2083 is false, loop closed ssa phis will only be created for non-iv phis for
2084 the first loop.
2085
2086 This function assumes exit bb of the first loop is preheader bb of the
2087 second loop, i.e., between_bb in the example code. With PHIs updated,
2088 the second loop will execute the remaining iterations of the first. */
2089
2090 static void
2091 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2092 class loop *first, class loop *second,
2093 bool create_lcssa_for_iv_phis)
2094 {
2095 gphi_iterator gsi_update, gsi_orig;
2096 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2097
2098 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2099 edge second_preheader_e = loop_preheader_edge (second);
2100 basic_block between_bb = single_exit (first)->dest;
2101
2102 gcc_assert (between_bb == second_preheader_e->src);
2103 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2104 /* Either the first loop or the second is the loop to be vectorized. */
2105 gcc_assert (loop == first || loop == second);
2106
2107 for (gsi_orig = gsi_start_phis (first->header),
2108 gsi_update = gsi_start_phis (second->header);
2109 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2110 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2111 {
2112 gphi *orig_phi = gsi_orig.phi ();
2113 gphi *update_phi = gsi_update.phi ();
2114
2115 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2116 /* Generate lcssa PHI node for the first loop. */
2117 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2118 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2119 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2120 {
2121 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2122 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2123 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2124 arg = new_res;
2125 }
2126
2127 /* Update PHI node in the second loop by replacing arg on the loop's
2128 incoming edge. */
2129 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2130 }
2131
2132 /* For epilogue peeling we have to make sure to copy all LC PHIs
2133 for correct vectorization of live stmts. */
2134 if (loop == first)
2135 {
2136 basic_block orig_exit = single_exit (second)->dest;
2137 for (gsi_orig = gsi_start_phis (orig_exit);
2138 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2139 {
2140 gphi *orig_phi = gsi_orig.phi ();
2141 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2142 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2143 continue;
2144
2145 /* Already created in the above loop. */
2146 if (find_guard_arg (first, second, orig_phi))
2147 continue;
2148
2149 tree new_res = copy_ssa_name (orig_arg);
2150 gphi *lcphi = create_phi_node (new_res, between_bb);
2151 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2152 }
2153 }
2154 }
2155
2156 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2157 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2158 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2159 appear like below:
2160
2161 guard_bb:
2162 if (cond)
2163 goto merge_bb;
2164 else
2165 goto skip_loop;
2166
2167 skip_loop:
2168 header_a:
2169 i_1 = PHI<i_0, i_2>;
2170 ...
2171 i_2 = i_1 + 1;
2172 if (cond_a)
2173 goto latch_a;
2174 else
2175 goto exit_a;
2176 latch_a:
2177 goto header_a;
2178
2179 exit_a:
2180 i_5 = PHI<i_2>;
2181
2182 merge_bb:
2183 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2184
2185 update_loop:
2186 header_b:
2187 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2188 ...
2189 i_4 = i_3 + 1;
2190 if (cond_b)
2191 goto latch_b;
2192 else
2193 goto exit_bb;
2194 latch_b:
2195 goto header_b;
2196
2197 exit_bb:
2198
2199 This function creates PHI nodes at merge_bb and replaces the use of i_5
2200 in the update_loop's PHI node with the result of the new PHI node. */
2201
2202 static void
2203 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2204 class loop *update_loop,
2205 edge guard_edge, edge merge_edge)
2206 {
2207 location_t merge_loc, guard_loc;
2208 edge orig_e = loop_preheader_edge (skip_loop);
2209 edge update_e = loop_preheader_edge (update_loop);
2210 gphi_iterator gsi_orig, gsi_update;
2211
2212 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2213 gsi_update = gsi_start_phis (update_loop->header));
2214 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2215 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2216 {
2217 gphi *orig_phi = gsi_orig.phi ();
2218 gphi *update_phi = gsi_update.phi ();
2219
2220 /* Generate new phi node at merge bb of the guard. */
2221 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2222 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2223
2224 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2225 args in NEW_PHI for these edges. */
2226 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2227 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2228 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2229 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2230 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2231 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2232
2233 /* Update phi in UPDATE_PHI. */
2234 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2235 }
2236 }
2237
2238 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2239 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2240 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2241 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2242 The CFG looks like:
2243
2244 loop:
2245 header_a:
2246 i_1 = PHI<i_0, i_2>;
2247 ...
2248 i_2 = i_1 + 1;
2249 if (cond_a)
2250 goto latch_a;
2251 else
2252 goto exit_a;
2253 latch_a:
2254 goto header_a;
2255
2256 exit_a:
2257
2258 guard_bb:
2259 if (cond)
2260 goto merge_bb;
2261 else
2262 goto epilog_loop;
2263
2264 ;; fall_through_bb
2265
2266 epilog_loop:
2267 header_b:
2268 i_3 = PHI<i_2, i_4>;
2269 ...
2270 i_4 = i_3 + 1;
2271 if (cond_b)
2272 goto latch_b;
2273 else
2274 goto merge_bb;
2275 latch_b:
2276 goto header_b;
2277
2278 merge_bb:
2279 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2280
2281 exit_bb:
2282 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2283
2284 For each name used outside EPILOG (i.e., for each name that has an lcssa
2285 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2286 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2287 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2288 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2289 in exit_bb will also be updated. */
2290
2291 static void
2292 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2293 edge guard_edge, edge merge_edge)
2294 {
2295 gphi_iterator gsi;
2296 basic_block merge_bb = guard_edge->dest;
2297
2298 gcc_assert (single_succ_p (merge_bb));
2299 edge e = single_succ_edge (merge_bb);
2300 basic_block exit_bb = e->dest;
2301 gcc_assert (single_pred_p (exit_bb));
2302 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2303
2304 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2305 {
2306 gphi *update_phi = gsi.phi ();
2307 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2308
2309 tree merge_arg = NULL_TREE;
2310
2311 /* If the old argument is a SSA_NAME use its current_def. */
2312 if (TREE_CODE (old_arg) == SSA_NAME)
2313 merge_arg = get_current_def (old_arg);
2314 /* If it's a constant or doesn't have a current_def, just use the old
2315 argument. */
2316 if (!merge_arg)
2317 merge_arg = old_arg;
2318
2319 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2320 /* If the var is live after loop but not a reduction, we simply
2321 use the old arg. */
2322 if (!guard_arg)
2323 guard_arg = old_arg;
2324
2325 /* Create new phi node in MERGE_BB: */
2326 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2327 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2328
2329 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2330 the two PHI args in merge_phi for these edges. */
2331 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2332 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2333
2334 /* Update the original phi in exit_bb. */
2335 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2336 }
2337 }
2338
2339 /* EPILOG loop is duplicated from the original loop for vectorizing,
2340 the arg of its loop closed ssa PHI needs to be updated. */
2341
2342 static void
2343 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2344 {
2345 gphi_iterator gsi;
2346 basic_block exit_bb = single_exit (epilog)->dest;
2347
2348 gcc_assert (single_pred_p (exit_bb));
2349 edge e = EDGE_PRED (exit_bb, 0);
2350 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2351 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2352 }
2353
2354 /* Function vect_do_peeling.
2355
2356 Input:
2357 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2358
2359 preheader:
2360 LOOP:
2361 header_bb:
2362 loop_body
2363 if (exit_loop_cond) goto exit_bb
2364 else goto header_bb
2365 exit_bb:
2366
2367 - NITERS: The number of iterations of the loop.
2368 - NITERSM1: The number of iterations of the loop's latch.
2369 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2370 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2371 CHECK_PROFITABILITY is true.
2372 Output:
2373 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2374 iterate after vectorization; see vect_set_loop_condition for details.
2375 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2376 should be set to the number of scalar iterations handled by the
2377 vector loop. The SSA name is only used on exit from the loop.
2378
2379 This function peels prolog and epilog from the loop, adds guards skipping
2380 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2381 would look like:
2382
2383 guard_bb_1:
2384 if (prefer_scalar_loop) goto merge_bb_1
2385 else goto guard_bb_2
2386
2387 guard_bb_2:
2388 if (skip_prolog) goto merge_bb_2
2389 else goto prolog_preheader
2390
2391 prolog_preheader:
2392 PROLOG:
2393 prolog_header_bb:
2394 prolog_body
2395 if (exit_prolog_cond) goto prolog_exit_bb
2396 else goto prolog_header_bb
2397 prolog_exit_bb:
2398
2399 merge_bb_2:
2400
2401 vector_preheader:
2402 VECTOR LOOP:
2403 vector_header_bb:
2404 vector_body
2405 if (exit_vector_cond) goto vector_exit_bb
2406 else goto vector_header_bb
2407 vector_exit_bb:
2408
2409 guard_bb_3:
2410 if (skip_epilog) goto merge_bb_3
2411 else goto epilog_preheader
2412
2413 merge_bb_1:
2414
2415 epilog_preheader:
2416 EPILOG:
2417 epilog_header_bb:
2418 epilog_body
2419 if (exit_epilog_cond) goto merge_bb_3
2420 else goto epilog_header_bb
2421
2422 merge_bb_3:
2423
2424 Note this function peels the prolog and epilog, and adds the guards,
2425 only if necessary.
2426 This function returns the epilogue loop if a decision was made to vectorize
2427 it, otherwise NULL.
2428
2429 The analysis resulting in this epilogue loop's loop_vec_info was performed
2430 in the same vect_analyze_loop call as the main loop's. At that time
2431 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2432 vectorization factors than the main loop. This list is stored in the main
2433 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2434 vectorize the epilogue loop for a lower vectorization factor, the
2435 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2436 updated and linked to the epilogue loop. This is later used to vectorize
2437 the epilogue. The reason the loop_vec_info needs updating is that it was
2438 constructed based on the original main loop, and the epilogue loop is a
2439 copy of this loop, so all links pointing to statements in the original loop
2440 need updating. Furthermore, these loop_vec_infos share the
2441 data_reference's records, which will also need to be updated.
2442
2443 TODO: Guard for prefer_scalar_loop should be emitted along with
2444 versioning conditions if loop versioning is needed. */
2445
2446
2447 class loop *
2448 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2449 tree *niters_vector, tree *step_vector,
2450 tree *niters_vector_mult_vf_var, int th,
2451 bool check_profitability, bool niters_no_overflow,
2452 tree *advance)
2453 {
2454 edge e, guard_e;
2455 tree type = TREE_TYPE (niters), guard_cond;
2456 basic_block guard_bb, guard_to;
2457 profile_probability prob_prolog, prob_vector, prob_epilog;
2458 int estimated_vf;
2459 int prolog_peeling = 0;
2460 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2461 /* We currently do not support prolog peeling if the target alignment is not
2462 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2463 target alignment being constant. */
2464 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2465 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2466 return NULL;
2467
2468 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2469 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2470
2471 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2472 poly_uint64 bound_epilog = 0;
2473 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2474 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2475 bound_epilog += vf - 1;
2476 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2477 bound_epilog += 1;
2478 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2479 poly_uint64 bound_scalar = bound_epilog;
2480
2481 if (!prolog_peeling && !epilog_peeling)
2482 return NULL;
2483
2484 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2485 estimated_vf = vect_vf_for_cost (loop_vinfo);
2486 if (estimated_vf == 2)
2487 estimated_vf = 3;
2488 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2489 .apply_scale (estimated_vf - 1, estimated_vf);
2490
2491 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2492 class loop *first_loop = loop;
2493 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2494
2495 /* We might have a queued need to update virtual SSA form. As we
2496 delete the update SSA machinery below after doing a regular
2497 incremental SSA update during loop copying, make sure we don't
2498 lose that fact.
2499 ??? Needing to update virtual SSA form by renaming is unfortunate
2500 but not all of the vectorizer code inserting new loads / stores
2501 properly assigns virtual operands to those statements. */
2502 update_ssa (TODO_update_ssa_only_virtuals);
2503
2504 create_lcssa_for_virtual_phi (loop);
2505
2506 /* If we're vectorizing an epilogue loop, the update_ssa above will
2507 have ensured that the virtual operand is in SSA form throughout the
2508 vectorized main loop. Normally it is possible to trace the updated
2509 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2510 back to scalar-stmt vuses, meaning that the effect of the SSA update
2511 remains local to the main loop. However, there are rare cases in
2512 which the vectorized loop has vdefs even when the original scalar
2513 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2514 introduces clobbers of the temporary vector array, which in turn
2515 needs new vdefs. If the scalar loop doesn't write to memory, these
2516 new vdefs will be the only ones in the vector loop.
2517
2518 In that case, update_ssa will have added a new virtual phi to the
2519 main loop, which previously didn't need one. Ensure that we (locally)
2520 maintain LCSSA form for the virtual operand, just as we would have
2521 done if the virtual phi had existed from the outset. This makes it
2522 easier to duplicate the scalar epilogue loop below. */
2523 tree vop_to_rename = NULL_TREE;
2524 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2525 {
2526 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2527 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2528 }
2529
2530 if (MAY_HAVE_DEBUG_BIND_STMTS)
2531 {
2532 gcc_assert (!adjust_vec.exists ());
2533 adjust_vec.create (32);
2534 }
2535 initialize_original_copy_tables ();
2536
2537 /* Record the anchor bb at which the guard should be placed if the scalar
2538 loop might be preferred. */
2539 basic_block anchor = loop_preheader_edge (loop)->src;
2540
2541 /* Generate the number of iterations for the prolog loop. We do this here
2542 so that we can also get the upper bound on the number of iterations. */
2543 tree niters_prolog;
2544 int bound_prolog = 0;
2545 if (prolog_peeling)
2546 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2547 &bound_prolog);
2548 else
2549 niters_prolog = build_int_cst (type, 0);
2550
2551 loop_vec_info epilogue_vinfo = NULL;
2552 if (vect_epilogues)
2553 {
2554 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2555 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2556 }
2557
2558 tree niters_vector_mult_vf = NULL_TREE;
2559 /* Save NITERS before the loop, as it may be changed by the prologue. */
2560 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2561 edge update_e = NULL, skip_e = NULL;
2562 unsigned int lowest_vf = constant_lower_bound (vf);
2563 /* If we know the number of scalar iterations for the main loop we should
2564 check whether after the main loop there are enough iterations left over
2565 for the epilogue. */
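/* For example (assumed figures): with NITERS = 100, prolog_peeling = 3,
   no peeling for gaps and a main-loop VF of 16, eiters below is
   (100 - 3) % 16 = 1; an epilogue whose vector mode is a quarter the size
   of the main loop's (ratio 4) would need at least 16 / 4 = 4 iterations,
   so that epilogue_vinfo is discarded. */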
2566 if (vect_epilogues
2567 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2568 && prolog_peeling >= 0
2569 && known_eq (vf, lowest_vf))
2570 {
2571 unsigned HOST_WIDE_INT eiters
2572 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2573 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2574
2575 eiters -= prolog_peeling;
2576 eiters
2577 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2578
2579 unsigned int ratio;
2580 unsigned int epilogue_gaps
2581 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2582 while (!(constant_multiple_p
2583 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2584 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2585 && eiters >= lowest_vf / ratio + epilogue_gaps))
2586 {
2587 delete epilogue_vinfo;
2588 epilogue_vinfo = NULL;
2589 if (loop_vinfo->epilogue_vinfos.length () == 0)
2590 {
2591 vect_epilogues = false;
2592 break;
2593 }
2594 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2595 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2596 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2597 }
2598 }
2599 /* Prolog loop may be skipped. */
2600 bool skip_prolog = (prolog_peeling != 0);
2601 /* Skip this loop and go straight to the epilog when there are not
2602 enough iterations to enter the vectorized loop. In that case we add
2603 a runtime check on NITERS that bypasses the vector loop when NITERS
2604 is too small. If we know the number of scalar iterations at compile
2605 time, we only add this check when that number may be smaller than the
2606 number of iterations required to enter the vector loop; for this
2607 comparison we use the upper bounds on the prolog and epilog peeling.
2608 When we don't know the number of iterations and don't require
2609 versioning, the skip is not necessary, because in that case we have
2610 already asserted that there are enough scalar iterations to enter
2611 the main loop. When we are versioning, we only add such a skip if
2612 we have chosen to vectorize the epilogue. */
2613 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2614 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2615 bound_prolog + bound_epilog)
2616 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2617 || vect_epilogues));
2618 /* Epilog loop must be executed if the number of iterations for epilog
2619 loop is known at compile time, otherwise we need to add a check at
2620 the end of vector loop and skip to the end of epilog loop. */
2621 bool skip_epilog = (prolog_peeling < 0
2622 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2623 || !vf.is_constant ());
2624 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2625 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2626 skip_epilog = false;
2627
2628 if (skip_vector)
2629 {
2630 split_edge (loop_preheader_edge (loop));
2631
2632 /* Due to the order in which we peel prolog and epilog, we first
2633 propagate probability to the whole loop. The purpose is to
2634 avoid adjusting probabilities of both prolog and vector loops
2635 separately. Note in this case, the probability of epilog loop
2636 needs to be scaled back later. */
2637 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2638 if (prob_vector.initialized_p ())
2639 {
2640 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2641 scale_loop_profile (loop, prob_vector, 0);
2642 }
2643 }
2644
2645 dump_user_location_t loop_loc = find_loop_location (loop);
2646 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2647 if (vect_epilogues)
2648 /* Make sure to set the epilogue's scalar loop, such that we can
2649 use the original scalar loop as the remaining epilogue if necessary. */
2650 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2651 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2652
2653 if (prolog_peeling)
2654 {
2655 e = loop_preheader_edge (loop);
2656 if (!slpeel_can_duplicate_loop_p (loop, e))
2657 {
2658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2659 "loop can't be duplicated to preheader edge.\n");
2660 gcc_unreachable ();
2661 }
2662 /* Peel prolog and put it on preheader edge of loop. */
2663 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2664 if (!prolog)
2665 {
2666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2667 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2668 gcc_unreachable ();
2669 }
2670 prolog->force_vectorize = false;
2671 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2672 first_loop = prolog;
2673 reset_original_copy_tables ();
2674
2675 /* Update the number of iterations for prolog loop. */
2676 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2677 vect_set_loop_condition (prolog, NULL, niters_prolog,
2678 step_prolog, NULL_TREE, false);
2679
2680 /* Skip the prolog loop. */
2681 if (skip_prolog)
2682 {
2683 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2684 niters_prolog, build_int_cst (type, 0));
2685 guard_bb = loop_preheader_edge (prolog)->src;
2686 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2687 guard_to = split_edge (loop_preheader_edge (loop));
2688 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2689 guard_to, guard_bb,
2690 prob_prolog.invert (),
2691 irred_flag);
2692 e = EDGE_PRED (guard_to, 0);
2693 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2694 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2695
2696 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2697 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2698 }
2699
2700 /* Update init address of DRs. */
2701 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2702 /* Update niters for vector loop. */
2703 LOOP_VINFO_NITERS (loop_vinfo)
2704 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2705 LOOP_VINFO_NITERSM1 (loop_vinfo)
2706 = fold_build2 (MINUS_EXPR, type,
2707 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2708 bool new_var_p = false;
2709 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2710 /* It's guaranteed that vector loop bound before vectorization is at
2711 least VF, so set range information for newly generated var. */
2712 if (new_var_p)
2713 set_range_info (niters, VR_RANGE,
2714 wi::to_wide (build_int_cst (type, vf)),
2715 wi::to_wide (TYPE_MAX_VALUE (type)));
2716
2717 /* Prolog iterates at most bound_prolog times, latch iterates at
2718 most bound_prolog - 1 times. */
2719 record_niter_bound (prolog, bound_prolog - 1, false, true);
2720 delete_update_ssa ();
2721 adjust_vec_debug_stmts ();
2722 scev_reset ();
2723 }
2724
2725 if (epilog_peeling)
2726 {
2727 e = single_exit (loop);
2728 if (!slpeel_can_duplicate_loop_p (loop, e))
2729 {
2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2731 "loop can't be duplicated to exit edge.\n");
2732 gcc_unreachable ();
2733 }
2734 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2735 said epilog then we should use a copy of the main loop as a starting
2736 point. This loop may have already had some preliminary transformations
2737 to allow for more optimal vectorization, for example if-conversion.
2738 If we are not vectorizing the epilog then we should use the scalar loop
2739 as the transformations mentioned above make less or no sense when not
2740 vectorizing. */
2741 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2742 if (vop_to_rename)
2743 {
2744 /* Vectorizing the main loop can sometimes introduce a vdef to
2745 a loop that previously didn't have one; see the comment above
2746 the definition of VOP_TO_RENAME for details. The definition
2747 D that holds on E will then be different from the definition
2748 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2749 rename VOP_TO_RENAME to D when copying the loop.
2750
2751 The virtual operand is in LCSSA form for the main loop,
2752 and no stmt between the main loop and E needs a vdef,
2753 so we know that D is provided by a phi rather than by a
2754 vdef on a normal gimple stmt. */
2755 basic_block vdef_bb = e->src;
2756 gphi *vphi;
2757 while (!(vphi = get_virtual_phi (vdef_bb)))
2758 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2759 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2760 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2761 }
2762 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2763 if (!epilog)
2764 {
2765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2766 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2767 gcc_unreachable ();
2768 }
2769 epilog->force_vectorize = false;
2770 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2771
2772 /* The scalar version of the loop may be preferred. In this case, add
2773 a guard and skip to the epilog. Note this only happens when the
2774 number of iterations of the loop is unknown at compile time;
2775 otherwise the loop won't be vectorized. */
2776 if (skip_vector)
2777 {
2778 /* Additional epilogue iteration is peeled if gap exists. */
2779 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2780 bound_prolog, bound_epilog,
2781 th, &bound_scalar,
2782 check_profitability);
2783 /* Build guard against NITERSM1 since NITERS may overflow. */
2784 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2785 guard_bb = anchor;
2786 guard_to = split_edge (loop_preheader_edge (epilog));
2787 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2788 guard_to, guard_bb,
2789 prob_vector.invert (),
2790 irred_flag);
2791 skip_e = guard_e;
2792 e = EDGE_PRED (guard_to, 0);
2793 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2794 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2795
2796 /* Simply propagate profile info from guard_bb to guard_to which is
2797 a merge point of control flow. */
2798 guard_to->count = guard_bb->count;
2799
2800 /* Scale probability of epilog loop back.
2801 FIXME: We should avoid scaling down and back up. Profile may
2802 get lost if we scale down to 0. */
2803 basic_block *bbs = get_loop_body (epilog);
2804 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2805 bbs[i]->count = bbs[i]->count.apply_scale
2806 (bbs[i]->count,
2807 bbs[i]->count.apply_probability
2808 (prob_vector));
2809 free (bbs);
2810 }
2811
2812 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2813 /* If the loop is peeled a non-zero constant number of times, niters now
2814 refers to orig_niters - prolog_peeling, so it won't overflow even if
2815 orig_niters overflows. */
2816 niters_no_overflow |= (prolog_peeling > 0);
2817 vect_gen_vector_loop_niters (loop_vinfo, niters,
2818 niters_vector, step_vector,
2819 niters_no_overflow);
2820 if (!integer_onep (*step_vector))
2821 {
2822 /* On exit from the loop we will have an easy way of calculating
2823 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2824 until then. */
2825 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2826 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2827 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2828 }
2829 else
2830 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2831 &niters_vector_mult_vf);
2832 /* Update IVs of original loop as if they were advanced by
2833 niters_vector_mult_vf steps. */
2834 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2835 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2836 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2837 update_e);
2838
2839 if (skip_epilog)
2840 {
2841 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2842 niters, niters_vector_mult_vf);
2843 guard_bb = single_exit (loop)->dest;
2844 guard_to = split_edge (single_exit (epilog));
2845 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2846 skip_vector ? anchor : guard_bb,
2847 prob_epilog.invert (),
2848 irred_flag);
2849 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2850 single_exit (epilog));
2851 /* Only need to handle basic block before epilog loop if it's not
2852 the guard_bb, which is the case when skip_vector is true. */
2853 if (guard_bb != bb_before_epilog)
2854 {
2855 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2856
2857 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2858 }
2859 scale_loop_profile (epilog, prob_epilog, 0);
2860 }
2861 else
2862 slpeel_update_phi_nodes_for_lcssa (epilog);
2863
2864 unsigned HOST_WIDE_INT bound;
2865 if (bound_scalar.is_constant (&bound))
2866 {
2867 gcc_assert (bound != 0);
2868 /* -1 to convert loop iterations to latch iterations. */
2869 record_niter_bound (epilog, bound - 1, false, true);
2870 }
2871
2872 delete_update_ssa ();
2873 adjust_vec_debug_stmts ();
2874 scev_reset ();
2875 }
2876
2877 if (vect_epilogues)
2878 {
2879 epilog->aux = epilogue_vinfo;
2880 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2881
2882 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2883
2884 /* We now must calculate the number of iterations (NITERS) performed by
2885 the previous loop and those (EPILOGUE_NITERS) to be performed by the epilogue. */
2886 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2887 niters_prolog, niters_vector_mult_vf);
2888
2889 /* If skip_vector, the previous loop may be skipped, so we insert a
2890 phi-node to determine whether we are coming from the previous
2891 vectorized loop via the update_e edge or from the skip_vector
2892 basic block via the skip_e edge. */
2893 if (skip_vector)
2894 {
2895 gcc_assert (update_e != NULL && skip_e != NULL);
2896 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2897 update_e->dest);
2898 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2899 gimple *stmt = gimple_build_assign (new_ssa, niters);
2900 gimple_stmt_iterator gsi;
2901 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2902 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2903 {
2904 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2905 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2906 }
2907 else
2908 {
2909 gsi = gsi_last_bb (update_e->src);
2910 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2911 }
2912
2913 niters = new_ssa;
2914 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2915 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2916 UNKNOWN_LOCATION);
2917 niters = PHI_RESULT (new_phi);
2918 }
2919
2920 /* Subtract the number of iterations performed by the vectorized loop
2921 from the number of total iterations. */
2922 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2923 before_loop_niters,
2924 niters);
2925
2926 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2927 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2928 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2929 epilogue_niters,
2930 build_one_cst (TREE_TYPE (epilogue_niters)));
2931
2932 /* Set ADVANCE to the number of iterations performed by the previous
2933 loop and its prologue. */
2934 *advance = niters;
2935
2936 /* Redo the peeling for niter analysis as the NITERs and alignment
2937 may have been updated to take the main loop into account. */
2938 determine_peel_for_niter (epilogue_vinfo);
2939 }
2940
2941 adjust_vec.release ();
2942 free_original_copy_tables ();
2943
2944 return vect_epilogues ? epilog : NULL;
2945 }
2946
2947 /* Function vect_create_cond_for_niters_checks.
2948
2949 Create a conditional expression that represents the run-time checks for
2950 loop's niter. The loop is guaranteed to terminate if the run-time
2951 checks hold.
2952
2953 Input:
2954 COND_EXPR - input conditional expression. New conditions will be chained
2955 with logical AND operation. If it is NULL, the niters
2956 assumptions become the condition on their own.
2957 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2958 assumptions to be checked.
2959
2960 Output:
2961 COND_EXPR - conditional expression.
2962
2963 The returned COND_EXPR is the conditional expression to be used in the
2964 if statement that controls which version of the loop gets executed at
2965 runtime. */
2966
2967 static void
2968 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2969 {
2970 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2971
2972 if (*cond_expr)
2973 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2974 *cond_expr, part_cond_expr);
2975 else
2976 *cond_expr = part_cond_expr;
2977 }
2978
2979 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2980 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2981
2982 static void
2983 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2984 {
2985 if (*cond_expr)
2986 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2987 *cond_expr, part_cond_expr);
2988 else
2989 *cond_expr = part_cond_expr;
2990 }
2991
2992 /* Function vect_create_cond_for_align_checks.
2993
2994 Create a conditional expression that represents the alignment checks for
2995 all of the data references (array element references) whose alignment must be
2996 checked at runtime.
2997
2998 Input:
2999 COND_EXPR - input conditional expression. New conditions will be chained
3000 with logical AND operation.
3001 LOOP_VINFO - two fields of the loop information are used.
3002 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3003 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3004
3005 Output:
3006 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3007 expression.
3008 The returned value is the conditional expression to be used in the if
3009 statement that controls which version of the loop gets executed at runtime.
3010
3011 The algorithm makes two assumptions:
3012 1) The number of bytes "n" in a vector is a power of 2.
3013 2) An address "a" is aligned if a%n is zero, and this
3014 test can be done as a&(n-1) == 0. For example, for 16
3015 byte vectors the test is a&0xf == 0. */
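/* Worked illustration (addresses are invented): with two data references
   whose first-vector addresses are 0x1000 and 0x2008 and mask = 0xf,
   (0x1000 | 0x2008) & 0xf = 0x8 != 0, so at least one reference is
   misaligned and the scalar version is taken; with 0x1000 and 0x2010 the
   result is 0 and the vectorized version may run. */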
3016
3017 static void
3018 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3019 tree *cond_expr,
3020 gimple_seq *cond_expr_stmt_list)
3021 {
3022 vec<stmt_vec_info> may_misalign_stmts
3023 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3024 stmt_vec_info stmt_info;
3025 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3026 tree mask_cst;
3027 unsigned int i;
3028 tree int_ptrsize_type;
3029 char tmp_name[20];
3030 tree or_tmp_name = NULL_TREE;
3031 tree and_tmp_name;
3032 gimple *and_stmt;
3033 tree ptrsize_zero;
3034 tree part_cond_expr;
3035
3036 /* Check that mask is one less than a power of 2, i.e., mask is
3037 all zeros followed by all ones. */
3038 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3039
3040 int_ptrsize_type = signed_type_for (ptr_type_node);
3041
3042 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3043 of the first vector of the i'th data reference. */
3044
3045 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3046 {
3047 gimple_seq new_stmt_list = NULL;
3048 tree addr_base;
3049 tree addr_tmp_name;
3050 tree new_or_tmp_name;
3051 gimple *addr_stmt, *or_stmt;
3052 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3053 bool negative = tree_int_cst_compare
3054 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3055 tree offset = negative
3056 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3057
3058 /* create: addr_tmp = (int)(address_of_first_vector) */
3059 addr_base =
3060 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3061 offset);
3062 if (new_stmt_list != NULL)
3063 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3064
3065 sprintf (tmp_name, "addr2int%d", i);
3066 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3067 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3068 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3069
3070 /* The addresses are ORed together. */
3071
3072 if (or_tmp_name != NULL_TREE)
3073 {
3074 /* create: or_tmp = or_tmp | addr_tmp */
3075 sprintf (tmp_name, "orptrs%d", i);
3076 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3077 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3078 or_tmp_name, addr_tmp_name);
3079 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3080 or_tmp_name = new_or_tmp_name;
3081 }
3082 else
3083 or_tmp_name = addr_tmp_name;
3084
3085 } /* end for i */
3086
3087 mask_cst = build_int_cst (int_ptrsize_type, mask);
3088
3089 /* create: and_tmp = or_tmp & mask */
3090 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3091
3092 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3093 or_tmp_name, mask_cst);
3094 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3095
3096 /* Make and_tmp the left operand of the conditional test against zero.
3097 If and_tmp has a nonzero bit then some address is unaligned. */
3098 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3099 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3100 and_tmp_name, ptrsize_zero);
3101 chain_cond_expr (cond_expr, part_cond_expr);
3102 }
3103
3104 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3105 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3106 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3107 and this new condition are true. Treat a null *COND_EXPR as "true". */
3108
3109 static void
3110 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3111 {
3112 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3113 unsigned int i;
3114 vec_object_pair *pair;
3115 FOR_EACH_VEC_ELT (pairs, i, pair)
3116 {
3117 tree addr1 = build_fold_addr_expr (pair->first);
3118 tree addr2 = build_fold_addr_expr (pair->second);
3119 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3120 addr1, addr2);
3121 chain_cond_expr (cond_expr, part_cond_expr);
3122 }
3123 }
3124
3125 /* Create an expression that is true when all lower-bound conditions for
3126 the vectorized loop are met. Chain this condition with *COND_EXPR. */
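/* Illustration (assumed values): for a signed EXPR with min_value = 4 the
   code below emits (unsigned) EXPR + 3 >= 7, which holds for EXPR = 4 or
   EXPR = -4 but not for EXPR = 2 or EXPR = -2, so values of sufficient
   magnitude pass the check regardless of sign. */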
3127
3128 static void
3129 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3130 {
3131 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3132 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3133 {
3134 tree expr = lower_bounds[i].expr;
3135 tree type = unsigned_type_for (TREE_TYPE (expr));
3136 expr = fold_convert (type, expr);
3137 poly_uint64 bound = lower_bounds[i].min_value;
3138 if (!lower_bounds[i].unsigned_p)
3139 {
3140 expr = fold_build2 (PLUS_EXPR, type, expr,
3141 build_int_cstu (type, bound - 1));
3142 bound += bound - 1;
3143 }
3144 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3145 build_int_cstu (type, bound));
3146 chain_cond_expr (cond_expr, part_cond_expr);
3147 }
3148 }
3149
3150 /* Function vect_create_cond_for_alias_checks.
3151
3152 Create a conditional expression that represents the run-time checks for
3153 overlapping of address ranges represented by a list of data references
3154 relations passed as input.
3155
3156 Input:
3157 COND_EXPR - input conditional expression. New conditions will be chained
3158 with logical AND operation. If it is NULL, the alias
3159 checks become the condition on their own.
3160 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3161 to be checked.
3162
3163 Output:
3164 COND_EXPR - conditional expression.
3165
3166 The returned COND_EXPR is the conditional expression to be used in the if
3167 statement that controls which version of the loop gets executed at runtime.
3168 */
3169
3170 void
3171 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3172 {
3173 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3174 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3175
3176 if (comp_alias_ddrs.is_empty ())
3177 return;
3178
3179 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3180 &comp_alias_ddrs, cond_expr);
3181 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE, vect_location,
3183 "created %u versioning for alias checks.\n",
3184 comp_alias_ddrs.length ());
3185 }
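/* For illustration only: for each DDR pair that could not be disambiguated
   at compile time, create_runtime_alias_checks emits a test that the two
   accessed segments do not overlap, conceptually of the form

       addr_a + segment_length_a <= addr_b
       || addr_b + segment_length_b <= addr_a

   (the actual checks may be merged or simplified); all of them are chained
   into COND_EXPR with logical AND.  */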
3186
3187
3188 /* Function vect_loop_versioning.
3189
3190 If the loop has data references that may or may not be aligned and/or
3191 has data reference relations whose independence was not proven then
3192 two versions of the loop need to be generated, one which is vectorized
3193 and one which isn't. A test is then generated to control which of the
3194 loops is executed. The test checks for the alignment of all of the
3195 data references that may or may not be aligned. An additional
3196 sequence of runtime tests is generated for each pair of DDRs whose
3197 independence was not proven. The vectorized version of the loop is
3198 executed only if both the alias and the alignment tests pass.
3199
3200 The test generated to check which version of the loop is executed
3201 is modified to also check for profitability as indicated by the
3202 cost model threshold TH.
3203
3204 The versioning precondition(s) are placed in *COND_EXPR and
3205 *COND_EXPR_STMT_LIST. */
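/* Schematically, the control flow produced below is

       if (versioning-cond)   <-- cost, niters, alignment, alias and simd-if
         loop                     checks chained with logical AND
       else
         nloop                <-- scalar copy, or the scalar loop already
                                  created by if-conversion

   and NLOOP is returned.  */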
3206
3207 class loop *
3208 vect_loop_versioning (loop_vec_info loop_vinfo,
3209 gimple *loop_vectorized_call)
3210 {
3211 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3212 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3213 basic_block condition_bb;
3214 gphi_iterator gsi;
3215 gimple_stmt_iterator cond_exp_gsi;
3216 basic_block merge_bb;
3217 basic_block new_exit_bb;
3218 edge new_exit_e, e;
3219 gphi *orig_phi, *new_phi;
3220 tree cond_expr = NULL_TREE;
3221 gimple_seq cond_expr_stmt_list = NULL;
3222 tree arg;
3223 profile_probability prob = profile_probability::likely ();
3224 gimple_seq gimplify_stmt_list = NULL;
3225 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3226 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3227 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3228 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3229 poly_uint64 versioning_threshold
3230 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3231 tree version_simd_if_cond
3232 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234
3235 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3236 && !ordered_p (th, versioning_threshold))
3237 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3238 build_int_cst (TREE_TYPE (scalar_loop_iters),
3239 th - 1));
3240 if (maybe_ne (versioning_threshold, 0U))
3241 {
3242 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3243 build_int_cst (TREE_TYPE (scalar_loop_iters),
3244 versioning_threshold - 1));
3245 if (cond_expr)
3246 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3247 expr, cond_expr);
3248 else
3249 cond_expr = expr;
3250 }
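  /* For illustration only: SCALAR_LOOP_ITERS is NITERSM1, the iteration
     count minus one, so a threshold of e.g. 8 yields the guard
     scalar_loop_iters >= 7, i.e. the vectorized path is entered only when
     the loop iterates at least 8 times.  */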
3251
3252 if (version_niter)
3253 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3254
3255 if (cond_expr)
3256 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3257 &cond_expr_stmt_list,
3258 is_gimple_condexpr, NULL_TREE);
3259
3260 if (version_align)
3261 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3262 &cond_expr_stmt_list);
3263
3264 if (version_alias)
3265 {
3266 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3267 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3268 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3269 }
3270
3271 if (version_simd_if_cond)
3272 {
3273 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3274 if (flag_checking)
3275 if (basic_block bb
3276 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3277 gcc_assert (bb != loop->header
3278 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3279 && (scalar_loop == NULL
3280 || (bb != scalar_loop->header
3281 && dominated_by_p (CDI_DOMINATORS,
3282 scalar_loop->header, bb))));
3283 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3284 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3285 version_simd_if_cond, zero);
3286 if (cond_expr)
3287 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3288 c, cond_expr);
3289 else
3290 cond_expr = c;
3291 if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "created versioning for simd if condition check.\n");
3294 }
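  /* For illustration only: the guard added above is simply
     version_simd_if_cond != 0, so the vectorized loop is used only when
     that condition is nonzero at run time (it typically originates from an
     "if" clause on a simd construct).  */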
3295
3296 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3297 &gimplify_stmt_list,
3298 is_gimple_condexpr, NULL_TREE);
3299 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3300
3301 /* Compute the outermost loop in which cond_expr and cond_expr_stmt_list
3302 are invariant. */
3303 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3304 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3305 !gsi_end_p (gsi); gsi_next (&gsi))
3306 {
3307 gimple *stmt = gsi_stmt (gsi);
3308 update_stmt (stmt);
3309 ssa_op_iter iter;
3310 use_operand_p use_p;
3311 basic_block def_bb;
3312 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3313 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3314 && flow_bb_inside_loop_p (outermost, def_bb))
3315 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3316 }
3317
3318 /* Search for the outermost loop we can version. Avoid versioning of
3319 non-perfect nests but allow if-conversion versioned loops inside. */
3320 class loop *loop_to_version = loop;
3321 if (flow_loop_nested_p (outermost, loop))
3322 {
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_NOTE, vect_location,
3325 "trying to apply versioning to outer loop %d\n",
3326 outermost->num);
3327 if (outermost->num == 0)
3328 outermost = superloop_at_depth (loop, 1);
3329 /* And avoid applying versioning on non-perfect nests. */
3330 while (loop_to_version != outermost
3331 && single_exit (loop_outer (loop_to_version))
3332 && (!loop_outer (loop_to_version)->inner->next
3333 || vect_loop_vectorized_call (loop_to_version))
3334 && (!loop_outer (loop_to_version)->inner->next
3335 || !loop_outer (loop_to_version)->inner->next->next))
3336 loop_to_version = loop_outer (loop_to_version);
3337 }
3338
3339 /* Apply versioning. If there is already a scalar version created by
3340 if-conversion, re-use that. Note we cannot re-use the copy of
3341 an if-converted outer-loop when vectorizing the inner loop only. */
3342 gcond *cond;
3343 if ((!loop_to_version->inner || loop == loop_to_version)
3344 && loop_vectorized_call)
3345 {
3346 gcc_assert (scalar_loop);
3347 condition_bb = gimple_bb (loop_vectorized_call);
3348 cond = as_a <gcond *> (last_stmt (condition_bb));
3349 gimple_cond_set_condition_from_tree (cond, cond_expr);
3350 update_stmt (cond);
3351
3352 if (cond_expr_stmt_list)
3353 {
3354 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3355 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3356 GSI_SAME_STMT);
3357 }
3358
3359 /* if-conversion uses profile_probability::always () for both paths;
3360 reset both probabilities appropriately. */
3361 edge te, fe;
3362 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3363 te->probability = prob;
3364 fe->probability = prob.invert ();
3365 /* We can scale the loop counts immediately but have to postpone
3366 scaling the scalar loop because we re-use it during peeling. */
3367 scale_loop_frequencies (loop_to_version, te->probability);
3368 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3369
3370 nloop = scalar_loop;
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_NOTE, vect_location,
3373 "reusing %sloop version created by if conversion\n",
3374 loop_to_version != loop ? "outer " : "");
3375 }
3376 else
3377 {
3378 if (loop_to_version != loop
3379 && dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE, vect_location,
3381 "applying loop versioning to outer loop %d\n",
3382 loop_to_version->num);
3383
3384 initialize_original_copy_tables ();
3385 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3386 prob, prob.invert (), prob, prob.invert (), true);
3387 gcc_assert (nloop);
3388 nloop = get_loop_copy (loop);
3389
3390 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3391 reap those otherwise, and they also refer to the original
3392 loops. */
3393 class loop *l = loop;
3394 while (gimple *call = vect_loop_vectorized_call (l))
3395 {
3396 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3397 fold_loop_internal_call (call, boolean_false_node);
3398 l = loop_outer (l);
3399 }
3400 free_original_copy_tables ();
3401
3402 if (cond_expr_stmt_list)
3403 {
3404 cond_exp_gsi = gsi_last_bb (condition_bb);
3405 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3406 GSI_SAME_STMT);
3407 }
3408
3409 /* Loop versioning violates an assumption we try to maintain during
3410 vectorization - that the loop exit block has a single predecessor.
3411 After versioning, the exit block of both loop versions is the same
3412 basic block (i.e. it has two predecessors). Just in order to simplify
3413 following transformations in the vectorizer, we fix this situation
3414 here by adding a new (empty) block on the exit-edge of the loop,
3415 with the proper loop-exit phis to maintain loop-closed-form.
3416 If loop versioning wasn't done from loop, but from scalar_loop instead,
3417 merge_bb will already have just a single predecessor. */
3418
3419 merge_bb = single_exit (loop_to_version)->dest;
3420 if (EDGE_COUNT (merge_bb->preds) >= 2)
3421 {
3422 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3423 new_exit_bb = split_edge (single_exit (loop_to_version));
3424 new_exit_e = single_exit (loop_to_version);
3425 e = EDGE_SUCC (new_exit_bb, 0);
3426
3427 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3428 gsi_next (&gsi))
3429 {
3430 tree new_res;
3431 orig_phi = gsi.phi ();
3432 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3433 new_phi = create_phi_node (new_res, new_exit_bb);
3434 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3435 add_phi_arg (new_phi, arg, new_exit_e,
3436 gimple_phi_arg_location_from_edge (orig_phi, e));
3437 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3438 }
3439 }
3440
3441 update_ssa (TODO_update_ssa);
3442 }
3443
3444 if (version_niter)
3445 {
3446 /* The versioned loop could be infinite; we need to clear the existing
3447 niter information, which was copied from the original loop. */
3448 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3449 vect_free_loop_info_assumptions (nloop);
3450 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3451 loop_constraint_set (loop, LOOP_C_INFINITE);
3452 }
3453
3454 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3455 && dump_enabled_p ())
3456 {
3457 if (version_alias)
3458 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3459 vect_location,
3460 "loop versioned for vectorization because of "
3461 "possible aliasing\n");
3462 if (version_align)
3463 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3464 vect_location,
3465 "loop versioned for vectorization to enhance "
3466 "alignment\n");
3467
3468 }
3469
3470 return nloop;
3471 }