Fix ICE on unsupported FP comparison
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53
54 /*************************************************************************
55 Simple Loop Peeling Utilities
56
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61 /* Renames the use *OP_P. */
62
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66 tree new_name;
67
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
70
71 new_name = get_current_def (USE_FROM_PTR (op_p));
72
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
76
77 /* An ordinary ssa name defined in the loop. */
78
79 SET_USE (op_p, new_name);
80 }
81
82
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
 84    on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
85 true. */
86
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
97
98 if (rename_from_outer_loop)
99 {
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
102 }
103
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
106 {
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
110 }
111
112 FOR_EACH_EDGE (e, ei, bb->preds)
113 {
114 if (!flow_bb_inside_loop_p (loop, e->src))
115 {
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
119 {
120 if (outer_loop->inner->next)
121 {
122		  /* If outer_loop has 2 inner loops, allow there to
123		     be an extra basic block that uses LOOP_VECTORIZED to
124		     decide which of the two loops to run.  */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
128 }
129 }
130 }
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134 }
135 }
136
137
138 struct adjust_info
139 {
140 tree from, to;
141 basic_block bb;
142 };
143
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
151
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
155
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171 {
172 use_operand_p use_p;
173 basic_block bbuse;
174
175 if (!is_gimple_debug (stmt))
176 continue;
177
178 gcc_assert (gimple_debug_bind_p (stmt));
179
180 bbuse = gimple_bb (stmt);
181
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 {
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
191 {
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
194 }
195 }
196 }
197 }
198
199 /* Adjust debug stmts as scheduled before. */
200
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
206
207 gcc_assert (adjust_vec.exists ());
208
209 while (!adjust_vec.is_empty ())
210 {
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
213 }
214 }
215
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
220
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224 adjust_info ai;
225
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
230 {
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
234
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
239 }
240 }
241
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
246
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
257 }
258
259 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
260 the mask should have during the first iteration and NEXT_MASK is the
261 value that it should have on subsequent iterations. */
262
263 static void
264 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265 tree next_mask)
266 {
267 gphi *phi = create_phi_node (mask, loop->header);
268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271
272 /* Add SEQ to the end of LOOP's preheader block. */
273
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277 if (seq)
278 {
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
282 }
283 }
284
285 /* Add SEQ to the beginning of LOOP's header block. */
286
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290 if (seq)
291 {
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 }
295 }
296
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
301
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
305 {
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
313 }
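  /* For example (illustrative): with 4-element vectors and OFFSET == 0,
     the encoding above describes the selector { 0, 4, 1, 5 }, i.e. an
     interleave of the first halves of the two input vectors.  */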
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
321
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
324 rgroup_masks *src_rgm)
325 {
326 tree src_masktype = src_rgm->mask_type;
327 tree dest_masktype = dest_rgm->mask_type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
335 {
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
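      /* Each VEC_UNPACK_{HI,LO}_EXPR widens one half of SRC's lanes into
	 DEST_MODE's wider lanes; which half feeds the even-numbered
	 destination masks depends on endianness, hence the
	 BYTES_BIG_ENDIAN test below.  */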
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
342 {
343 tree src = src_rgm->masks[i / 2];
344 tree dest = dest_rgm->masks[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
352 {
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
359 }
360 gimple_seq_add_stmt (seq, stmt);
361 }
362 return true;
363 }
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
368 {
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
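      /* Interleaving a mask with itself duplicates each lane, so the
	 lo/hi permutes of SRC_RGM->masks[i / 2] spread its lanes, each
	 repeated twice, across two destination masks.  */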
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
375 {
376 tree src = src_rgm->masks[i / 2];
377 tree dest = dest_rgm->masks[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
381 }
382 return true;
383 }
384 return false;
385 }
386
387 /* Helper for vect_set_loop_condition_masked. Generate definitions for
388 all the masks in RGM and return a mask that is nonzero when the loop
389 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
390 Use LOOP_COND_GSI to insert code before the exit gcond.
391
392 RGM belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
394
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
399
400 It is known that:
401
402 NITERS * RGM->max_nscalars_per_iter
403
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
406
407 VF * RGM->max_nscalars_per_iter
408
409 might overflow before hitting a value above:
410
411 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
412
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGM. */
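/* Illustrative example: with a 16-bit COMPARE_TYPE and
   VF * RGM->max_nscalars_per_iter == 64, the IV wraps after 1024 vector
   iterations, so for large enough NITERS the latch test must use the
   value of the IV from before the increment; that is the MIGHT_WRAP_P
   case handled below.  */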
415
416 static tree
417 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_masks *rgm, tree niters, tree niters_skip,
421 bool might_wrap_p)
422 {
423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
425 tree mask_type = rgm->mask_type;
426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
429
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
438 {
439 /* We checked before choosing to use a fully-masked loop that these
440 multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
450 }
451
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
460
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
465 {
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
468
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
470
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
475
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
478
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
482
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
485
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
487
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
489
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
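      /* The MAX/MINUS pair below implements that saturating subtraction;
	 e.g. (illustrative) with NSCALARS_STEP == 16 and NSCALARS_SKIP == 3
	 the limit becomes NSCALARS_TOTAL -[sat] 13.  */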
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
503
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
506 {
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
517 }
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
523 }
524 else
525 {
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
534
535 first_limit = test_limit;
536 }
537
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
543
544 /* Provide a definition of each mask in the group. */
545 tree next_mask = NULL_TREE;
546 tree mask;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
549 {
550 /* Previous masks will cover BIAS scalars. This mask covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_mask * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
555
556 /* See whether the first iteration of the vector loop is known
557 to have a full mask. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
562
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each mask
565 and adjust the bound down by BIAS. */
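      /* MAX followed by MINUS below gives the saturating form
	 TEST_LIMIT -[sat] BIAS; e.g. (illustrative) the second of two
	 8-lane masks has BIAS == 8 and bound TEST_LIMIT -[sat] 8.  */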
566 tree this_test_limit = test_limit;
567 if (i != 0)
568 {
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
575 }
576
577 /* Create the initial mask. First include all scalars that
578 are within the loop limit. */
579 tree init_mask = NULL_TREE;
580 if (!first_iteration_full)
581 {
582 tree start, end;
583 if (first_limit == test_limit)
584 {
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
591 }
592 else
593 {
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this mask. */
597 start = bias_tree;
598 end = first_limit;
599 }
600
601 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_mask, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
604 }
605
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
612 {
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
614 bias_tree, nscalars_skip);
615 if (init_mask)
616 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
617 init_mask, unskipped_mask);
618 else
619 init_mask = unskipped_mask;
620 }
621
622 if (!init_mask)
623 /* First iteration is full. */
624 init_mask = build_minus_one_cst (mask_type);
625
626 /* Get the mask value for the next iteration of the loop. */
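      /* vect_gen_while emits an IFN_WHILE_ULT call whose lane J is true
	 iff TEST_INDEX + J < THIS_TEST_LIMIT.  */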
627 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
630
631 vect_set_loop_mask (loop, mask, init_mask, next_mask);
632 }
633 return next_mask;
634 }
635
636 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
638 number of iterations of the original scalar loop that should be
639 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
640 as for vect_set_loop_condition.
641
642 Insert the branch-back condition before LOOP_COND_GSI and return the
643 final gcond. */
644
645 static gcond *
646 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647 tree niters, tree final_iv,
648 bool niters_maybe_zero,
649 gimple_stmt_iterator loop_cond_gsi)
650 {
651 gimple_seq preheader_seq = NULL;
652 gimple_seq header_seq = NULL;
653
654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655 unsigned int compare_precision = TYPE_PRECISION (compare_type);
656 tree orig_niters = niters;
657
658 /* Type of the initial value of NITERS. */
659 tree ni_actual_type = TREE_TYPE (niters);
660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
662
663 /* Convert NITERS to the same size as the compare. */
664 if (compare_precision > ni_actual_precision
665 && niters_maybe_zero)
666 {
667 /* We know that there is always at least one iteration, so if the
668 count is zero then it must have wrapped. Cope with this by
669 subtracting 1 before the conversion and adding 1 to the result. */
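      /* For example (illustrative): an 8-bit NITERS of 0 stands for 256
	 iterations; 0 - 1 gives 255, which converts to 255 in the wider
	 COMPARE_TYPE, and adding 1 back yields 256.  */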
670 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672 niters, build_minus_one_cst (ni_actual_type));
673 niters = gimple_convert (&preheader_seq, compare_type, niters);
674 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675 niters, build_one_cst (compare_type));
676 }
677 else
678 niters = gimple_convert (&preheader_seq, compare_type, niters);
679
680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
681
682 /* Iterate over all the rgroups and fill in their masks. We could use
683 the first mask from any rgroup for the loop condition; here we
684 arbitrarily pick the last. */
685 tree test_mask = NULL_TREE;
686 rgroup_masks *rgm;
687 unsigned int i;
688 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689 FOR_EACH_VEC_ELT (*masks, i, rgm)
690 if (!rgm->masks.is_empty ())
691 {
692 /* First try using permutes. This adds a single vector
693 instruction to the loop for each mask, but needs no extra
694 loop invariants or IVs. */
695 unsigned int nmasks = i + 1;
696 if ((nmasks & 1) == 0)
697 {
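	  /* (*masks)[J] describes the rgroup that needs J + 1 mask vectors
	     per iteration, so the rgroup with half as many masks lives at
	     index NMASKS / 2 - 1.  */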
698 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699 if (!half_rgm->masks.is_empty ()
700 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701 continue;
702 }
703
704 /* See whether zero-based IV would ever generate all-false masks
705 before wrapping around. */
706 bool might_wrap_p
707 = (iv_limit == -1
708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709 UNSIGNED)
710 > compare_precision));
711
712 /* Set up all masks for this group. */
713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714 &preheader_seq,
715 loop_cond_gsi, rgm,
716 niters, niters_skip,
717 might_wrap_p);
718 }
719
720 /* Emit all accumulated statements. */
721 add_preheader_seq (loop, preheader_seq);
722 add_header_seq (loop, header_seq);
723
724 /* Get a boolean result that tells us whether to iterate. */
725 edge exit_edge = single_exit (loop);
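  /* If the exit edge is taken when the condition is true, we exit once
     every lane of TEST_MASK is false (EQ against zero); otherwise we
     keep looping while some lane is still true (NE).  */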
726 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729 NULL_TREE, NULL_TREE);
730 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
731
732 /* The loop iterates (NITERS - 1) / VF + 1 times.
733 Subtract one from this to get the latch count. */
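  /* E.g. (illustrative) NITERS == 10 and VF == 4: the vector loop runs
     three times and the latch twice, (10 - 1) / 4 == 2.  */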
734 tree step = build_int_cst (compare_type,
735 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737 build_minus_one_cst (compare_type));
738 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739 niters_minus_one, step);
740
741 if (final_iv)
742 {
743 gassign *assign = gimple_build_assign (final_iv, orig_niters);
744 gsi_insert_on_edge_immediate (single_exit (loop), assign);
745 }
746
747 return cond_stmt;
748 }
749
750 /* Like vect_set_loop_condition, but handle the case in which there
751 are no loop masks. */
752
753 static gcond *
754 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755 tree step, tree final_iv,
756 bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
758 {
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
768
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
772
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
775 {
776 /* In this case we can use a simple 0-based IV:
777
778 A:
779 x = 0;
780 do
781 {
782 ...
783 x += 1;
784 }
785 while (x < NITERS); */
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
789 }
790 else
791 {
792 /* The following works for all values of NITERS except 0:
793
794 B:
795 x = 0;
796 do
797 {
798 ...
799 x += STEP;
800 }
801 while (x <= NITERS - STEP);
802
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
805
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
809
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
812
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
815
816 C:
817 x = -1;
818 do
819 {
820 ...
821 x += STEP;
822 }
823 while (x < NITERS - STEP);
824
825 In both cases the loop limit is NITERS - STEP. */
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
830 {
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
833 }
834 if (niters_maybe_zero)
835 {
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
839 }
840 else
841 {
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
845 }
846 }
847
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
856
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
859
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
861
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
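    /* E.g. (illustrative) case B with NITERS == 8 and STEP == 4:
       LIMIT == 4, the loop runs twice and the latch once, 4 / 4 == 1.  */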
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
873
874 if (final_iv)
875 {
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
879 }
880
881 return cond_stmt;
882 }
883
884 /* If we're using fully-masked loops, make LOOP iterate:
885
886 N == (NITERS - 1) / STEP + 1
887
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
891
892 If we're not using fully-masked loops, make LOOP iterate:
893
894 N == (NITERS - STEP) / STEP + 1
895
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
900
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
903
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
905
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
910 {
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
914
915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917 final_iv, niters_maybe_zero,
918 loop_cond_gsi);
919 else
920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921 final_iv, niters_maybe_zero,
922 loop_cond_gsi);
923
924 /* Remove old loop exit test. */
925 stmt_vec_info orig_cond_info;
926 if (loop_vinfo
927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928 loop_vinfo->remove_stmt (orig_cond_info);
929 else
930 gsi_remove (&loop_cond_gsi, true);
931
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934 cond_stmt);
935 }
936
937 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 938    For all PHI arguments in FROM->dest and TO->dest coming in on those
 939    edges, ensure that each TO->dest PHI argument has its current_def
 940    set to that of the corresponding argument in FROM->dest.  */
941
942 static void
943 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
944 {
945 gimple_stmt_iterator gsi_from, gsi_to;
946
947 for (gsi_from = gsi_start_phis (from->dest),
948 gsi_to = gsi_start_phis (to->dest);
949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
950 {
951 gimple *from_phi = gsi_stmt (gsi_from);
952 gimple *to_phi = gsi_stmt (gsi_to);
953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955 if (virtual_operand_p (from_arg))
956 {
957 gsi_next (&gsi_from);
958 continue;
959 }
960 if (virtual_operand_p (to_arg))
961 {
962 gsi_next (&gsi_to);
963 continue;
964 }
965 if (TREE_CODE (from_arg) != SSA_NAME)
966 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
967 else if (TREE_CODE (to_arg) == SSA_NAME
968 && from_arg != to_arg)
969 {
970 if (get_current_def (to_arg) == NULL_TREE)
971 {
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973 TREE_TYPE (get_current_def
974 (from_arg))));
975 set_current_def (to_arg, get_current_def (from_arg));
976 }
977 }
978 gsi_next (&gsi_from);
979 gsi_next (&gsi_to);
980 }
981
982 gphi *from_phi = get_virtual_phi (from->dest);
983 gphi *to_phi = get_virtual_phi (to->dest);
984 if (from_phi)
985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
987 }
988
989
990 /* Given LOOP this function generates a new copy of it and puts it
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994 entry or exit of LOOP. */
995
996 class loop *
997 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998 class loop *scalar_loop, edge e)
999 {
1000 class loop *new_loop;
1001 basic_block *new_bbs, *bbs, *pbbs;
1002 bool at_exit;
1003 bool was_imm_dom;
1004 basic_block exit_dest;
1005 edge exit, new_exit;
1006 bool duplicate_outer_loop = false;
1007
1008 exit = single_exit (loop);
1009 at_exit = (e == exit);
1010 if (!at_exit && e != loop_preheader_edge (loop))
1011 return NULL;
1012
1013 if (scalar_loop == NULL)
1014 scalar_loop = loop;
1015
1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1017 pbbs = bbs + 1;
1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1019 /* Allow duplication of outer loops. */
1020 if (scalar_loop->inner)
1021 duplicate_outer_loop = true;
1022 /* Check whether duplication is possible. */
1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1024 {
1025 free (bbs);
1026 return NULL;
1027 }
1028
1029 /* Generate new loop structure. */
1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031 duplicate_subloops (scalar_loop, new_loop);
1032
1033 exit_dest = exit->dest;
1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035 exit_dest) == loop->header ?
1036 true : false);
1037
1038 /* Also copy the pre-header, this avoids jumping through hoops to
1039 duplicate the loop entry PHI arguments. Create an empty
1040 pre-header unconditionally for this. */
1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1042 edge entry_e = single_pred_edge (preheader);
1043 bbs[0] = preheader;
1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1045
1046 exit = single_exit (scalar_loop);
1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1048 &exit, 1, &new_exit, NULL,
1049 at_exit ? loop->latch : e->src, true);
1050 exit = single_exit (loop);
1051 basic_block new_preheader = new_bbs[0];
1052
1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1054
1055 if (scalar_loop != loop)
1056 {
1057 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1058 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1059 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1060 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1061 header) to have current_def set, so copy them over. */
1062 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1063 exit);
1064 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1065 0),
1066 EDGE_SUCC (loop->latch, 0));
1067 }
1068
1069 if (at_exit) /* Add the loop copy at exit. */
1070 {
1071 if (scalar_loop != loop)
1072 {
1073 gphi_iterator gsi;
1074 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1075
1076 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1077 gsi_next (&gsi))
1078 {
1079 gphi *phi = gsi.phi ();
1080 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1081 location_t orig_locus
1082 = gimple_phi_arg_location_from_edge (phi, e);
1083
1084 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1085 }
1086 }
1087 redirect_edge_and_branch_force (e, new_preheader);
1088 flush_pending_stmts (e);
1089 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1090 if (was_imm_dom || duplicate_outer_loop)
1091 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1092
1093       /* And remove the unnecessary forwarder again.  Keep the other
1094	  one so we have a proper pre-header for the loop at the exit edge.  */
1095 redirect_edge_pred (single_succ_edge (preheader),
1096 single_pred (preheader));
1097 delete_basic_block (preheader);
1098 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1099 loop_preheader_edge (scalar_loop)->src);
1100 }
1101 else /* Add the copy at entry. */
1102 {
1103 if (scalar_loop != loop)
1104 {
1105	  /* Remove the unnecessary forwarder of scalar_loop again.  */
1106 redirect_edge_pred (single_succ_edge (preheader),
1107 single_pred (preheader));
1108 delete_basic_block (preheader);
1109 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1110 loop_preheader_edge (scalar_loop)->src);
1111 preheader = split_edge (loop_preheader_edge (loop));
1112 entry_e = single_pred_edge (preheader);
1113 }
1114
1115 redirect_edge_and_branch_force (entry_e, new_preheader);
1116 flush_pending_stmts (entry_e);
1117 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1118
1119 redirect_edge_and_branch_force (new_exit, preheader);
1120 flush_pending_stmts (new_exit);
1121 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1122
1123       /* And remove the unnecessary forwarder again.  Keep the other
1124	  one so we have a proper pre-header for the loop at the exit edge.  */
1125 redirect_edge_pred (single_succ_edge (new_preheader),
1126 single_pred (new_preheader));
1127 delete_basic_block (new_preheader);
1128 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1129 loop_preheader_edge (new_loop)->src);
1130 }
1131
1132 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1133 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1134 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1135
1136 if (scalar_loop != loop)
1137 {
1138 /* Update new_loop->header PHIs, so that on the preheader
1139 edge they are the ones from loop rather than scalar_loop. */
1140 gphi_iterator gsi_orig, gsi_new;
1141 edge orig_e = loop_preheader_edge (loop);
1142 edge new_e = loop_preheader_edge (new_loop);
1143
1144 for (gsi_orig = gsi_start_phis (loop->header),
1145 gsi_new = gsi_start_phis (new_loop->header);
1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1148 {
1149 gphi *orig_phi = gsi_orig.phi ();
1150 gphi *new_phi = gsi_new.phi ();
1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152 location_t orig_locus
1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1154
1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1156 }
1157 }
1158
1159 free (new_bbs);
1160 free (bbs);
1161
1162 checking_verify_dominators (CDI_DOMINATORS);
1163
1164 return new_loop;
1165 }
1166
1167
1168 /* Given the condition expression COND, put it as the last statement of
1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1172 new edge as irreducible if IRREDUCIBLE_P is true. */
1173
1174 static edge
1175 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1176 basic_block guard_to, basic_block dom_bb,
1177 profile_probability probability, bool irreducible_p)
1178 {
1179 gimple_stmt_iterator gsi;
1180 edge new_e, enter_e;
1181 gcond *cond_stmt;
1182 gimple_seq gimplify_stmt_list = NULL;
1183
1184 enter_e = EDGE_SUCC (guard_bb, 0);
1185 enter_e->flags &= ~EDGE_FALLTHRU;
1186 enter_e->flags |= EDGE_FALSE_VALUE;
1187 gsi = gsi_last_bb (guard_bb);
1188
1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190 NULL_TREE);
1191 if (gimplify_stmt_list)
1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1193
1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1195 gsi = gsi_last_bb (guard_bb);
1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1197
1198 /* Add new edge to connect guard block to the merge/loop-exit block. */
1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1200
1201 new_e->probability = probability;
1202 if (irreducible_p)
1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1204
1205 enter_e->probability = probability.invert ();
1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1207
1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1209 if (enter_e->dest->loop_father->header == enter_e->dest)
1210 split_edge (enter_e);
1211
1212 return new_e;
1213 }
1214
1215
1216 /* This function verifies that the following restrictions apply to LOOP:
1217    (1) it consists of exactly 2 basic blocks (a header and an empty latch)
1218        for an innermost loop, or of 5 basic blocks for an outer loop.
1219 (2) it is single entry, single exit
1220 (3) its exit condition is the last stmt in the header
1221 (4) E is the entry/exit edge of LOOP.
1222 */
1223
1224 bool
1225 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1226 {
1227 edge exit_e = single_exit (loop);
1228 edge entry_e = loop_preheader_edge (loop);
1229 gcond *orig_cond = get_loop_exit_condition (loop);
1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1231 unsigned int num_bb = loop->inner? 5 : 2;
1232
1233   /* All loops have an outer scope; the only case in which loop->outer is
1234      NULL is the function itself.  */
1235 if (!loop_outer (loop)
1236 || loop->num_nodes != num_bb
1237 || !empty_block_p (loop->latch)
1238 || !single_exit (loop)
1239 /* Verify that new loop exit condition can be trivially modified. */
1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241 || (e != exit_e && e != entry_e))
1242 return false;
1243
1244 return true;
1245 }
1246
1247 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248 in the exit bb and rename all the uses after the loop. This simplifies
1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250 (but normally loop closed SSA form doesn't require virtual PHIs to be
1251    in the same form).  Doing this early simplifies checking which
1252    uses should be renamed.
1253
1254 If we create a new phi after the loop, return the definition that
1255 applies on entry to the loop, otherwise return null. */
1256
1257 static tree
1258 create_lcssa_for_virtual_phi (class loop *loop)
1259 {
1260 gphi_iterator gsi;
1261 edge exit_e = single_exit (loop);
1262
1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1265 {
1266 gphi *phi = gsi.phi ();
1267 for (gsi = gsi_start_phis (exit_e->dest);
1268 !gsi_end_p (gsi); gsi_next (&gsi))
1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1270 break;
1271 if (gsi_end_p (gsi))
1272 {
1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276 imm_use_iterator imm_iter;
1277 gimple *stmt;
1278 use_operand_p use_p;
1279
1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1283 gimple_phi_set_result (new_phi, new_vop);
1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1285 if (stmt != new_phi
1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288 SET_USE (use_p, new_vop);
1289
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1291 }
1292 break;
1293 }
1294 return NULL_TREE;
1295 }
1296
1297 /* Function vect_get_loop_location.
1298
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
1302    Return the loop location if successful and NULL if not.  */
1303
1304 dump_user_location_t
1305 find_loop_location (class loop *loop)
1306 {
1307 gimple *stmt = NULL;
1308 basic_block bb;
1309 gimple_stmt_iterator si;
1310
1311 if (!loop)
1312 return dump_user_location_t ();
1313
1314 stmt = get_loop_exit_condition (loop);
1315
1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318 return stmt;
1319
1320   /* If we got here, the loop is probably not "well formed";
1321      try to estimate the loop location.  */
1322
1323 if (!loop->header)
1324 return dump_user_location_t ();
1325
1326 bb = loop->header;
1327
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 stmt = gsi_stmt (si);
1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332 return stmt;
1333 }
1334
1335 return dump_user_location_t ();
1336 }
1337
1338 /* Return true if the phi described by STMT_INFO defines an IV of the
1339 loop to be vectorized. */
1340
1341 static bool
1342 iv_phi_p (stmt_vec_info stmt_info)
1343 {
1344 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345 if (virtual_operand_p (PHI_RESULT (phi)))
1346 return false;
1347
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1351
1352 return true;
1353 }
1354
1355 /* Function vect_can_advance_ivs_p
1356
1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1363
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366 {
1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block bb = loop->header;
1369 gphi_iterator gsi;
1370
1371 /* Analyze phi functions of the loop header. */
1372
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376 {
1377 tree evolution_part;
1378
1379 gphi *phi = gsi.phi ();
1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383 phi_info->stmt);
1384
1385 /* Skip virtual phi's. The data dependences that are associated with
1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1387
1388 Skip reduction phis. */
1389 if (!iv_phi_p (phi_info))
1390 {
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
1394 continue;
1395 }
1396
1397 /* Analyze the evolution function. */
1398
1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1400 if (evolution_part == NULL_TREE)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf (MSG_MISSED_OPTIMIZATION,
1404 "No access function or evolution.\n");
1405 return false;
1406 }
1407
1408 /* FORNOW: We do not transform initial conditions of IVs
1409    whose evolution functions are not invariant in the loop.  */
1410
1411 if (!expr_invariant_in_loop_p (loop, evolution_part))
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution not invariant in loop.\n");
1416 return false;
1417 }
1418
1419 /* FORNOW: We do not transform initial conditions of IVs
1420    whose evolution functions are polynomials of degree >= 2.  */
1421
1422 if (tree_is_chrec (evolution_part))
1423 {
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "evolution is chrec.\n");
1427 return false;
1428 }
1429 }
1430
1431 return true;
1432 }
1433
1434
1435 /* Function vect_update_ivs_after_vectorizer.
1436
1437 "Advance" the induction variables of LOOP to the value they should take
1438 after the execution of LOOP. This is currently necessary because the
1439 vectorizer does not handle induction variables that are used after the
1440 loop. Such a situation occurs when the last iterations of LOOP are
1441 peeled, because:
1442 1. We introduced new uses after LOOP for IVs that were not originally used
1443 after LOOP: the IVs of LOOP are now used by an epilog loop.
1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445 times, whereas the loop IVs should be bumped N times.
1446
1447 Input:
1448 - LOOP - a loop that is going to be vectorized. The last few iterations
1449 of LOOP were peeled.
1450 - NITERS - the number of iterations that LOOP executes (before it is
1451      vectorized), i.e., the number of times the ivs should be bumped.
1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453 coming out from LOOP on which there are uses of the LOOP ivs
1454 (this is the path from LOOP->exit to epilog_loop->preheader).
1455
1456 The new definitions of the ivs are placed in LOOP->exit.
1457 The phi args associated with the edge UPDATE_E in the bb
1458 UPDATE_E->dest are updated accordingly.
1459
1460 Assumption 1: Like the rest of the vectorizer, this function assumes
1461 a single loop exit that has a single predecessor.
1462
1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464 organized in the same order.
1465
1466 Assumption 3: The access function of the ivs is simple enough (see
1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1468
1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1470 coming out of LOOP on which the ivs of LOOP are used (this is the path
1471 that leads to the epilog loop; other paths skip the epilog loop). This
1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473 needs to have its phis updated.
1474 */
1475
1476 static void
1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 tree niters, edge update_e)
1479 {
1480 gphi_iterator gsi, gsi1;
1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482 basic_block update_bb = update_e->dest;
1483 basic_block exit_bb = single_exit (loop)->dest;
1484
1485 /* Make sure there exists a single-predecessor exit bb: */
1486 gcc_assert (single_pred_p (exit_bb));
1487 gcc_assert (single_succ_edge (exit_bb) == update_e);
1488
1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491 gsi_next (&gsi), gsi_next (&gsi1))
1492 {
1493 tree init_expr;
1494 tree step_expr, off;
1495 tree type;
1496 tree var, ni, ni_name;
1497 gimple_stmt_iterator last_gsi;
1498
1499 gphi *phi = gsi.phi ();
1500 gphi *phi1 = gsi1.phi ();
1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_NOTE, vect_location,
1504 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1505
1506 /* Skip reduction and virtual phis. */
1507 if (!iv_phi_p (phi_info))
1508 {
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "reduc or virtual phi. skip.\n");
1512 continue;
1513 }
1514
1515 type = TREE_TYPE (gimple_phi_result (phi));
1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517 step_expr = unshare_expr (step_expr);
1518
1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520 of degree >= 2 or exponential. */
1521 gcc_assert (!tree_is_chrec (step_expr));
1522
1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1524
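      /* Compute the value the IV has after NITERS scalar iterations:
	 INIT + NITERS * STEP, or the equivalent pointer increment for
	 pointer IVs.  */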
1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 fold_convert (TREE_TYPE (step_expr), niters),
1527 step_expr);
1528 if (POINTER_TYPE_P (type))
1529 ni = fold_build_pointer_plus (init_expr, off);
1530 else
1531 ni = fold_build2 (PLUS_EXPR, type,
1532 init_expr, fold_convert (type, off));
1533
1534 var = create_tmp_var (type, "tmp");
1535
1536 last_gsi = gsi_last_bb (exit_bb);
1537 gimple_seq new_stmts = NULL;
1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539 /* Exit_bb shouldn't be empty. */
1540 if (!gsi_end_p (last_gsi))
1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542 else
1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1544
1545 /* Fix phi expressions in the successor bb. */
1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1547 }
1548 }
1549
1550 /* Return a gimple value containing the misalignment (measured in vector
1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552 it is away from a perfectly aligned address. Add any new statements
1553 to SEQ. */
1554
1555 static tree
1556 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557 {
1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559 stmt_vec_info stmt_info = dr_info->stmt;
1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561
1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563 unsigned HOST_WIDE_INT target_align_c;
1564 tree target_align_minus_1;
1565
1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 size_zero_node) < 0;
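  /* A negative-step access covers DR's first element and the VF - 1
     elements below it, so the misalignment is measured from that lower
     starting address.  */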
1568 tree offset = (negative
1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 : size_zero_node);
1571 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1572 offset);
1573 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1574 if (target_align.is_constant (&target_align_c))
1575 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1576 else
1577 {
1578 tree vla = build_int_cst (type, target_align);
1579 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1580 fold_build2 (MINUS_EXPR, type,
1581 build_int_cst (type, 0), vla));
1582 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1583 build_int_cst (type, 1));
1584 }
1585
1586 HOST_WIDE_INT elem_size
1587 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1588 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589
1590 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1591 tree int_start_addr = fold_convert (type, start_addr);
1592 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593 target_align_minus_1);
1594
1595 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1596 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597 elem_size_log);
1598
1599 return misalign_in_elems;
1600 }
1601
1602 /* Function vect_gen_prolog_loop_niters
1603
1604 Generate the number of iterations which should be peeled as prolog for the
1605 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1606 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1607 As a result, after the execution of this loop, the data reference DR will
1608 refer to an aligned location. The following computation is generated:
1609
1610 If the misalignment of DR is known at compile time:
1611 addr_mis = int mis = DR_MISALIGNMENT (dr);
1612 Else, compute address misalignment in bytes:
1613 addr_mis = addr & (target_align - 1)
1614
1615 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1616
1617 (elem_size = element type size; an element is the scalar element whose type
1618 is the inner type of the vectype)
1619
1620    The computations will be emitted at the end of BB.  We also compute and
1621    store the upper bound (inclusive) of the result in BOUND.
1622
1623 When the step of the data-ref in the loop is not 1 (as in interleaved data
1624 and SLP), the number of iterations of the prolog must be divided by the step
1625    (which is equal to the size of the interleaved group).
1626
1627 The above formulas assume that VF == number of elements in the vector. This
1628 may not hold when there are multiple-types in the loop.
1629 In this case, for some data-references in the loop the VF does not represent
1630 the number of elements that fit in the vector. Therefore, instead of VF we
1631 use TYPE_VECTOR_SUBPARTS. */
1632
1633 static tree
1634 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1635 basic_block bb, int *bound)
1636 {
1637 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1638 tree var;
1639 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1640 gimple_seq stmts = NULL, new_stmts = NULL;
1641 tree iters, iters_name;
1642 stmt_vec_info stmt_info = dr_info->stmt;
1643 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1644 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1645
1646 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1647 {
1648 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_NOTE, vect_location,
1652 "known peeling = %d.\n", npeel);
1653
1654 iters = build_int_cst (niters_type, npeel);
1655 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1656 }
1657 else
1658 {
1659 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660 tree type = TREE_TYPE (misalign_in_elems);
1661 HOST_WIDE_INT elem_size
1662 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1663 /* We only do prolog peeling if the target alignment is known at compile
1664 time. */
1665 poly_uint64 align_in_elems =
1666 exact_div (target_align, elem_size);
1667 tree align_in_elems_minus_1 =
1668 build_int_cst (type, align_in_elems - 1);
1669 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1670
1671 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1672 & (align_in_elems - 1)). */
1673 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1674 size_zero_node) < 0;
1675 if (negative)
1676 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677 align_in_elems_tree);
1678 else
1679 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680 misalign_in_elems);
1681 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1682 iters = fold_convert (niters_type, iters);
1683 unsigned HOST_WIDE_INT align_in_elems_c;
1684 if (align_in_elems.is_constant (&align_in_elems_c))
1685 *bound = align_in_elems_c - 1;
1686 else
1687 *bound = -1;
1688 }
1689
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_NOTE, vect_location,
1692 "niters for prolog loop: %T\n", iters);
1693
1694 var = create_tmp_var (niters_type, "prolog_loop_niters");
1695 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1696
1697 if (new_stmts)
1698 gimple_seq_add_seq (&stmts, new_stmts);
1699 if (stmts)
1700 {
1701 gcc_assert (single_succ_p (bb));
1702 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1703 if (gsi_end_p (gsi))
1704 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1705 else
1706 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1707 }
1708 return iters_name;
1709 }
1710
1711
1712 /* Function vect_update_init_of_dr
1713
1714 If CODE is PLUS, the vector loop starts NITERS iterations after the
1715 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716 iterations before the scalar one (using masking to skip inactive
1717 elements). This function updates the information recorded in DR to
1718 account for the difference. Specifically, it updates the OFFSET
1719 field of DR_INFO. */
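/* For example (purely illustrative): with NITERS = 3 and DR_STEP = 8 bytes,
   a PLUS code adds 3 * 8 = 24 bytes to DR_INFO's offset, while a MINUS code
   subtracts 24 bytes; the data reference itself is left untouched and only
   the recorded offset changes. */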
1720
1721 static void
1722 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1723 {
1724 struct data_reference *dr = dr_info->dr;
1725 tree offset = dr_info->offset;
1726 if (!offset)
1727 offset = build_zero_cst (sizetype);
1728
1729 niters = fold_build2 (MULT_EXPR, sizetype,
1730 fold_convert (sizetype, niters),
1731 fold_convert (sizetype, DR_STEP (dr)));
1732 offset = fold_build2 (code, sizetype,
1733 fold_convert (sizetype, offset), niters);
1734 dr_info->offset = offset;
1735 }
1736
1737
1738 /* Function vect_update_inits_of_drs
1739
1740 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1741 CODE and NITERS are as for vect_update_inits_of_dr. */
1742
1743 void
1744 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1745 tree_code code)
1746 {
1747 unsigned int i;
1748 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1749 struct data_reference *dr;
1750
1751 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1752
1753 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1754 here, but since we might use these niters to update the epilogue's niters
1755 and data references we can't insert them here as this definition might not
1756 always dominate its uses. */
1757 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1758 niters = fold_convert (sizetype, niters);
1759
1760 FOR_EACH_VEC_ELT (datarefs, i, dr)
1761 {
1762 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1763 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1764 vect_update_init_of_dr (dr_info, niters, code);
1765 }
1766 }
1767
1768 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1769 by masking. This involves calculating the number of iterations to
1770 be peeled and then aligning all memory references appropriately. */
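/* Hypothetical illustration: with VF = 8, a target alignment of one vector
   and LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3, misalign_in_elems below is
   8 - 3 = 5, so LOOP_VINFO_MASK_SKIP_NITERS becomes 5, every data reference
   is moved back by 5 iterations' worth of bytes, and the first 5 lanes of
   the first vector iteration are left inactive by the masks. */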
1771
1772 void
1773 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1774 {
1775 tree misalign_in_elems;
1776 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1777
1778 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1779
1780 /* From the information recorded in LOOP_VINFO get the number of iterations
1781 that need to be skipped via masking. */
1782 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1783 {
1784 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1785 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1786 misalign_in_elems = build_int_cst (type, misalign);
1787 }
1788 else
1789 {
1790 gimple_seq seq1 = NULL, seq2 = NULL;
1791 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1792 misalign_in_elems = fold_convert (type, misalign_in_elems);
1793 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1794 &seq2, true, NULL_TREE);
1795 gimple_seq_add_seq (&seq1, seq2);
1796 if (seq1)
1797 {
1798 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1799 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1800 gcc_assert (!new_bb);
1801 }
1802 }
1803
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_NOTE, vect_location,
1806 "misalignment for fully-masked loop: %T\n",
1807 misalign_in_elems);
1808
1809 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1810
1811 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1812 }
1813
1814 /* This function builds ni_name = number of iterations. Statements
1815 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1816 it to TRUE if a new ssa_var is generated. */
1817
1818 tree
1819 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1820 {
1821 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1822 if (TREE_CODE (ni) == INTEGER_CST)
1823 return ni;
1824 else
1825 {
1826 tree ni_name, var;
1827 gimple_seq stmts = NULL;
1828 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1829
1830 var = create_tmp_var (TREE_TYPE (ni), "niters");
1831 ni_name = force_gimple_operand (ni, &stmts, false, var);
1832 if (stmts)
1833 {
1834 gsi_insert_seq_on_edge_immediate (pe, stmts);
1835 if (new_var_p != NULL)
1836 *new_var_p = true;
1837 }
1838
1839 return ni_name;
1840 }
1841 }
1842
1843 /* Calculate the number of iterations above which the vectorized loop will be
1844 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1845 of prolog loop. If it's integer const, the integer number is also passed
1846 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1847 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1848 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1849 threshold below which the scalar (rather than vectorized) loop will be
1850 executed. This function stores the upper bound (inclusive) of the result
1851 in BOUND_SCALAR. */
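/* A possible example (assumed values): with an unknown NITERS_PROLOG whose
   upper bound BOUND_PROLOG is 3, BOUND_EPILOG = 4 and a profitability
   threshold TH of 10, the maximum scalar niters is 10 - 1 = 9; since
   9 >= 3 + 4 the function records BOUND_SCALAR = 9 and simply returns the
   constant 9. */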
1852
1853 static tree
1854 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1855 int bound_prolog, poly_int64 bound_epilog, int th,
1856 poly_uint64 *bound_scalar,
1857 bool check_profitability)
1858 {
1859 tree type = TREE_TYPE (niters_prolog);
1860 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1861 build_int_cst (type, bound_epilog));
1862
1863 *bound_scalar = bound_prolog + bound_epilog;
1864 if (check_profitability)
1865 {
1866 /* TH indicates the minimum niters of the vectorized loop, while we
1867 compute the maximum niters of the scalar loop. */
1868 th--;
1869 /* Peeling a constant number of times. */
1870 if (int_niters_prolog >= 0)
1871 {
1872 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1873 return build_int_cst (type, *bound_scalar);
1874 }
1875 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1876 and BOUND_EPILOG are inclusive upper bounds. */
1877 if (known_ge (th, bound_prolog + bound_epilog))
1878 {
1879 *bound_scalar = th;
1880 return build_int_cst (type, th);
1881 }
1882 /* Need to do runtime comparison. */
1883 else if (maybe_gt (th, bound_epilog))
1884 {
1885 *bound_scalar = upper_bound (*bound_scalar, th);
1886 return fold_build2 (MAX_EXPR, type,
1887 build_int_cst (type, th), niters);
1888 }
1889 }
1890 return niters;
1891 }
1892
1893 /* NITERS is the number of times that the original scalar loop executes
1894 after peeling. Work out the maximum number of iterations N that can
1895 be handled by the vectorized form of the loop and then either:
1896
1897 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1898
1899 niters_vector = N
1900
1901 b) set *STEP_VECTOR_PTR to one and generate:
1902
1903 niters_vector = N / vf
1904
1905 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1906 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1907 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
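/* Sketch of case (b), with assumed numbers: for NITERS = 17 and a constant
   VF of 4 (not fully masked), niters_vector is 17 >> 2 = 4 when NITERS is
   known not to overflow, or ((17 - 4) >> 2) + 1 = 4 otherwise, and
   *STEP_VECTOR_PTR is 1; the vector loop thus covers 16 scalar iterations
   and leaves 1 for the epilogue. */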
1908
1909 void
1910 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1911 tree *niters_vector_ptr, tree *step_vector_ptr,
1912 bool niters_no_overflow)
1913 {
1914 tree ni_minus_gap, var;
1915 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1916 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1917 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1918 tree log_vf = NULL_TREE;
1919
1920 /* If epilogue loop is required because of data accesses with gaps, we
1921 subtract one iteration from the total number of iterations here for
1922 correct calculation of RATIO. */
1923 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1924 {
1925 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1926 build_one_cst (type));
1927 if (!is_gimple_val (ni_minus_gap))
1928 {
1929 var = create_tmp_var (type, "ni_gap");
1930 gimple *stmts = NULL;
1931 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1932 true, var);
1933 gsi_insert_seq_on_edge_immediate (pe, stmts);
1934 }
1935 }
1936 else
1937 ni_minus_gap = niters;
1938
1939 unsigned HOST_WIDE_INT const_vf;
1940 if (vf.is_constant (&const_vf)
1941 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1942 {
1943 /* Create: niters >> log2(vf) */
1944 /* If it's known that niters == number of latch executions + 1 doesn't
1945 overflow, we can generate niters >> log2(vf); otherwise we generate
1946 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1947 will be at least one. */
1948 log_vf = build_int_cst (type, exact_log2 (const_vf));
1949 if (niters_no_overflow)
1950 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1951 else
1952 niters_vector
1953 = fold_build2 (PLUS_EXPR, type,
1954 fold_build2 (RSHIFT_EXPR, type,
1955 fold_build2 (MINUS_EXPR, type,
1956 ni_minus_gap,
1957 build_int_cst (type, vf)),
1958 log_vf),
1959 build_int_cst (type, 1));
1960 step_vector = build_one_cst (type);
1961 }
1962 else
1963 {
1964 niters_vector = ni_minus_gap;
1965 step_vector = build_int_cst (type, vf);
1966 }
1967
1968 if (!is_gimple_val (niters_vector))
1969 {
1970 var = create_tmp_var (type, "bnd");
1971 gimple_seq stmts = NULL;
1972 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1973 gsi_insert_seq_on_edge_immediate (pe, stmts);
1974 /* The peeling algorithm guarantees that the vector loop bound is at least
1975 ONE, so we set range information to make the niters analyzer's life easier. */
1976 if (stmts != NULL && log_vf)
1977 set_range_info (niters_vector, VR_RANGE,
1978 wi::to_wide (build_int_cst (type, 1)),
1979 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1980 TYPE_MAX_VALUE (type),
1981 log_vf)));
1982 }
1983 *niters_vector_ptr = niters_vector;
1984 *step_vector_ptr = step_vector;
1985
1986 return;
1987 }
1988
1989 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1990 loop specified by LOOP_VINFO after vectorization, compute the number
1991 of iterations before vectorization (niters_vector * vf) and store it
1992 to NITERS_VECTOR_MULT_VF_PTR. */
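/* For instance (illustrative only): with niters_vector = 4 and a constant
   VF of 8 this computes 4 << 3 = 32, the number of scalar iterations the
   vector loop handled. */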
1993
1994 static void
1995 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1996 tree niters_vector,
1997 tree *niters_vector_mult_vf_ptr)
1998 {
1999 /* We should be using a step_vector of VF if VF is variable. */
2000 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2002 tree type = TREE_TYPE (niters_vector);
2003 tree log_vf = build_int_cst (type, exact_log2 (vf));
2004 basic_block exit_bb = single_exit (loop)->dest;
2005
2006 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2007 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2008 niters_vector, log_vf);
2009 if (!is_gimple_val (niters_vector_mult_vf))
2010 {
2011 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2012 gimple_seq stmts = NULL;
2013 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2014 &stmts, true, var);
2015 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2016 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2017 }
2018 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2019 }
2020
2021 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2022 this function searches for the corresponding lcssa phi node in exit
2023 bb of LOOP. If it is found, return the phi result; otherwise return
2024 NULL. */
2025
2026 static tree
2027 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2028 gphi *lcssa_phi)
2029 {
2030 gphi_iterator gsi;
2031 edge e = single_exit (loop);
2032
2033 gcc_assert (single_pred_p (e->dest));
2034 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2035 {
2036 gphi *phi = gsi.phi ();
2037 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2038 PHI_ARG_DEF (lcssa_phi, 0), 0))
2039 return PHI_RESULT (phi);
2040 }
2041 return NULL_TREE;
2042 }
2043
2044 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2045 from SECOND/FIRST and puts it at the original loop's preheader/exit
2046 edge, the two loops are arranged as below:
2047
2048 preheader_a:
2049 first_loop:
2050 header_a:
2051 i_1 = PHI<i_0, i_2>;
2052 ...
2053 i_2 = i_1 + 1;
2054 if (cond_a)
2055 goto latch_a;
2056 else
2057 goto between_bb;
2058 latch_a:
2059 goto header_a;
2060
2061 between_bb:
2062 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2063
2064 second_loop:
2065 header_b:
2066 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2067 or with i_2 if no LCSSA phi is created
2068 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2069 ...
2070 i_4 = i_3 + 1;
2071 if (cond_b)
2072 goto latch_b;
2073 else
2074 goto exit_bb;
2075 latch_b:
2076 goto header_b;
2077
2078 exit_bb:
2079
2080 This function creates loop closed SSA for the first loop and updates the
2081 second loop's PHI nodes by replacing the argument on the incoming edge with
2082 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2083 is false, loop closed ssa phis will only be created for non-iv phis for
2084 the first loop.
2085
2086 This function assumes exit bb of the first loop is preheader bb of the
2087 second loop, i.e., between_bb in the example code. With PHIs updated,
2088 the second loop will execute the remaining iterations of the first. */
2089
2090 static void
2091 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2092 class loop *first, class loop *second,
2093 bool create_lcssa_for_iv_phis)
2094 {
2095 gphi_iterator gsi_update, gsi_orig;
2096 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2097
2098 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2099 edge second_preheader_e = loop_preheader_edge (second);
2100 basic_block between_bb = single_exit (first)->dest;
2101
2102 gcc_assert (between_bb == second_preheader_e->src);
2103 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2104 /* Either the first loop or the second is the loop to be vectorized. */
2105 gcc_assert (loop == first || loop == second);
2106
2107 for (gsi_orig = gsi_start_phis (first->header),
2108 gsi_update = gsi_start_phis (second->header);
2109 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2110 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2111 {
2112 gphi *orig_phi = gsi_orig.phi ();
2113 gphi *update_phi = gsi_update.phi ();
2114
2115 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2116 /* Generate lcssa PHI node for the first loop. */
2117 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2118 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2119 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2120 {
2121 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2122 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2123 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2124 arg = new_res;
2125 }
2126
2127 /* Update PHI node in the second loop by replacing arg on the loop's
2128 incoming edge. */
2129 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2130 }
2131
2132 /* For epilogue peeling we have to make sure to copy all LC PHIs
2133 for correct vectorization of live stmts. */
2134 if (loop == first)
2135 {
2136 basic_block orig_exit = single_exit (second)->dest;
2137 for (gsi_orig = gsi_start_phis (orig_exit);
2138 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2139 {
2140 gphi *orig_phi = gsi_orig.phi ();
2141 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2142 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2143 continue;
2144
2145 /* Already created in the above loop. */
2146 if (find_guard_arg (first, second, orig_phi))
2147 continue;
2148
2149 tree new_res = copy_ssa_name (orig_arg);
2150 gphi *lcphi = create_phi_node (new_res, between_bb);
2151 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2152 }
2153 }
2154 }
2155
2156 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2157 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2158 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2159 appear like below:
2160
2161 guard_bb:
2162 if (cond)
2163 goto merge_bb;
2164 else
2165 goto skip_loop;
2166
2167 skip_loop:
2168 header_a:
2169 i_1 = PHI<i_0, i_2>;
2170 ...
2171 i_2 = i_1 + 1;
2172 if (cond_a)
2173 goto latch_a;
2174 else
2175 goto exit_a;
2176 latch_a:
2177 goto header_a;
2178
2179 exit_a:
2180 i_5 = PHI<i_2>;
2181
2182 merge_bb:
2183 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2184
2185 update_loop:
2186 header_b:
2187 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2188 ...
2189 i_4 = i_3 + 1;
2190 if (cond_b)
2191 goto latch_b;
2192 else
2193 goto exit_bb;
2194 latch_b:
2195 goto header_b;
2196
2197 exit_bb:
2198
2199 This function creates PHI nodes at merge_bb and replaces the use of i_5
2200 in the update_loop's PHI node with the result of the new PHI node. */
2201
2202 static void
2203 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2204 class loop *update_loop,
2205 edge guard_edge, edge merge_edge)
2206 {
2207 location_t merge_loc, guard_loc;
2208 edge orig_e = loop_preheader_edge (skip_loop);
2209 edge update_e = loop_preheader_edge (update_loop);
2210 gphi_iterator gsi_orig, gsi_update;
2211
2212 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2213 gsi_update = gsi_start_phis (update_loop->header));
2214 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2215 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2216 {
2217 gphi *orig_phi = gsi_orig.phi ();
2218 gphi *update_phi = gsi_update.phi ();
2219
2220 /* Generate new phi node at merge bb of the guard. */
2221 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2222 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2223
2224 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2225 args in NEW_PHI for these edges. */
2226 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2227 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2228 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2229 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2230 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2231 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2232
2233 /* Update phi in UPDATE_PHI. */
2234 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2235 }
2236 }
2237
2238 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2239 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2240 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2241 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2242 The CFG looks like:
2243
2244 loop:
2245 header_a:
2246 i_1 = PHI<i_0, i_2>;
2247 ...
2248 i_2 = i_1 + 1;
2249 if (cond_a)
2250 goto latch_a;
2251 else
2252 goto exit_a;
2253 latch_a:
2254 goto header_a;
2255
2256 exit_a:
2257
2258 guard_bb:
2259 if (cond)
2260 goto merge_bb;
2261 else
2262 goto epilog_loop;
2263
2264 ;; fall_through_bb
2265
2266 epilog_loop:
2267 header_b:
2268 i_3 = PHI<i_2, i_4>;
2269 ...
2270 i_4 = i_3 + 1;
2271 if (cond_b)
2272 goto latch_b;
2273 else
2274 goto merge_bb;
2275 latch_b:
2276 goto header_b;
2277
2278 merge_bb:
2279 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2280
2281 exit_bb:
2282 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2283
2284 For each name used outside EPILOG (i.e., for each name that has an lcssa
2285 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2286 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2287 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2288 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2289 in exit_bb will also be updated. */
2290
2291 static void
2292 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2293 edge guard_edge, edge merge_edge)
2294 {
2295 gphi_iterator gsi;
2296 basic_block merge_bb = guard_edge->dest;
2297
2298 gcc_assert (single_succ_p (merge_bb));
2299 edge e = single_succ_edge (merge_bb);
2300 basic_block exit_bb = e->dest;
2301 gcc_assert (single_pred_p (exit_bb));
2302 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2303
2304 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2305 {
2306 gphi *update_phi = gsi.phi ();
2307 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2308
2309 tree merge_arg = NULL_TREE;
2310
2311 /* If the old argument is a SSA_NAME use its current_def. */
2312 if (TREE_CODE (old_arg) == SSA_NAME)
2313 merge_arg = get_current_def (old_arg);
2314 /* If it's a constant or doesn't have a current_def, just use the old
2315 argument. */
2316 if (!merge_arg)
2317 merge_arg = old_arg;
2318
2319 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2320 /* If the var is live after loop but not a reduction, we simply
2321 use the old arg. */
2322 if (!guard_arg)
2323 guard_arg = old_arg;
2324
2325 /* Create new phi node in MERGE_BB: */
2326 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2327 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2328
2329 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2330 the two PHI args in merge_phi for these edges. */
2331 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2332 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2333
2334 /* Update the original phi in exit_bb. */
2335 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2336 }
2337 }
2338
2339 /* EPILOG loop is duplicated from the original loop for vectorizing,
2340 the arg of its loop closed ssa PHI needs to be updated. */
2341
2342 static void
2343 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2344 {
2345 gphi_iterator gsi;
2346 basic_block exit_bb = single_exit (epilog)->dest;
2347
2348 gcc_assert (single_pred_p (exit_bb));
2349 edge e = EDGE_PRED (exit_bb, 0);
2350 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2351 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2352 }
2353
2354 /* Function vect_do_peeling.
2355
2356 Input:
2357 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2358
2359 preheader:
2360 LOOP:
2361 header_bb:
2362 loop_body
2363 if (exit_loop_cond) goto exit_bb
2364 else goto header_bb
2365 exit_bb:
2366
2367 - NITERS: The number of iterations of the loop.
2368 - NITERSM1: The number of iterations of the loop's latch.
2369 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2370 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2371 CHECK_PROFITABILITY is true.
2372 Output:
2373 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2374 iterate after vectorization; see vect_set_loop_condition for details.
2375 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2376 should be set to the number of scalar iterations handled by the
2377 vector loop. The SSA name is only used on exit from the loop.
2378
2379 This function peels prolog and epilog from the loop, adds guards skipping
2380 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2381 would look like:
2382
2383 guard_bb_1:
2384 if (prefer_scalar_loop) goto merge_bb_1
2385 else goto guard_bb_2
2386
2387 guard_bb_2:
2388 if (skip_prolog) goto merge_bb_2
2389 else goto prolog_preheader
2390
2391 prolog_preheader:
2392 PROLOG:
2393 prolog_header_bb:
2394 prolog_body
2395 if (exit_prolog_cond) goto prolog_exit_bb
2396 else goto prolog_header_bb
2397 prolog_exit_bb:
2398
2399 merge_bb_2:
2400
2401 vector_preheader:
2402 VECTOR LOOP:
2403 vector_header_bb:
2404 vector_body
2405 if (exit_vector_cond) goto vector_exit_bb
2406 else goto vector_header_bb
2407 vector_exit_bb:
2408
2409 guard_bb_3:
2410 if (skip_epilog) goto merge_bb_3
2411 else goto epilog_preheader
2412
2413 merge_bb_1:
2414
2415 epilog_preheader:
2416 EPILOG:
2417 epilog_header_bb:
2418 epilog_body
2419 if (exit_epilog_cond) goto merge_bb_3
2420 else goto epilog_header_bb
2421
2422 merge_bb_3:
2423
2424 Note this function peels the prolog and epilog, and adds the guards,
2425 only if necessary.
2426 This function returns the epilogue loop if a decision was made to vectorize
2427 it, otherwise NULL.
2428
2429 The analysis resulting in this epilogue loop's loop_vec_info was performed
2430 in the same vect_analyze_loop call as the main loop's. At that time
2431 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2432 vectorization factors than the main loop. This list is stored in the main
2433 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2434 vectorize the epilogue loop for a lower vectorization factor, the
2435 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2436 updated and linked to the epilogue loop. This is later used to vectorize
2437 the epilogue. The reason the loop_vec_info needs updating is that it was
2438 constructed based on the original main loop, and the epilogue loop is a
2439 copy of this loop, so all links pointing to statements in the original loop
2440 need updating. Furthermore, these loop_vec_infos share the
2441 data_reference's records, which will also need to be updated.
2442
2443 TODO: Guard for prefer_scalar_loop should be emitted along with
2444 versioning conditions if loop versioning is needed. */
2445
2446
2447 class loop *
2448 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2449 tree *niters_vector, tree *step_vector,
2450 tree *niters_vector_mult_vf_var, int th,
2451 bool check_profitability, bool niters_no_overflow,
2452 tree *advance)
2453 {
2454 edge e, guard_e;
2455 tree type = TREE_TYPE (niters), guard_cond;
2456 basic_block guard_bb, guard_to;
2457 profile_probability prob_prolog, prob_vector, prob_epilog;
2458 int estimated_vf;
2459 int prolog_peeling = 0;
2460 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2461 /* We currently do not support prolog peeling if the target alignment is not
2462 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2463 target alignment being constant. */
2464 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2465 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2466 return NULL;
2467
2468 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2469 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2470
2471 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2472 poly_uint64 bound_epilog = 0;
2473 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2474 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2475 bound_epilog += vf - 1;
2476 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2477 bound_epilog += 1;
2478 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2479 poly_uint64 bound_scalar = bound_epilog;
2480
2481 if (!prolog_peeling && !epilog_peeling)
2482 return NULL;
2483
2484 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2485 estimated_vf = vect_vf_for_cost (loop_vinfo);
2486 if (estimated_vf == 2)
2487 estimated_vf = 3;
2488 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2489 .apply_scale (estimated_vf - 1, estimated_vf);
2490
2491 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2492 class loop *first_loop = loop;
2493 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2494
2495 /* We might have a queued need to update virtual SSA form. As we
2496 delete the update SSA machinery below after doing a regular
2497 incremental SSA update during loop copying, make sure we don't
2498 lose that fact.
2499 ??? Needing to update virtual SSA form by renaming is unfortunate
2500 but not all of the vectorizer code inserting new loads / stores
2501 properly assigns virtual operands to those statements. */
2502 update_ssa (TODO_update_ssa_only_virtuals);
2503
2504 create_lcssa_for_virtual_phi (loop);
2505
2506 /* If we're vectorizing an epilogue loop, the update_ssa above will
2507 have ensured that the virtual operand is in SSA form throughout the
2508 vectorized main loop. Normally it is possible to trace the updated
2509 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2510 back to scalar-stmt vuses, meaning that the effect of the SSA update
2511 remains local to the main loop. However, there are rare cases in
2512 which the vectorized loop has vdefs even when the original scalar
2513 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2514 introduces clobbers of the temporary vector array, which in turn
2515 needs new vdefs. If the scalar loop doesn't write to memory, these
2516 new vdefs will be the only ones in the vector loop.
2517
2518 In that case, update_ssa will have added a new virtual phi to the
2519 main loop, which previously didn't need one. Ensure that we (locally)
2520 maintain LCSSA form for the virtual operand, just as we would have
2521 done if the virtual phi had existed from the outset. This makes it
2522 easier to duplicate the scalar epilogue loop below. */
2523 tree vop_to_rename = NULL_TREE;
2524 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2525 {
2526 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2527 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2528 }
2529
2530 if (MAY_HAVE_DEBUG_BIND_STMTS)
2531 {
2532 gcc_assert (!adjust_vec.exists ());
2533 adjust_vec.create (32);
2534 }
2535 initialize_original_copy_tables ();
2536
2537 /* Record the anchor bb at which the guard should be placed if the scalar
2538 loop might be preferred. */
2539 basic_block anchor = loop_preheader_edge (loop)->src;
2540
2541 /* Generate the number of iterations for the prolog loop. We do this here
2542 so that we can also get the upper bound on the number of iterations. */
2543 tree niters_prolog;
2544 int bound_prolog = 0;
2545 if (prolog_peeling)
2546 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2547 &bound_prolog);
2548 else
2549 niters_prolog = build_int_cst (type, 0);
2550
2551 loop_vec_info epilogue_vinfo = NULL;
2552 if (vect_epilogues)
2553 {
2554 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2555 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2556 }
2557
2558 tree niters_vector_mult_vf = NULL_TREE;
2559 /* Save NITERS before the loop, as it may be changed by the prologue. */
2560 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2561 edge update_e = NULL, skip_e = NULL;
2562 unsigned int lowest_vf = constant_lower_bound (vf);
2563 /* If we know the number of scalar iterations for the main loop we should
2564 check whether after the main loop there are enough iterations left over
2565 for the epilogue. */
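/* For example (assumed figures): with NITERS = 100, prolog_peeling = 3,
   no peeling for gaps and a main-loop VF of 16, eiters below is
   (100 - 3) % 16 = 1; an epilogue whose vector mode is a quarter the size
   of the main loop's (ratio 4) would need at least 16 / 4 = 4 iterations,
   so that epilogue_vinfo is discarded. */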
2566 if (vect_epilogues
2567 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2568 && prolog_peeling >= 0
2569 && known_eq (vf, lowest_vf))
2570 {
2571 unsigned HOST_WIDE_INT eiters
2572 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2573 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2574
2575 eiters -= prolog_peeling;
2576 eiters
2577 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2578
2579 unsigned int ratio;
2580 unsigned int epilogue_gaps
2581 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2582 while (!(constant_multiple_p
2583 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2584 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2585 && eiters >= lowest_vf / ratio + epilogue_gaps))
2586 {
2587 delete epilogue_vinfo;
2588 epilogue_vinfo = NULL;
2589 if (loop_vinfo->epilogue_vinfos.length () == 0)
2590 {
2591 vect_epilogues = false;
2592 break;
2593 }
2594 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2595 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2596 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2597 }
2598 }
2599 /* Prolog loop may be skipped. */
2600 bool skip_prolog = (prolog_peeling != 0);
2601 /* Skip this loop and go straight to the epilog when there are not
2602 enough iterations to enter the vectorized loop. In that case we add
2603 a runtime check on NITERS that bypasses the vector loop when NITERS
2604 is too small. If we know the number of scalar iterations at compile
2605 time, we only add this check when that number may be smaller than the
2606 number of iterations required to enter the vector loop; for this
2607 comparison we use the upper bounds on the prolog and epilog peeling.
2608 When we don't know the number of iterations and don't require
2609 versioning, the skip is not necessary, because in that case we have
2610 already asserted that there are enough scalar iterations to enter
2611 the main loop. When we are versioning, we only add such a skip if
2612 we have chosen to vectorize the epilogue. */
2613 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2614 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2615 bound_prolog + bound_epilog)
2616 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2617 || vect_epilogues));
2618 /* Epilog loop must be executed if the number of iterations for epilog
2619 loop is known at compile time, otherwise we need to add a check at
2620 the end of vector loop and skip to the end of epilog loop. */
2621 bool skip_epilog = (prolog_peeling < 0
2622 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2623 || !vf.is_constant ());
2624 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2625 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2626 skip_epilog = false;
2627
2628 if (skip_vector)
2629 {
2630 split_edge (loop_preheader_edge (loop));
2631
2632 /* Due to the order in which we peel prolog and epilog, we first
2633 propagate probability to the whole loop. The purpose is to
2634 avoid adjusting probabilities of both prolog and vector loops
2635 separately. Note in this case, the probability of epilog loop
2636 needs to be scaled back later. */
2637 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2638 if (prob_vector.initialized_p ())
2639 {
2640 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2641 scale_loop_profile (loop, prob_vector, 0);
2642 }
2643 }
2644
2645 dump_user_location_t loop_loc = find_loop_location (loop);
2646 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2647 if (vect_epilogues)
2648 /* Make sure to set the epilogue's scalar loop, such that we can
2649 use the original scalar loop as the remaining epilogue if necessary. */
2650 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2651 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2652
2653 if (prolog_peeling)
2654 {
2655 e = loop_preheader_edge (loop);
2656 if (!slpeel_can_duplicate_loop_p (loop, e))
2657 {
2658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2659 "loop can't be duplicated to preheader edge.\n");
2660 gcc_unreachable ();
2661 }
2662 /* Peel prolog and put it on preheader edge of loop. */
2663 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2664 if (!prolog)
2665 {
2666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2667 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2668 gcc_unreachable ();
2669 }
2670 prolog->force_vectorize = false;
2671 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2672 first_loop = prolog;
2673 reset_original_copy_tables ();
2674
2675 /* Update the number of iterations for prolog loop. */
2676 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2677 vect_set_loop_condition (prolog, NULL, niters_prolog,
2678 step_prolog, NULL_TREE, false);
2679
2680 /* Skip the prolog loop. */
2681 if (skip_prolog)
2682 {
2683 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2684 niters_prolog, build_int_cst (type, 0));
2685 guard_bb = loop_preheader_edge (prolog)->src;
2686 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2687 guard_to = split_edge (loop_preheader_edge (loop));
2688 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2689 guard_to, guard_bb,
2690 prob_prolog.invert (),
2691 irred_flag);
2692 e = EDGE_PRED (guard_to, 0);
2693 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2694 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2695
2696 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2697 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2698 }
2699
2700 /* Update init address of DRs. */
2701 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2702 /* Update niters for vector loop. */
2703 LOOP_VINFO_NITERS (loop_vinfo)
2704 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2705 LOOP_VINFO_NITERSM1 (loop_vinfo)
2706 = fold_build2 (MINUS_EXPR, type,
2707 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2708 bool new_var_p = false;
2709 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2710 /* It's guaranteed that vector loop bound before vectorization is at
2711 least VF, so set range information for newly generated var. */
2712 if (new_var_p)
2713 set_range_info (niters, VR_RANGE,
2714 wi::to_wide (build_int_cst (type, vf)),
2715 wi::to_wide (TYPE_MAX_VALUE (type)));
2716
2717 /* Prolog iterates at most bound_prolog times, latch iterates at
2718 most bound_prolog - 1 times. */
2719 record_niter_bound (prolog, bound_prolog - 1, false, true);
2720 delete_update_ssa ();
2721 adjust_vec_debug_stmts ();
2722 scev_reset ();
2723 }
2724
2725 if (epilog_peeling)
2726 {
2727 e = single_exit (loop);
2728 if (!slpeel_can_duplicate_loop_p (loop, e))
2729 {
2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2731 "loop can't be duplicated to exit edge.\n");
2732 gcc_unreachable ();
2733 }
2734 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2735 said epilog then we should use a copy of the main loop as a starting
2736 point. This loop may have already had some preliminary transformations
2737 to allow for more optimal vectorization, for example if-conversion.
2738 If we are not vectorizing the epilog then we should use the scalar loop
2739 as the transformations mentioned above make less or no sense when not
2740 vectorizing. */
2741 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2742 if (vop_to_rename)
2743 {
2744 /* Vectorizing the main loop can sometimes introduce a vdef to
2745 a loop that previously didn't have one; see the comment above
2746 the definition of VOP_TO_RENAME for details. The definition
2747 D that holds on E will then be different from the definition
2748 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2749 rename VOP_TO_RENAME to D when copying the loop.
2750
2751 The virtual operand is in LCSSA form for the main loop,
2752 and no stmt between the main loop and E needs a vdef,
2753 so we know that D is provided by a phi rather than by a
2754 vdef on a normal gimple stmt. */
2755 basic_block vdef_bb = e->src;
2756 gphi *vphi;
2757 while (!(vphi = get_virtual_phi (vdef_bb)))
2758 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2759 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2760 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2761 }
2762 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2763 if (!epilog)
2764 {
2765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2766 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2767 gcc_unreachable ();
2768 }
2769 epilog->force_vectorize = false;
2770 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2771
2772 /* The scalar version of the loop may be preferred. In this case, add
2773 a guard and skip to the epilog. Note this only happens when the
2774 number of iterations of the loop is unknown at compile time;
2775 otherwise the loop won't be vectorized. */
2776 if (skip_vector)
2777 {
2778 /* Additional epilogue iteration is peeled if gap exists. */
2779 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2780 bound_prolog, bound_epilog,
2781 th, &bound_scalar,
2782 check_profitability);
2783 /* Build guard against NITERSM1 since NITERS may overflow. */
2784 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2785 guard_bb = anchor;
2786 guard_to = split_edge (loop_preheader_edge (epilog));
2787 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2788 guard_to, guard_bb,
2789 prob_vector.invert (),
2790 irred_flag);
2791 skip_e = guard_e;
2792 e = EDGE_PRED (guard_to, 0);
2793 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2794 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2795
2796 /* Simply propagate profile info from guard_bb to guard_to which is
2797 a merge point of control flow. */
2798 guard_to->count = guard_bb->count;
2799
2800 /* Scale probability of epilog loop back.
2801 FIXME: We should avoid scaling down and back up. Profile may
2802 get lost if we scale down to 0. */
2803 basic_block *bbs = get_loop_body (epilog);
2804 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2805 bbs[i]->count = bbs[i]->count.apply_scale
2806 (bbs[i]->count,
2807 bbs[i]->count.apply_probability
2808 (prob_vector));
2809 free (bbs);
2810 }
2811
2812 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2813 /* If the loop is peeled a non-zero constant number of times, niters now
2814 refers to orig_niters - prolog_peeling, so it won't overflow even if
2815 orig_niters overflows. */
2816 niters_no_overflow |= (prolog_peeling > 0);
2817 vect_gen_vector_loop_niters (loop_vinfo, niters,
2818 niters_vector, step_vector,
2819 niters_no_overflow);
2820 if (!integer_onep (*step_vector))
2821 {
2822 /* On exit from the loop we will have an easy way of calculating
2823 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2824 until then. */
2825 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2826 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2827 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2828 }
2829 else
2830 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2831 &niters_vector_mult_vf);
2832 /* Update IVs of original loop as if they were advanced by
2833 niters_vector_mult_vf steps. */
2834 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2835 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2836 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2837 update_e);
2838
2839 if (skip_epilog)
2840 {
2841 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2842 niters, niters_vector_mult_vf);
2843 guard_bb = single_exit (loop)->dest;
2844 guard_to = split_edge (single_exit (epilog));
2845 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2846 skip_vector ? anchor : guard_bb,
2847 prob_epilog.invert (),
2848 irred_flag);
2849 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2850 single_exit (epilog));
2851 /* Only need to handle basic block before epilog loop if it's not
2852 the guard_bb, which is the case when skip_vector is true. */
2853 if (guard_bb != bb_before_epilog)
2854 {
2855 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2856
2857 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2858 }
2859 scale_loop_profile (epilog, prob_epilog, 0);
2860 }
2861 else
2862 slpeel_update_phi_nodes_for_lcssa (epilog);
2863
2864 unsigned HOST_WIDE_INT bound;
2865 if (bound_scalar.is_constant (&bound))
2866 {
2867 gcc_assert (bound != 0);
2868 /* -1 to convert loop iterations to latch iterations. */
2869 record_niter_bound (epilog, bound - 1, false, true);
2870 }
2871
2872 delete_update_ssa ();
2873 adjust_vec_debug_stmts ();
2874 scev_reset ();
2875 }
2876
2877 if (vect_epilogues)
2878 {
2879 epilog->aux = epilogue_vinfo;
2880 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2881
2882 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2883
2884 /* We now must calculate the number of iterations (NITERS) performed by
2885 the previous loop and those (EPILOGUE_NITERS) to be performed by the epilogue. */
2886 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2887 niters_prolog, niters_vector_mult_vf);
2888
2889 /* If skip_vector, the previous loop may be skipped, so we insert a
2890 phi-node to determine whether we are coming from the previous
2891 vectorized loop via the update_e edge or from the skip_vector
2892 basic block via the skip_e edge. */
2893 if (skip_vector)
2894 {
2895 gcc_assert (update_e != NULL && skip_e != NULL);
2896 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2897 update_e->dest);
2898 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2899 gimple *stmt = gimple_build_assign (new_ssa, niters);
2900 gimple_stmt_iterator gsi;
2901 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2902 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2903 {
2904 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2905 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2906 }
2907 else
2908 {
2909 gsi = gsi_last_bb (update_e->src);
2910 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2911 }
2912
2913 niters = new_ssa;
2914 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2915 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2916 UNKNOWN_LOCATION);
2917 niters = PHI_RESULT (new_phi);
2918 }
2919
2920 /* Subtract the number of iterations performed by the vectorized loop
2921 from the number of total iterations. */
2922 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2923 before_loop_niters,
2924 niters);
2925
2926 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2927 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2928 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2929 epilogue_niters,
2930 build_one_cst (TREE_TYPE (epilogue_niters)));
2931
2932 /* Set ADVANCE to the number of iterations performed by the previous
2933 loop and its prologue. */
2934 *advance = niters;
2935
2936 /* Redo the peeling for niter analysis as the NITERs and alignment
2937 may have been updated to take the main loop into account. */
2938 determine_peel_for_niter (epilogue_vinfo);
2939 }
2940
2941 adjust_vec.release ();
2942 free_original_copy_tables ();
2943
2944 return vect_epilogues ? epilog : NULL;
2945 }
2946
2947 /* Function vect_create_cond_for_niters_checks.
2948
2949 Create a conditional expression that represents the run-time checks for
2950 loop's niter. The loop is guaranteed to terminate if the run-time
2951 checks hold.
2952
2953 Input:
2954 COND_EXPR - input conditional expression. New conditions will be chained
2955 with logical AND operation. If it is NULL, the niters
2956 assumptions become the condition on their own.
2957 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2958 assumptions to be checked.
2959
2960 Output:
2961 COND_EXPR - conditional expression.
2962
2963 The returned COND_EXPR is the conditional expression to be used in the
2964 if statement that controls which version of the loop gets executed at
2965 runtime. */
2966
2967 static void
2968 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2969 {
2970 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2971
2972 if (*cond_expr)
2973 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2974 *cond_expr, part_cond_expr);
2975 else
2976 *cond_expr = part_cond_expr;
2977 }
2978
2979 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2980 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2981
2982 static void
2983 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2984 {
2985 if (*cond_expr)
2986 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2987 *cond_expr, part_cond_expr);
2988 else
2989 *cond_expr = part_cond_expr;
2990 }
2991
2992 /* Function vect_create_cond_for_align_checks.
2993
2994 Create a conditional expression that represents the alignment checks for
2995 all of the data references (array element references) whose alignment must be
2996 checked at runtime.
2997
2998 Input:
2999 COND_EXPR - input conditional expression. New conditions will be chained
3000 with logical AND operation.
3001 LOOP_VINFO - two fields of the loop information are used.
3002 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3003 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3004
3005 Output:
3006 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3007 expression.
3008 The returned value is the conditional expression to be used in the if
3009 statement that controls which version of the loop gets executed at runtime.
3010
3011 The algorithm makes two assumptions:
3012 1) The number of bytes "n" in a vector is a power of 2.
3013 2) An address "a" is aligned if a%n is zero, and this
3014 test can be done as a&(n-1) == 0. For example, for 16
3015 byte vectors the test is a&0xf == 0. */
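/* Worked illustration (addresses are invented): with two data references
   whose first-vector addresses are 0x1000 and 0x2008 and mask = 0xf,
   (0x1000 | 0x2008) & 0xf = 0x8 != 0, so at least one reference is
   misaligned and the scalar version is taken; with 0x1000 and 0x2010 the
   result is 0 and the vectorized version may run. */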
3016
3017 static void
3018 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3019 tree *cond_expr,
3020 gimple_seq *cond_expr_stmt_list)
3021 {
3022 vec<stmt_vec_info> may_misalign_stmts
3023 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3024 stmt_vec_info stmt_info;
3025 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3026 tree mask_cst;
3027 unsigned int i;
3028 tree int_ptrsize_type;
3029 char tmp_name[20];
3030 tree or_tmp_name = NULL_TREE;
3031 tree and_tmp_name;
3032 gimple *and_stmt;
3033 tree ptrsize_zero;
3034 tree part_cond_expr;
3035
3036 /* Check that mask is one less than a power of 2, i.e., mask is
3037 all zeros followed by all ones. */
3038 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3039
3040 int_ptrsize_type = signed_type_for (ptr_type_node);
3041
3042 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3043 of the first vector of the i'th data reference. */
3044
3045 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3046 {
3047 gimple_seq new_stmt_list = NULL;
3048 tree addr_base;
3049 tree addr_tmp_name;
3050 tree new_or_tmp_name;
3051 gimple *addr_stmt, *or_stmt;
3052 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3053 bool negative = tree_int_cst_compare
3054 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3055 tree offset = negative
3056 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3057
3058 /* create: addr_tmp = (int)(address_of_first_vector) */
3059 addr_base =
3060 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3061 offset);
3062 if (new_stmt_list != NULL)
3063 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3064
3065 sprintf (tmp_name, "addr2int%d", i);
3066 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3067 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3068 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3069
3070 /* The addresses are ORed together. */
3071
3072 if (or_tmp_name != NULL_TREE)
3073 {
3074 /* create: or_tmp = or_tmp | addr_tmp */
3075 sprintf (tmp_name, "orptrs%d", i);
3076 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3077 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3078 or_tmp_name, addr_tmp_name);
3079 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3080 or_tmp_name = new_or_tmp_name;
3081 }
3082 else
3083 or_tmp_name = addr_tmp_name;
3084
3085 } /* end for i */
3086
3087 mask_cst = build_int_cst (int_ptrsize_type, mask);
3088
3089 /* create: and_tmp = or_tmp & mask */
3090 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3091
3092 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3093 or_tmp_name, mask_cst);
3094 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3095
3096 /* Make and_tmp the left operand of the conditional test against zero.
3097 If and_tmp has a nonzero bit then some address is unaligned. */
3098 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3099 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3100 and_tmp_name, ptrsize_zero);
3101 chain_cond_expr (cond_expr, part_cond_expr);
3102 }
3103
3104 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3105 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3106 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3107 and this new condition are true. Treat a null *COND_EXPR as "true". */
3108
3109 static void
3110 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3111 {
3112 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3113 unsigned int i;
3114 vec_object_pair *pair;
3115 FOR_EACH_VEC_ELT (pairs, i, pair)
3116 {
3117 tree addr1 = build_fold_addr_expr (pair->first);
3118 tree addr2 = build_fold_addr_expr (pair->second);
3119 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3120 addr1, addr2);
3121 chain_cond_expr (cond_expr, part_cond_expr);
3122 }
3123 }
3124
3125 /* Create an expression that is true when all lower-bound conditions for
3126 the vectorized loop are met. Chain this condition with *COND_EXPR. */
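/* Illustration (assumed values): for a signed EXPR with min_value = 4 the
   code below emits (unsigned) EXPR + 3 >= 7, which holds for EXPR = 4 or
   EXPR = -4 but not for EXPR = 2 or EXPR = -2, so values of sufficient
   magnitude pass the check regardless of sign. */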
3127
3128 static void
3129 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3130 {
3131 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3132 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3133 {
3134 tree expr = lower_bounds[i].expr;
3135 tree type = unsigned_type_for (TREE_TYPE (expr));
3136 expr = fold_convert (type, expr);
3137 poly_uint64 bound = lower_bounds[i].min_value;
3138 if (!lower_bounds[i].unsigned_p)
3139 {
3140 expr = fold_build2 (PLUS_EXPR, type, expr,
3141 build_int_cstu (type, bound - 1));
3142 bound += bound - 1;
3143 }
3144 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3145 build_int_cstu (type, bound));
3146 chain_cond_expr (cond_expr, part_cond_expr);
3147 }
3148 }
3149
3150 /* Function vect_create_cond_for_alias_checks.
3151
3152 Create a conditional expression that represents the run-time checks for
3153 overlapping of address ranges represented by a list of data references
3154 relations passed as input.
3155
3156 Input:
3157 COND_EXPR - input conditional expression. New conditions will be chained
3158 with logical AND operation. If it is NULL, the alias
3159 checks become the condition on their own.
3160 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3161 to be checked.
3162
3163 Output:
3164 COND_EXPR - conditional expression.
3165
3166 The returned COND_EXPR is the conditional expression to be used in the if
3167 statement that controls which version of the loop gets executed at runtime.
3168 */
3169
3170 void
3171 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3172 {
3173 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3174 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3175
3176 if (comp_alias_ddrs.is_empty ())
3177 return;
3178
3179 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3180 &comp_alias_ddrs, cond_expr);
3181 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE, vect_location,
3183 "created %u versioning for alias checks.\n",
3184 comp_alias_ddrs.length ());
3185 }
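/* For illustration only: for each DDR pair that could not be disambiguated
   at compile time, create_runtime_alias_checks emits a test that the two
   accessed segments do not overlap, conceptually of the form

       addr_a + segment_length_a <= addr_b
       || addr_b + segment_length_b <= addr_a

   (the actual checks may be merged or simplified); all of them are chained
   into COND_EXPR with logical AND.  */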
3186
3187
3188 /* Function vect_loop_versioning.
3189
3190 If the loop has data references that may or may not be aligned and/or
3191 has data reference relations whose independence was not proven then
3192 two versions of the loop need to be generated, one which is vectorized
3193 and one which isn't. A test is then generated to control which of the
3194 loops is executed. The test checks for the alignment of all of the
3195 data references that may or may not be aligned. An additional
3196 sequence of runtime tests is generated for each pair of DDRs whose
3197 independence was not proven. The vectorized version of the loop is
3198 executed only if both the alias and the alignment tests pass.
3199
3200 The test generated to check which version of the loop is executed
3201 is modified to also check for profitability as indicated by the
3202 cost model threshold TH.
3203
3204 The versioning precondition(s) are placed in *COND_EXPR and
3205 *COND_EXPR_STMT_LIST. */
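/* Schematically, the control flow produced below is

       if (versioning-cond)   <-- cost, niters, alignment, alias and simd-if
         loop                     checks chained with logical AND
       else
         nloop                <-- scalar copy, or the scalar loop already
                                  created by if-conversion

   and NLOOP is returned.  */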
3206
3207 class loop *
3208 vect_loop_versioning (loop_vec_info loop_vinfo,
3209 gimple *loop_vectorized_call)
3210 {
3211 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3212 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3213 basic_block condition_bb;
3214 gphi_iterator gsi;
3215 gimple_stmt_iterator cond_exp_gsi;
3216 basic_block merge_bb;
3217 basic_block new_exit_bb;
3218 edge new_exit_e, e;
3219 gphi *orig_phi, *new_phi;
3220 tree cond_expr = NULL_TREE;
3221 gimple_seq cond_expr_stmt_list = NULL;
3222 tree arg;
3223 profile_probability prob = profile_probability::likely ();
3224 gimple_seq gimplify_stmt_list = NULL;
3225 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3226 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3227 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3228 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3229 poly_uint64 versioning_threshold
3230 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3231 tree version_simd_if_cond
3232 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234
3235 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3236 && !ordered_p (th, versioning_threshold))
3237 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3238 build_int_cst (TREE_TYPE (scalar_loop_iters),
3239 th - 1));
3240 if (maybe_ne (versioning_threshold, 0U))
3241 {
3242 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3243 build_int_cst (TREE_TYPE (scalar_loop_iters),
3244 versioning_threshold - 1));
3245 if (cond_expr)
3246 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3247 expr, cond_expr);
3248 else
3249 cond_expr = expr;
3250 }
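  /* For illustration only: SCALAR_LOOP_ITERS is NITERSM1, the iteration
     count minus one, so a threshold of e.g. 8 yields the guard
     scalar_loop_iters >= 7, i.e. the vectorized path is entered only when
     the loop iterates at least 8 times.  */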
3251
3252 if (version_niter)
3253 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3254
3255 if (cond_expr)
3256 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3257 &cond_expr_stmt_list,
3258 is_gimple_condexpr, NULL_TREE);
3259
3260 if (version_align)
3261 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3262 &cond_expr_stmt_list);
3263
3264 if (version_alias)
3265 {
3266 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3267 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3268 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3269 }
3270
3271 if (version_simd_if_cond)
3272 {
3273 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3274 if (flag_checking)
3275 if (basic_block bb
3276 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3277 gcc_assert (bb != loop->header
3278 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3279 && (scalar_loop == NULL
3280 || (bb != scalar_loop->header
3281 && dominated_by_p (CDI_DOMINATORS,
3282 scalar_loop->header, bb))));
3283 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3284 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3285 version_simd_if_cond, zero);
3286 if (cond_expr)
3287 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3288 c, cond_expr);
3289 else
3290 cond_expr = c;
3291 if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "created versioning for simd if condition check.\n");
3294 }
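  /* For illustration only: the guard added above is simply
     version_simd_if_cond != 0, so the vectorized loop is used only when
     that condition is nonzero at run time (it typically originates from an
     "if" clause on a simd construct).  */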
3295
3296 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3297 &gimplify_stmt_list,
3298 is_gimple_condexpr, NULL_TREE);
3299 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3300
3301 /* Compute the outermost loop in which cond_expr and cond_expr_stmt_list
3302 are invariant. */
3303 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3304 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3305 !gsi_end_p (gsi); gsi_next (&gsi))
3306 {
3307 gimple *stmt = gsi_stmt (gsi);
3308 update_stmt (stmt);
3309 ssa_op_iter iter;
3310 use_operand_p use_p;
3311 basic_block def_bb;
3312 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3313 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3314 && flow_bb_inside_loop_p (outermost, def_bb))
3315 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3316 }
3317
3318 /* Search for the outermost loop we can version. Avoid versioning of
3319 non-perfect nests but allow if-conversion versioned loops inside. */
3320 class loop *loop_to_version = loop;
3321 if (flow_loop_nested_p (outermost, loop))
3322 {
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_NOTE, vect_location,
3325 "trying to apply versioning to outer loop %d\n",
3326 outermost->num);
3327 if (outermost->num == 0)
3328 outermost = superloop_at_depth (loop, 1);
3329 /* And avoid applying versioning on non-perfect nests. */
3330 while (loop_to_version != outermost
3331 && single_exit (loop_outer (loop_to_version))
3332 && (!loop_outer (loop_to_version)->inner->next
3333 || vect_loop_vectorized_call (loop_to_version))
3334 && (!loop_outer (loop_to_version)->inner->next
3335 || !loop_outer (loop_to_version)->inner->next->next))
3336 loop_to_version = loop_outer (loop_to_version);
3337 }
3338
3339 /* Apply versioning. If there is already a scalar version created by
3340 if-conversion, re-use that. Note we cannot re-use the copy of
3341 an if-converted outer-loop when vectorizing the inner loop only. */
3342 gcond *cond;
3343 if ((!loop_to_version->inner || loop == loop_to_version)
3344 && loop_vectorized_call)
3345 {
3346 gcc_assert (scalar_loop);
3347 condition_bb = gimple_bb (loop_vectorized_call);
3348 cond = as_a <gcond *> (last_stmt (condition_bb));
3349 gimple_cond_set_condition_from_tree (cond, cond_expr);
3350 update_stmt (cond);
3351
3352 if (cond_expr_stmt_list)
3353 {
3354 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3355 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3356 GSI_SAME_STMT);
3357 }
3358
3359 /* if-conversion uses profile_probability::always () for both paths;
3360 reset both probabilities appropriately. */
3361 edge te, fe;
3362 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3363 te->probability = prob;
3364 fe->probability = prob.invert ();
3365 /* We can scale the loop counts immediately but have to postpone
3366 scaling the scalar loop because we re-use it during peeling. */
3367 scale_loop_frequencies (loop_to_version, te->probability);
3368 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3369
3370 nloop = scalar_loop;
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_NOTE, vect_location,
3373 "reusing %sloop version created by if conversion\n",
3374 loop_to_version != loop ? "outer " : "");
3375 }
3376 else
3377 {
3378 if (loop_to_version != loop
3379 && dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE, vect_location,
3381 "applying loop versioning to outer loop %d\n",
3382 loop_to_version->num);
3383
3384 initialize_original_copy_tables ();
3385 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3386 prob, prob.invert (), prob, prob.invert (), true);
3387 gcc_assert (nloop);
3388 nloop = get_loop_copy (loop);
3389
3390 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3391 reap those otherwise, and they also refer to the original
3392 loops. */
3393 class loop *l = loop;
3394 while (gimple *call = vect_loop_vectorized_call (l))
3395 {
3396 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3397 fold_loop_internal_call (call, boolean_false_node);
3398 l = loop_outer (l);
3399 }
3400 free_original_copy_tables ();
3401
3402 if (cond_expr_stmt_list)
3403 {
3404 cond_exp_gsi = gsi_last_bb (condition_bb);
3405 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3406 GSI_SAME_STMT);
3407 }
3408
3409 /* Loop versioning violates an assumption we try to maintain during
3410 vectorization - that the loop exit block has a single predecessor.
3411 After versioning, the exit block of both loop versions is the same
3412 basic block (i.e. it has two predecessors). Just in order to simplify
3413 following transformations in the vectorizer, we fix this situation
3414 here by adding a new (empty) block on the exit-edge of the loop,
3415 with the proper loop-exit phis to maintain loop-closed-form.
3416 If loop versioning wasn't done from loop, but from scalar_loop instead,
3417 merge_bb will already have just a single predecessor. */
3418
3419 merge_bb = single_exit (loop_to_version)->dest;
3420 if (EDGE_COUNT (merge_bb->preds) >= 2)
3421 {
3422 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3423 new_exit_bb = split_edge (single_exit (loop_to_version));
3424 new_exit_e = single_exit (loop_to_version);
3425 e = EDGE_SUCC (new_exit_bb, 0);
3426
3427 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3428 gsi_next (&gsi))
3429 {
3430 tree new_res;
3431 orig_phi = gsi.phi ();
3432 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3433 new_phi = create_phi_node (new_res, new_exit_bb);
3434 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3435 add_phi_arg (new_phi, arg, new_exit_e,
3436 gimple_phi_arg_location_from_edge (orig_phi, e));
3437 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3438 }
3439 }
3440
3441 update_ssa (TODO_update_ssa);
3442 }
3443
3444 if (version_niter)
3445 {
3446 /* The versioned loop could be infinite; we need to clear the existing
3447 niter information, which was copied from the original loop. */
3448 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3449 vect_free_loop_info_assumptions (nloop);
3450 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3451 loop_constraint_set (loop, LOOP_C_INFINITE);
3452 }
3453
3454 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3455 && dump_enabled_p ())
3456 {
3457 if (version_alias)
3458 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3459 vect_location,
3460 "loop versioned for vectorization because of "
3461 "possible aliasing\n");
3462 if (version_align)
3463 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3464 vect_location,
3465 "loop versioned for vectorization to enhance "
3466 "alignment\n");
3467
3468 }
3469
3470 return nloop;
3471 }