[vect]PR 88915: Vectorize epilogues when versioning loops
[gcc.git] / gcc / tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50
51 /*************************************************************************
52 Simple Loop Peeling Utilities
53
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58 /* Renames the use *OP_P. */
59
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63 tree new_name;
64
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
67
68 new_name = get_current_def (USE_FROM_PTR (op_p));
69
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
73
74 /* An ordinary ssa name defined in the loop. */
75
76 SET_USE (op_p, new_name);
77 }
78
79
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
83
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 class loop *loop = bb->loop_father;
93 class loop *outer_loop = NULL;
94
95 if (rename_from_outer_loop)
96 {
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
99 }
100
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
103 {
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
107 }
108
109 FOR_EACH_EDGE (e, ei, bb->preds)
110 {
111 if (!flow_bb_inside_loop_p (loop, e->src))
112 {
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
116 {
117 if (outer_loop->inner->next)
118 {
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
121 two loops to use using LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
125 }
126 }
127 }
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131 }
132 }
133
134
135 struct adjust_info
136 {
137 tree from, to;
138 basic_block bb;
139 };
140
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
148
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
152
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168 {
169 use_operand_p use_p;
170 basic_block bbuse;
171
172 if (!is_gimple_debug (stmt))
173 continue;
174
175 gcc_assert (gimple_debug_bind_p (stmt));
176
177 bbuse = gimple_bb (stmt);
178
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 {
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
188 {
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
191 }
192 }
193 }
194 }
195
196 /* Adjust debug stmts as scheduled before. */
197
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
203
204 gcc_assert (adjust_vec.exists ());
205
206 while (!adjust_vec.is_empty ())
207 {
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
210 }
211 }
212
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
217
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221 adjust_info ai;
222
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
227 {
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
231
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
236 }
237 }
238
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
243
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
254 }
255
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
259
260 static void
261 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
262 tree next_mask)
263 {
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
268
269 /* Add SEQ to the end of LOOP's preheader block. */
270
271 static void
272 add_preheader_seq (class loop *loop, gimple_seq seq)
273 {
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280 }
281
282 /* Add SEQ to the beginning of LOOP's header block. */
283
284 static void
285 add_header_seq (class loop *loop, gimple_seq seq)
286 {
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292 }
293
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
298
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302 {
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
314
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
318
319 static bool
320 vect_maybe_permute_loop_masks (loop_vec_info loop_vinfo, gimple_seq *seq,
321 rgroup_masks *dest_rgm,
322 rgroup_masks *src_rgm)
323 {
324 tree src_masktype = src_rgm->mask_type;
325 tree dest_masktype = dest_rgm->mask_type;
326 machine_mode src_mode = TYPE_MODE (src_masktype);
327 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
328 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
329 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
330 {
331 /* Unpacking the source masks gives at least as many mask bits as
332 we need. We can then VIEW_CONVERT any excess bits away. */
333 tree unpack_masktype = vect_halve_mask_nunits (loop_vinfo, src_masktype);
334 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
335 {
336 tree src = src_rgm->masks[i / 2];
337 tree dest = dest_rgm->masks[i];
338 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
339 ? VEC_UNPACK_HI_EXPR
340 : VEC_UNPACK_LO_EXPR);
341 gassign *stmt;
342 if (dest_masktype == unpack_masktype)
343 stmt = gimple_build_assign (dest, code, src);
344 else
345 {
346 tree temp = make_ssa_name (unpack_masktype);
347 stmt = gimple_build_assign (temp, code, src);
348 gimple_seq_add_stmt (seq, stmt);
349 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
350 build1 (VIEW_CONVERT_EXPR,
351 dest_masktype, temp));
352 }
353 gimple_seq_add_stmt (seq, stmt);
354 }
355 return true;
356 }
357 vec_perm_indices indices[2];
358 if (dest_masktype == src_masktype
359 && interleave_supported_p (&indices[0], src_masktype, 0)
360 && interleave_supported_p (&indices[1], src_masktype, 1))
361 {
362 /* The destination requires twice as many mask bits as the source, so
363 we can use interleaving permutes to double up the number of bits. */
364 tree masks[2];
365 for (unsigned int i = 0; i < 2; ++i)
366 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
367 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
368 {
369 tree src = src_rgm->masks[i / 2];
370 tree dest = dest_rgm->masks[i];
371 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
372 src, src, masks[i & 1]);
373 gimple_seq_add_stmt (seq, stmt);
374 }
375 return true;
376 }
377 return false;
378 }
379
380 /* Helper for vect_set_loop_condition_masked. Generate definitions for
381 all the masks in RGM and return a mask that is nonzero when the loop
382 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
383 Use LOOP_COND_GSI to insert code before the exit gcond.
384
385 RGM belongs to loop LOOP. The loop originally iterated NITERS
386 times and has been vectorized according to LOOP_VINFO.
387
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
392
393 It is known that:
394
395 NITERS * RGM->max_nscalars_per_iter
396
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
399
400 VF * RGM->max_nscalars_per_iter
401
402 might overflow before hitting a value above:
403
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
408
409 static tree
410 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree niters, tree niters_skip,
414 bool might_wrap_p)
415 {
416 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
417 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
418 tree mask_type = rgm->mask_type;
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
422
423 /* Calculate the maximum number of scalar values that the rgroup
424 handles in total, the number that it handles for each iteration
425 of the vector loop, and the number that it should skip during the
426 first iteration of the vector loop. */
427 tree nscalars_total = niters;
428 tree nscalars_step = build_int_cst (iv_type, vf);
429 tree nscalars_skip = niters_skip;
430 if (nscalars_per_iter != 1)
431 {
432 /* We checked before choosing to use a fully-masked loop that these
433 multiplications don't overflow. */
434 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
435 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
436 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 nscalars_total, compare_factor);
438 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
439 nscalars_step, iv_factor);
440 if (nscalars_skip)
441 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
442 nscalars_skip, compare_factor);
443 }
444
445 /* Create an induction variable that counts the number of scalars
446 processed. */
447 tree index_before_incr, index_after_incr;
448 gimple_stmt_iterator incr_gsi;
449 bool insert_after;
450 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
451 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
452 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
453
454 tree zero_index = build_int_cst (compare_type, 0);
455 tree test_index, test_limit, first_limit;
456 gimple_stmt_iterator *test_gsi;
457 if (might_wrap_p)
458 {
459 /* In principle the loop should stop iterating once the incremented
460 IV reaches a value greater than or equal to:
461
462 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
463
464 However, there's no guarantee that this addition doesn't overflow
465 the comparison type, or that the IV hits a value above it before
466 wrapping around. We therefore adjust the limit down by one
467 IV step:
468
469 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
470 -[infinite-prec] NSCALARS_STEP
471
472 and compare the IV against this limit _before_ incrementing it.
473 Since the comparison type is unsigned, we actually want the
474 subtraction to saturate at zero:
475
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[sat] NSCALARS_STEP
478
479 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
480
481 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
482
483 where the rightmost subtraction can be done directly in
484 COMPARE_TYPE. */
485 test_index = index_before_incr;
486 tree adjust = gimple_convert (preheader_seq, compare_type,
487 nscalars_step);
488 if (nscalars_skip)
489 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
490 adjust, nscalars_skip);
491 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
492 nscalars_total, adjust);
493 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
494 test_limit, adjust);
495 test_gsi = &incr_gsi;
496
497 /* Get a safe limit for the first iteration. */
498 if (nscalars_skip)
499 {
500 /* The first vector iteration can handle at most NSCALARS_STEP
501 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
502 NSCALARS_SKIP to that cannot overflow. */
503 tree const_limit = build_int_cst (compare_type,
504 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
505 * nscalars_per_iter);
506 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
507 nscalars_total, const_limit);
508 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
509 first_limit, nscalars_skip);
510 }
511 else
512 /* For the first iteration it doesn't matter whether the IV hits
513 a value above NSCALARS_TOTAL. That only matters for the latch
514 condition. */
515 first_limit = nscalars_total;
516 }
517 else
518 {
519 /* Test the incremented IV, which will always hit a value above
520 the bound before wrapping. */
521 test_index = index_after_incr;
522 test_limit = nscalars_total;
523 if (nscalars_skip)
524 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
525 test_limit, nscalars_skip);
526 test_gsi = &loop_cond_gsi;
527
528 first_limit = test_limit;
529 }
530
531 /* Convert the IV value to the comparison type (either a no-op or
532 a demotion). */
533 gimple_seq test_seq = NULL;
534 test_index = gimple_convert (&test_seq, compare_type, test_index);
535 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
536
537 /* Provide a definition of each mask in the group. */
538 tree next_mask = NULL_TREE;
539 tree mask;
540 unsigned int i;
541 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
542 {
543 /* Previous masks will cover BIAS scalars. This mask covers the
544 next batch. */
545 poly_uint64 bias = nscalars_per_mask * i;
546 tree bias_tree = build_int_cst (compare_type, bias);
547 gimple *tmp_stmt;
548
549 /* See whether the first iteration of the vector loop is known
550 to have a full mask. */
551 poly_uint64 const_limit;
552 bool first_iteration_full
553 = (poly_int_tree_p (first_limit, &const_limit)
554 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
555
556 /* Rather than have a new IV that starts at BIAS and goes up to
557 TEST_LIMIT, prefer to use the same 0-based IV for each mask
558 and adjust the bound down by BIAS. */
559 tree this_test_limit = test_limit;
560 if (i != 0)
561 {
562 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
563 compare_type, this_test_limit,
564 bias_tree);
565 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
566 compare_type, this_test_limit,
567 bias_tree);
568 }
569
570 /* Create the initial mask. First include all scalars that
571 are within the loop limit. */
572 tree init_mask = NULL_TREE;
573 if (!first_iteration_full)
574 {
575 tree start, end;
576 if (first_limit == test_limit)
577 {
578 /* Use a natural test between zero (the initial IV value)
579 and the loop limit. The "else" block would be valid too,
580 but this choice can avoid the need to load BIAS_TREE into
581 a register. */
582 start = zero_index;
583 end = this_test_limit;
584 }
585 else
586 {
587 /* FIRST_LIMIT is the maximum number of scalars handled by the
588 first iteration of the vector loop. Test the portion
589 associated with this mask. */
590 start = bias_tree;
591 end = first_limit;
592 }
593
594 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
595 tmp_stmt = vect_gen_while (init_mask, start, end);
596 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
597 }
598
599 /* Now AND out the bits that are within the number of skipped
600 scalars. */
601 poly_uint64 const_skip;
602 if (nscalars_skip
603 && !(poly_int_tree_p (nscalars_skip, &const_skip)
604 && known_le (const_skip, bias)))
605 {
606 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
607 bias_tree, nscalars_skip);
608 if (init_mask)
609 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
610 init_mask, unskipped_mask);
611 else
612 init_mask = unskipped_mask;
613 }
614
615 if (!init_mask)
616 /* First iteration is full. */
617 init_mask = build_minus_one_cst (mask_type);
618
619 /* Get the mask value for the next iteration of the loop. */
620 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
621 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
622 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
623
624 vect_set_loop_mask (loop, mask, init_mask, next_mask);
625 }
626 return next_mask;
627 }
628
629 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
630 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
631 number of iterations of the original scalar loop that should be
632 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
633 as for vect_set_loop_condition.
634
635 Insert the branch-back condition before LOOP_COND_GSI and return the
636 final gcond. */
637
638 static gcond *
639 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
640 tree niters, tree final_iv,
641 bool niters_maybe_zero,
642 gimple_stmt_iterator loop_cond_gsi)
643 {
644 gimple_seq preheader_seq = NULL;
645 gimple_seq header_seq = NULL;
646
647 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
648 unsigned int compare_precision = TYPE_PRECISION (compare_type);
649 tree orig_niters = niters;
650
651 /* Type of the initial value of NITERS. */
652 tree ni_actual_type = TREE_TYPE (niters);
653 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
654 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
655
656 /* Convert NITERS to the same size as the compare. */
657 if (compare_precision > ni_actual_precision
658 && niters_maybe_zero)
659 {
660 /* We know that there is always at least one iteration, so if the
661 count is zero then it must have wrapped. Cope with this by
662 subtracting 1 before the conversion and adding 1 to the result. */
663 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
664 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
665 niters, build_minus_one_cst (ni_actual_type));
666 niters = gimple_convert (&preheader_seq, compare_type, niters);
667 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
668 niters, build_one_cst (compare_type));
669 }
670 else
671 niters = gimple_convert (&preheader_seq, compare_type, niters);
672
673 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
674
675 /* Iterate over all the rgroups and fill in their masks. We could use
676 the first mask from any rgroup for the loop condition; here we
677 arbitrarily pick the last. */
678 tree test_mask = NULL_TREE;
679 rgroup_masks *rgm;
680 unsigned int i;
681 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
682 FOR_EACH_VEC_ELT (*masks, i, rgm)
683 if (!rgm->masks.is_empty ())
684 {
685 /* First try using permutes. This adds a single vector
686 instruction to the loop for each mask, but needs no extra
687 loop invariants or IVs. */
688 unsigned int nmasks = i + 1;
689 if ((nmasks & 1) == 0)
690 {
691 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
692 if (!half_rgm->masks.is_empty ()
693 && vect_maybe_permute_loop_masks (loop_vinfo, &header_seq,
694 rgm, half_rgm))
695 continue;
696 }
697
698 /* See whether zero-based IV would ever generate all-false masks
699 before wrapping around. */
700 bool might_wrap_p
701 = (iv_limit == -1
702 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
703 UNSIGNED)
704 > compare_precision));
705
706 /* Set up all masks for this group. */
707 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
708 &preheader_seq,
709 loop_cond_gsi, rgm,
710 niters, niters_skip,
711 might_wrap_p);
712 }
713
714 /* Emit all accumulated statements. */
715 add_preheader_seq (loop, preheader_seq);
716 add_header_seq (loop, header_seq);
717
718 /* Get a boolean result that tells us whether to iterate. */
719 edge exit_edge = single_exit (loop);
720 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
721 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
722 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
723 NULL_TREE, NULL_TREE);
724 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
725
726 /* The loop iterates (NITERS - 1) / VF + 1 times.
727 Subtract one from this to get the latch count. */
728 tree step = build_int_cst (compare_type,
729 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
730 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
731 build_minus_one_cst (compare_type));
732 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
733 niters_minus_one, step);
734
735 if (final_iv)
736 {
737 gassign *assign = gimple_build_assign (final_iv, orig_niters);
738 gsi_insert_on_edge_immediate (single_exit (loop), assign);
739 }
740
741 return cond_stmt;
742 }
743
744 /* Like vect_set_loop_condition, but handle the case in which there
745 are no loop masks. */
746
747 static gcond *
748 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
749 tree step, tree final_iv,
750 bool niters_maybe_zero,
751 gimple_stmt_iterator loop_cond_gsi)
752 {
753 tree indx_before_incr, indx_after_incr;
754 gcond *cond_stmt;
755 gcond *orig_cond;
756 edge pe = loop_preheader_edge (loop);
757 edge exit_edge = single_exit (loop);
758 gimple_stmt_iterator incr_gsi;
759 bool insert_after;
760 enum tree_code code;
761 tree niters_type = TREE_TYPE (niters);
762
763 orig_cond = get_loop_exit_condition (loop);
764 gcc_assert (orig_cond);
765 loop_cond_gsi = gsi_for_stmt (orig_cond);
766
767 tree init, limit;
768 if (!niters_maybe_zero && integer_onep (step))
769 {
770 /* In this case we can use a simple 0-based IV:
771
772 A:
773 x = 0;
774 do
775 {
776 ...
777 x += 1;
778 }
779 while (x < NITERS); */
780 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
781 init = build_zero_cst (niters_type);
782 limit = niters;
783 }
784 else
785 {
786 /* The following works for all values of NITERS except 0:
787
788 B:
789 x = 0;
790 do
791 {
792 ...
793 x += STEP;
794 }
795 while (x <= NITERS - STEP);
796
797 so that the loop continues to iterate if x + STEP - 1 < NITERS
798 but stops if x + STEP - 1 >= NITERS.
799
800 However, if NITERS is zero, x never hits a value above NITERS - STEP
801 before wrapping around. There are two obvious ways of dealing with
802 this:
803
804 - start at STEP - 1 and compare x before incrementing it
805 - start at -1 and compare x after incrementing it
806
807 The latter is simpler and is what we use. The loop in this case
808 looks like:
809
810 C:
811 x = -1;
812 do
813 {
814 ...
815 x += STEP;
816 }
817 while (x < NITERS - STEP);
818
819 In both cases the loop limit is NITERS - STEP. */
820 gimple_seq seq = NULL;
821 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
822 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
823 if (seq)
824 {
825 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
826 gcc_assert (!new_bb);
827 }
828 if (niters_maybe_zero)
829 {
830 /* Case C. */
831 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
832 init = build_all_ones_cst (niters_type);
833 }
834 else
835 {
836 /* Case B. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
838 init = build_zero_cst (niters_type);
839 }
840 }
841
842 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
843 create_iv (init, step, NULL_TREE, loop,
844 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
845 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
846 true, NULL_TREE, true,
847 GSI_SAME_STMT);
848 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
849 true, GSI_SAME_STMT);
850
851 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
852 NULL_TREE);
853
854 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
855
856 /* Record the number of latch iterations. */
857 if (limit == niters)
858 /* Case A: the loop iterates NITERS times. Subtract one to get the
859 latch count. */
860 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
861 build_int_cst (niters_type, 1));
862 else
863 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
864 Subtract one from this to get the latch count. */
865 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
866 limit, step);
867
868 if (final_iv)
869 {
870 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
871 indx_after_incr, init);
872 gsi_insert_on_edge_immediate (single_exit (loop), assign);
873 }
874
875 return cond_stmt;
876 }
877
878 /* If we're using fully-masked loops, make LOOP iterate:
879
880 N == (NITERS - 1) / STEP + 1
881
882 times. When NITERS is zero, this is equivalent to making the loop
883 execute (1 << M) / STEP times, where M is the precision of NITERS.
884 NITERS_MAYBE_ZERO is true if this last case might occur.
885
886 If we're not using fully-masked loops, make LOOP iterate:
887
888 N == (NITERS - STEP) / STEP + 1
889
890 times, where NITERS is known to be outside the range [1, STEP - 1].
891 This is equivalent to making the loop execute NITERS / STEP times
892 when NITERS is nonzero and (1 << M) / STEP times otherwise.
893 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
894
895 If FINAL_IV is nonnull, it is an SSA name that should be set to
896 N * STEP on exit from the loop.
897
898 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
899
900 void
901 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
902 tree niters, tree step, tree final_iv,
903 bool niters_maybe_zero)
904 {
905 gcond *cond_stmt;
906 gcond *orig_cond = get_loop_exit_condition (loop);
907 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
908
909 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
910 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
911 final_iv, niters_maybe_zero,
912 loop_cond_gsi);
913 else
914 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
915 final_iv, niters_maybe_zero,
916 loop_cond_gsi);
917
918 /* Remove old loop exit test. */
919 stmt_vec_info orig_cond_info;
920 if (loop_vinfo
921 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
922 loop_vinfo->remove_stmt (orig_cond_info);
923 else
924 gsi_remove (&loop_cond_gsi, true);
925
926 if (dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
928 cond_stmt);
929 }
930
931 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
932 For all PHI arguments in FROM->dest and TO->dest from those
933 edges ensure that TO->dest PHI arguments have current_def
934 to that in from. */
935
936 static void
937 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
938 {
939 gimple_stmt_iterator gsi_from, gsi_to;
940
941 for (gsi_from = gsi_start_phis (from->dest),
942 gsi_to = gsi_start_phis (to->dest);
943 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
944 {
945 gimple *from_phi = gsi_stmt (gsi_from);
946 gimple *to_phi = gsi_stmt (gsi_to);
947 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
948 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
949 if (virtual_operand_p (from_arg))
950 {
951 gsi_next (&gsi_from);
952 continue;
953 }
954 if (virtual_operand_p (to_arg))
955 {
956 gsi_next (&gsi_to);
957 continue;
958 }
959 if (TREE_CODE (from_arg) != SSA_NAME)
960 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
961 else if (TREE_CODE (to_arg) == SSA_NAME
962 && from_arg != to_arg)
963 {
964 if (get_current_def (to_arg) == NULL_TREE)
965 {
966 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
967 TREE_TYPE (get_current_def
968 (from_arg))));
969 set_current_def (to_arg, get_current_def (from_arg));
970 }
971 }
972 gsi_next (&gsi_from);
973 gsi_next (&gsi_to);
974 }
975
976 gphi *from_phi = get_virtual_phi (from->dest);
977 gphi *to_phi = get_virtual_phi (to->dest);
978 if (from_phi)
979 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
980 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
981 }
982
983
984 /* Given LOOP this function generates a new copy of it and puts it
985 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
986 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
987 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
988 entry or exit of LOOP. */
989
990 class loop *
991 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
992 class loop *scalar_loop, edge e)
993 {
994 class loop *new_loop;
995 basic_block *new_bbs, *bbs, *pbbs;
996 bool at_exit;
997 bool was_imm_dom;
998 basic_block exit_dest;
999 edge exit, new_exit;
1000 bool duplicate_outer_loop = false;
1001
1002 exit = single_exit (loop);
1003 at_exit = (e == exit);
1004 if (!at_exit && e != loop_preheader_edge (loop))
1005 return NULL;
1006
1007 if (scalar_loop == NULL)
1008 scalar_loop = loop;
1009
1010 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1011 pbbs = bbs + 1;
1012 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1013 /* Allow duplication of outer loops. */
1014 if (scalar_loop->inner)
1015 duplicate_outer_loop = true;
1016 /* Check whether duplication is possible. */
1017 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1018 {
1019 free (bbs);
1020 return NULL;
1021 }
1022
1023 /* Generate new loop structure. */
1024 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1025 duplicate_subloops (scalar_loop, new_loop);
1026
1027 exit_dest = exit->dest;
1028 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1029 exit_dest) == loop->header ?
1030 true : false);
1031
1032 /* Also copy the pre-header, this avoids jumping through hoops to
1033 duplicate the loop entry PHI arguments. Create an empty
1034 pre-header unconditionally for this. */
1035 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1036 edge entry_e = single_pred_edge (preheader);
1037 bbs[0] = preheader;
1038 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1039
1040 exit = single_exit (scalar_loop);
1041 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1042 &exit, 1, &new_exit, NULL,
1043 at_exit ? loop->latch : e->src, true);
1044 exit = single_exit (loop);
1045 basic_block new_preheader = new_bbs[0];
1046
1047 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1048
1049 if (scalar_loop != loop)
1050 {
1051 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1052 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1053 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1054 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1055 header) to have current_def set, so copy them over. */
1056 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1057 exit);
1058 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1059 0),
1060 EDGE_SUCC (loop->latch, 0));
1061 }
1062
1063 if (at_exit) /* Add the loop copy at exit. */
1064 {
1065 if (scalar_loop != loop)
1066 {
1067 gphi_iterator gsi;
1068 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1069
1070 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1071 gsi_next (&gsi))
1072 {
1073 gphi *phi = gsi.phi ();
1074 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1075 location_t orig_locus
1076 = gimple_phi_arg_location_from_edge (phi, e);
1077
1078 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1079 }
1080 }
1081 redirect_edge_and_branch_force (e, new_preheader);
1082 flush_pending_stmts (e);
1083 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1084 if (was_imm_dom || duplicate_outer_loop)
1085 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1086
1087 /* And remove the non-necessary forwarder again. Keep the other
1088 one so we have a proper pre-header for the loop at the exit edge. */
1089 redirect_edge_pred (single_succ_edge (preheader),
1090 single_pred (preheader));
1091 delete_basic_block (preheader);
1092 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1093 loop_preheader_edge (scalar_loop)->src);
1094 }
1095 else /* Add the copy at entry. */
1096 {
1097 if (scalar_loop != loop)
1098 {
1099 /* Remove the non-necessary forwarder of scalar_loop again. */
1100 redirect_edge_pred (single_succ_edge (preheader),
1101 single_pred (preheader));
1102 delete_basic_block (preheader);
1103 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104 loop_preheader_edge (scalar_loop)->src);
1105 preheader = split_edge (loop_preheader_edge (loop));
1106 entry_e = single_pred_edge (preheader);
1107 }
1108
1109 redirect_edge_and_branch_force (entry_e, new_preheader);
1110 flush_pending_stmts (entry_e);
1111 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1112
1113 redirect_edge_and_branch_force (new_exit, preheader);
1114 flush_pending_stmts (new_exit);
1115 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1116
1117 /* And remove the non-necessary forwarder again. Keep the other
1118 one so we have a proper pre-header for the loop at the exit edge. */
1119 redirect_edge_pred (single_succ_edge (new_preheader),
1120 single_pred (new_preheader));
1121 delete_basic_block (new_preheader);
1122 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1123 loop_preheader_edge (new_loop)->src);
1124 }
1125
1126 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1127 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1128 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1129
1130 if (scalar_loop != loop)
1131 {
1132 /* Update new_loop->header PHIs, so that on the preheader
1133 edge they are the ones from loop rather than scalar_loop. */
1134 gphi_iterator gsi_orig, gsi_new;
1135 edge orig_e = loop_preheader_edge (loop);
1136 edge new_e = loop_preheader_edge (new_loop);
1137
1138 for (gsi_orig = gsi_start_phis (loop->header),
1139 gsi_new = gsi_start_phis (new_loop->header);
1140 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1141 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1142 {
1143 gphi *orig_phi = gsi_orig.phi ();
1144 gphi *new_phi = gsi_new.phi ();
1145 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1146 location_t orig_locus
1147 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1148
1149 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1150 }
1151 }
1152
1153 free (new_bbs);
1154 free (bbs);
1155
1156 checking_verify_dominators (CDI_DOMINATORS);
1157
1158 return new_loop;
1159 }
1160
1161
1162 /* Given the condition expression COND, put it as the last statement of
1163 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1164 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1165 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1166 new edge as irreducible if IRREDUCIBLE_P is true. */
1167
1168 static edge
1169 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1170 basic_block guard_to, basic_block dom_bb,
1171 profile_probability probability, bool irreducible_p)
1172 {
1173 gimple_stmt_iterator gsi;
1174 edge new_e, enter_e;
1175 gcond *cond_stmt;
1176 gimple_seq gimplify_stmt_list = NULL;
1177
1178 enter_e = EDGE_SUCC (guard_bb, 0);
1179 enter_e->flags &= ~EDGE_FALLTHRU;
1180 enter_e->flags |= EDGE_FALSE_VALUE;
1181 gsi = gsi_last_bb (guard_bb);
1182
1183 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1184 NULL_TREE);
1185 if (gimplify_stmt_list)
1186 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1187
1188 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1189 gsi = gsi_last_bb (guard_bb);
1190 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1191
1192 /* Add new edge to connect guard block to the merge/loop-exit block. */
1193 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1194
1195 new_e->probability = probability;
1196 if (irreducible_p)
1197 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1198
1199 enter_e->probability = probability.invert ();
1200 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1201
1202 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1203 if (enter_e->dest->loop_father->header == enter_e->dest)
1204 split_edge (enter_e);
1205
1206 return new_e;
1207 }
1208
1209
1210 /* This function verifies that the following restrictions apply to LOOP:
1211 (1) it consists of exactly 2 basic blocks - header, and an empty latch
1212 for innermost loop and 5 basic blocks for outer-loop.
1213 (2) it is single entry, single exit
1214 (3) its exit condition is the last stmt in the header
1215 (4) E is the entry/exit edge of LOOP.
1216 */
1217
1218 bool
1219 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1220 {
1221 edge exit_e = single_exit (loop);
1222 edge entry_e = loop_preheader_edge (loop);
1223 gcond *orig_cond = get_loop_exit_condition (loop);
1224 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1225 unsigned int num_bb = loop->inner? 5 : 2;
1226
1227 /* All loops have an outer scope; the only case loop->outer is NULL is for
1228 the function itself. */
1229 if (!loop_outer (loop)
1230 || loop->num_nodes != num_bb
1231 || !empty_block_p (loop->latch)
1232 || !single_exit (loop)
1233 /* Verify that new loop exit condition can be trivially modified. */
1234 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1235 || (e != exit_e && e != entry_e))
1236 return false;
1237
1238 return true;
1239 }
1240
1241 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1242 in the exit bb and rename all the uses after the loop. This simplifies
1243 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1244 (but normally loop closed SSA form doesn't require virtual PHIs to be
1245 in the same form). Doing this early simplifies the checking what
1246 uses should be renamed. */
1247
1248 static void
1249 create_lcssa_for_virtual_phi (class loop *loop)
1250 {
1251 gphi_iterator gsi;
1252 edge exit_e = single_exit (loop);
1253
1254 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1255 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1256 {
1257 gphi *phi = gsi.phi ();
1258 for (gsi = gsi_start_phis (exit_e->dest);
1259 !gsi_end_p (gsi); gsi_next (&gsi))
1260 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1261 break;
1262 if (gsi_end_p (gsi))
1263 {
1264 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1265 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1266 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1267 imm_use_iterator imm_iter;
1268 gimple *stmt;
1269 use_operand_p use_p;
1270
1271 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1272 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1273 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1274 gimple_phi_set_result (new_phi, new_vop);
1275 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1276 if (stmt != new_phi
1277 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1278 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1279 SET_USE (use_p, new_vop);
1280 }
1281 break;
1282 }
1283
1284 }
1285
1286 /* Function vect_get_loop_location.
1287
1288 Extract the location of the loop in the source code.
1289 If the loop is not well formed for vectorization, an estimated
1290 location is calculated.
1291 Return the loop location if succeed and NULL if not. */
1292
1293 dump_user_location_t
1294 find_loop_location (class loop *loop)
1295 {
1296 gimple *stmt = NULL;
1297 basic_block bb;
1298 gimple_stmt_iterator si;
1299
1300 if (!loop)
1301 return dump_user_location_t ();
1302
1303 stmt = get_loop_exit_condition (loop);
1304
1305 if (stmt
1306 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1307 return stmt;
1308
1309 /* If we got here the loop is probably not "well formed",
1310 try to estimate the loop location */
1311
1312 if (!loop->header)
1313 return dump_user_location_t ();
1314
1315 bb = loop->header;
1316
1317 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1318 {
1319 stmt = gsi_stmt (si);
1320 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1321 return stmt;
1322 }
1323
1324 return dump_user_location_t ();
1325 }
1326
1327 /* Return true if the phi described by STMT_INFO defines an IV of the
1328 loop to be vectorized. */
1329
1330 static bool
1331 iv_phi_p (stmt_vec_info stmt_info)
1332 {
1333 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1334 if (virtual_operand_p (PHI_RESULT (phi)))
1335 return false;
1336
1337 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1338 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1339 return false;
1340
1341 return true;
1342 }
1343
1344 /* Function vect_can_advance_ivs_p
1345
1346 In case the number of iterations that LOOP iterates is unknown at compile
1347 time, an epilog loop will be generated, and the loop induction variables
1348 (IVs) will be "advanced" to the value they are supposed to take just before
1349 the epilog loop. Here we check that the access function of the loop IVs
1350 and the expression that represents the loop bound are simple enough.
1351 These restrictions will be relaxed in the future. */
1352
1353 bool
1354 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1355 {
1356 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1357 basic_block bb = loop->header;
1358 gphi_iterator gsi;
1359
1360 /* Analyze phi functions of the loop header. */
1361
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1364 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1365 {
1366 tree evolution_part;
1367
1368 gphi *phi = gsi.phi ();
1369 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1372 phi_info->stmt);
1373
1374 /* Skip virtual phi's. The data dependences that are associated with
1375 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1376
1377 Skip reduction phis. */
1378 if (!iv_phi_p (phi_info))
1379 {
1380 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_NOTE, vect_location,
1382 "reduc or virtual phi. skip.\n");
1383 continue;
1384 }
1385
1386 /* Analyze the evolution function. */
1387
1388 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1389 if (evolution_part == NULL_TREE)
1390 {
1391 if (dump_enabled_p ())
1392 dump_printf (MSG_MISSED_OPTIMIZATION,
1393 "No access function or evolution.\n");
1394 return false;
1395 }
1396
1397 /* FORNOW: We do not transform initial conditions of IVs
1398 which evolution functions are not invariants in the loop. */
1399
1400 if (!expr_invariant_in_loop_p (loop, evolution_part))
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1404 "evolution not invariant in loop.\n");
1405 return false;
1406 }
1407
1408 /* FORNOW: We do not transform initial conditions of IVs
1409 which evolution functions are a polynomial of degree >= 2. */
1410
1411 if (tree_is_chrec (evolution_part))
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution is chrec.\n");
1416 return false;
1417 }
1418 }
1419
1420 return true;
1421 }
1422
1423
1424 /* Function vect_update_ivs_after_vectorizer.
1425
1426 "Advance" the induction variables of LOOP to the value they should take
1427 after the execution of LOOP. This is currently necessary because the
1428 vectorizer does not handle induction variables that are used after the
1429 loop. Such a situation occurs when the last iterations of LOOP are
1430 peeled, because:
1431 1. We introduced new uses after LOOP for IVs that were not originally used
1432 after LOOP: the IVs of LOOP are now used by an epilog loop.
1433 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1434 times, whereas the loop IVs should be bumped N times.
1435
1436 Input:
1437 - LOOP - a loop that is going to be vectorized. The last few iterations
1438 of LOOP were peeled.
1439 - NITERS - the number of iterations that LOOP executes (before it is
1440 vectorized). i.e, the number of times the ivs should be bumped.
1441 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1442 coming out from LOOP on which there are uses of the LOOP ivs
1443 (this is the path from LOOP->exit to epilog_loop->preheader).
1444
1445 The new definitions of the ivs are placed in LOOP->exit.
1446 The phi args associated with the edge UPDATE_E in the bb
1447 UPDATE_E->dest are updated accordingly.
1448
1449 Assumption 1: Like the rest of the vectorizer, this function assumes
1450 a single loop exit that has a single predecessor.
1451
1452 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1453 organized in the same order.
1454
1455 Assumption 3: The access function of the ivs is simple enough (see
1456 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1457
1458 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1459 coming out of LOOP on which the ivs of LOOP are used (this is the path
1460 that leads to the epilog loop; other paths skip the epilog loop). This
1461 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1462 needs to have its phis updated.
1463 */
1464
1465 static void
1466 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1467 tree niters, edge update_e)
1468 {
1469 gphi_iterator gsi, gsi1;
1470 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1471 basic_block update_bb = update_e->dest;
1472 basic_block exit_bb = single_exit (loop)->dest;
1473
1474 /* Make sure there exists a single-predecessor exit bb: */
1475 gcc_assert (single_pred_p (exit_bb));
1476 gcc_assert (single_succ_edge (exit_bb) == update_e);
1477
1478 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1479 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1480 gsi_next (&gsi), gsi_next (&gsi1))
1481 {
1482 tree init_expr;
1483 tree step_expr, off;
1484 tree type;
1485 tree var, ni, ni_name;
1486 gimple_stmt_iterator last_gsi;
1487
1488 gphi *phi = gsi.phi ();
1489 gphi *phi1 = gsi1.phi ();
1490 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_NOTE, vect_location,
1493 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1494
1495 /* Skip reduction and virtual phis. */
1496 if (!iv_phi_p (phi_info))
1497 {
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "reduc or virtual phi. skip.\n");
1501 continue;
1502 }
1503
1504 type = TREE_TYPE (gimple_phi_result (phi));
1505 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1506 step_expr = unshare_expr (step_expr);
1507
1508 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1509 of degree >= 2 or exponential. */
1510 gcc_assert (!tree_is_chrec (step_expr));
1511
1512 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1513
1514 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1515 fold_convert (TREE_TYPE (step_expr), niters),
1516 step_expr);
1517 if (POINTER_TYPE_P (type))
1518 ni = fold_build_pointer_plus (init_expr, off);
1519 else
1520 ni = fold_build2 (PLUS_EXPR, type,
1521 init_expr, fold_convert (type, off));
1522
1523 var = create_tmp_var (type, "tmp");
1524
1525 last_gsi = gsi_last_bb (exit_bb);
1526 gimple_seq new_stmts = NULL;
1527 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1528 /* Exit_bb shouldn't be empty. */
1529 if (!gsi_end_p (last_gsi))
1530 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1531 else
1532 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1533
1534 /* Fix phi expressions in the successor bb. */
1535 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1536 }
1537 }
1538
1539 /* Return a gimple value containing the misalignment (measured in vector
1540 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1541 it is away from a perfectly aligned address. Add any new statements
1542 to SEQ. */
1543
1544 static tree
1545 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1546 {
1547 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1548 stmt_vec_info stmt_info = dr_info->stmt;
1549 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1550
1551 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1552 unsigned HOST_WIDE_INT target_align_c;
1553 tree target_align_minus_1;
1554
1555 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1556 size_zero_node) < 0;
1557 tree offset = (negative
1558 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1559 : size_zero_node);
1560 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1561 offset);
1562 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1563 if (target_align.is_constant (&target_align_c))
1564 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1565 else
1566 {
1567 tree vla = build_int_cst (type, target_align);
1568 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1569 fold_build2 (MINUS_EXPR, type,
1570 build_int_cst (type, 0), vla));
1571 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1572 build_int_cst (type, 1));
1573 }
1574
1575 HOST_WIDE_INT elem_size
1576 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1577 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1578
1579 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1580 tree int_start_addr = fold_convert (type, start_addr);
1581 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1582 target_align_minus_1);
1583
1584 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1585 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1586 elem_size_log);
1587
1588 return misalign_in_elems;
1589 }
1590
1591 /* Function vect_gen_prolog_loop_niters
1592
1593 Generate the number of iterations which should be peeled as prolog for the
1594 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1595 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1596 As a result, after the execution of this loop, the data reference DR will
1597 refer to an aligned location. The following computation is generated:
1598
1599 If the misalignment of DR is known at compile time:
1600 addr_mis = int mis = DR_MISALIGNMENT (dr);
1601 Else, compute address misalignment in bytes:
1602 addr_mis = addr & (target_align - 1)
1603
1604 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1605
1606 (elem_size = element type size; an element is the scalar element whose type
1607 is the inner type of the vectype)
1608
1609 The computations will be emitted at the end of BB. We also compute and
1610 store upper bound (included) of the result in BOUND.
1611
1612 When the step of the data-ref in the loop is not 1 (as in interleaved data
1613 and SLP), the number of iterations of the prolog must be divided by the step
1614 (which is equal to the size of interleaved group).
1615
1616 The above formulas assume that VF == number of elements in the vector. This
1617 may not hold when there are multiple-types in the loop.
1618 In this case, for some data-references in the loop the VF does not represent
1619 the number of elements that fit in the vector. Therefore, instead of VF we
1620 use TYPE_VECTOR_SUBPARTS. */
1621
1622 static tree
1623 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1624 basic_block bb, int *bound)
1625 {
1626 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1627 tree var;
1628 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1629 gimple_seq stmts = NULL, new_stmts = NULL;
1630 tree iters, iters_name;
1631 stmt_vec_info stmt_info = dr_info->stmt;
1632 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1633 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1634
1635 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1636 {
1637 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1638
1639 if (dump_enabled_p ())
1640 dump_printf_loc (MSG_NOTE, vect_location,
1641 "known peeling = %d.\n", npeel);
1642
1643 iters = build_int_cst (niters_type, npeel);
1644 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1645 }
1646 else
1647 {
1648 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1649 tree type = TREE_TYPE (misalign_in_elems);
1650 HOST_WIDE_INT elem_size
1651 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1652 /* We only do prolog peeling if the target alignment is known at compile
1653 time. */
1654 poly_uint64 align_in_elems =
1655 exact_div (target_align, elem_size);
1656 tree align_in_elems_minus_1 =
1657 build_int_cst (type, align_in_elems - 1);
1658 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1659
1660 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1661 & (align_in_elems - 1)). */
1662 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1663 size_zero_node) < 0;
1664 if (negative)
1665 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1666 align_in_elems_tree);
1667 else
1668 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1669 misalign_in_elems);
1670 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1671 iters = fold_convert (niters_type, iters);
1672 unsigned HOST_WIDE_INT align_in_elems_c;
1673 if (align_in_elems.is_constant (&align_in_elems_c))
1674 *bound = align_in_elems_c - 1;
1675 else
1676 *bound = -1;
1677 }
1678
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "niters for prolog loop: %T\n", iters);
1682
1683 var = create_tmp_var (niters_type, "prolog_loop_niters");
1684 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1685
1686 if (new_stmts)
1687 gimple_seq_add_seq (&stmts, new_stmts);
1688 if (stmts)
1689 {
1690 gcc_assert (single_succ_p (bb));
1691 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1692 if (gsi_end_p (gsi))
1693 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1694 else
1695 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1696 }
1697 return iters_name;
1698 }
1699
1700
1701 /* Function vect_update_init_of_dr
1702
1703 If CODE is PLUS, the vector loop starts NITERS iterations after the
1704 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1705 iterations before the scalar one (using masking to skip inactive
1706 elements). This function updates the information recorded in DR to
1707 account for the difference. Specifically, it updates the OFFSET
1708 field of DR. */
1709
1710 static void
1711 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1712 {
1713 tree offset = DR_OFFSET (dr);
1714
1715 niters = fold_build2 (MULT_EXPR, sizetype,
1716 fold_convert (sizetype, niters),
1717 fold_convert (sizetype, DR_STEP (dr)));
1718 offset = fold_build2 (code, sizetype,
1719 fold_convert (sizetype, offset), niters);
1720 DR_OFFSET (dr) = offset;
1721 }
1722
1723
1724 /* Function vect_update_inits_of_drs
1725
1726 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1727 CODE and NITERS are as for vect_update_inits_of_dr. */
1728
1729 void
1730 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1731 tree_code code)
1732 {
1733 unsigned int i;
1734 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1735 struct data_reference *dr;
1736
1737 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1738
1739 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1740 here, but since we might use these niters to update the epilogues niters
1741 and data references we can't insert them here as this definition might not
1742 always dominate its uses. */
1743 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1744 niters = fold_convert (sizetype, niters);
1745
1746 FOR_EACH_VEC_ELT (datarefs, i, dr)
1747 {
1748 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1749 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1750 vect_update_init_of_dr (dr, niters, code);
1751 }
1752 }
1753
1754 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1755 by masking. This involves calculating the number of iterations to
1756 be peeled and then aligning all memory references appropriately. */
1757
1758 void
1759 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1760 {
1761 tree misalign_in_elems;
1762 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1763
1764 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1765
1766 /* From the information recorded in LOOP_VINFO get the number of iterations
1767 that need to be skipped via masking. */
1768 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1769 {
1770 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1771 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1772 misalign_in_elems = build_int_cst (type, misalign);
1773 }
1774 else
1775 {
1776 gimple_seq seq1 = NULL, seq2 = NULL;
1777 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1778 misalign_in_elems = fold_convert (type, misalign_in_elems);
1779 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1780 &seq2, true, NULL_TREE);
1781 gimple_seq_add_seq (&seq1, seq2);
1782 if (seq1)
1783 {
1784 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1785 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1786 gcc_assert (!new_bb);
1787 }
1788 }
1789
1790 if (dump_enabled_p ())
1791 dump_printf_loc (MSG_NOTE, vect_location,
1792 "misalignment for fully-masked loop: %T\n",
1793 misalign_in_elems);
1794
1795 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1796
1797 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1798 }
1799
1800 /* This function builds ni_name = number of iterations. Statements
1801 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1802 it to TRUE if new ssa_var is generated. */
1803
1804 tree
1805 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1806 {
1807 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1808 if (TREE_CODE (ni) == INTEGER_CST)
1809 return ni;
1810 else
1811 {
1812 tree ni_name, var;
1813 gimple_seq stmts = NULL;
1814 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1815
1816 var = create_tmp_var (TREE_TYPE (ni), "niters");
1817 ni_name = force_gimple_operand (ni, &stmts, false, var);
1818 if (stmts)
1819 {
1820 gsi_insert_seq_on_edge_immediate (pe, stmts);
1821 if (new_var_p != NULL)
1822 *new_var_p = true;
1823 }
1824
1825 return ni_name;
1826 }
1827 }
1828
1829 /* Calculate the number of iterations above which vectorized loop will be
1830 preferred than scalar loop. NITERS_PROLOG is the number of iterations
1831 of prolog loop. If it's integer const, the integer number is also passed
1832 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1833 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1834 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1835 threshold below which the scalar (rather than vectorized) loop will be
1836 executed. This function stores the upper bound (inclusive) of the result
1837 in BOUND_SCALAR. */
1838
1839 static tree
1840 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1841 int bound_prolog, poly_int64 bound_epilog, int th,
1842 poly_uint64 *bound_scalar,
1843 bool check_profitability)
1844 {
1845 tree type = TREE_TYPE (niters_prolog);
1846 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1847 build_int_cst (type, bound_epilog));
1848
1849 *bound_scalar = bound_prolog + bound_epilog;
1850 if (check_profitability)
1851 {
1852 /* TH indicates the minimum niters of vectorized loop, while we
1853 compute the maximum niters of scalar loop. */
1854 th--;
1855 /* Peeling for constant times. */
1856 if (int_niters_prolog >= 0)
1857 {
1858 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1859 return build_int_cst (type, *bound_scalar);
1860 }
1861 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1862 and BOUND_EPILOG are inclusive upper bounds. */
1863 if (known_ge (th, bound_prolog + bound_epilog))
1864 {
1865 *bound_scalar = th;
1866 return build_int_cst (type, th);
1867 }
1868 /* Need to do runtime comparison. */
1869 else if (maybe_gt (th, bound_epilog))
1870 {
1871 *bound_scalar = upper_bound (*bound_scalar, th);
1872 return fold_build2 (MAX_EXPR, type,
1873 build_int_cst (type, th), niters);
1874 }
1875 }
1876 return niters;
1877 }
1878
1879 /* NITERS is the number of times that the original scalar loop executes
1880 after peeling. Work out the maximum number of iterations N that can
1881 be handled by the vectorized form of the loop and then either:
1882
1883 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1884
1885 niters_vector = N
1886
1887 b) set *STEP_VECTOR_PTR to one and generate:
1888
1889 niters_vector = N / vf
1890
1891 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1892 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1893 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1894
1895 void
1896 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1897 tree *niters_vector_ptr, tree *step_vector_ptr,
1898 bool niters_no_overflow)
1899 {
1900 tree ni_minus_gap, var;
1901 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1902 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1903 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1904 tree log_vf = NULL_TREE;
1905
1906 /* If epilogue loop is required because of data accesses with gaps, we
1907 subtract one iteration from the total number of iterations here for
1908 correct calculation of RATIO. */
1909 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1910 {
1911 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1912 build_one_cst (type));
1913 if (!is_gimple_val (ni_minus_gap))
1914 {
1915 var = create_tmp_var (type, "ni_gap");
1916 gimple *stmts = NULL;
1917 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1918 true, var);
1919 gsi_insert_seq_on_edge_immediate (pe, stmts);
1920 }
1921 }
1922 else
1923 ni_minus_gap = niters;
1924
1925 unsigned HOST_WIDE_INT const_vf;
1926 if (vf.is_constant (&const_vf)
1927 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1928 {
1929 /* Create: niters >> log2(vf) */
1930 /* If it's known that niters == number of latch executions + 1 doesn't
1931 overflow, we can generate niters >> log2(vf); otherwise we generate
1932 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1933 will be at least one. */
1934 log_vf = build_int_cst (type, exact_log2 (const_vf));
1935 if (niters_no_overflow)
1936 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1937 else
1938 niters_vector
1939 = fold_build2 (PLUS_EXPR, type,
1940 fold_build2 (RSHIFT_EXPR, type,
1941 fold_build2 (MINUS_EXPR, type,
1942 ni_minus_gap,
1943 build_int_cst (type, vf)),
1944 log_vf),
1945 build_int_cst (type, 1));
1946 step_vector = build_one_cst (type);
1947 }
1948 else
1949 {
1950 niters_vector = ni_minus_gap;
1951 step_vector = build_int_cst (type, vf);
1952 }
1953
1954 if (!is_gimple_val (niters_vector))
1955 {
1956 var = create_tmp_var (type, "bnd");
1957 gimple_seq stmts = NULL;
1958 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1959 gsi_insert_seq_on_edge_immediate (pe, stmts);
1960 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1961 we set range information to make niters analyzer's life easier. */
1962 if (stmts != NULL && log_vf)
1963 set_range_info (niters_vector, VR_RANGE,
1964 wi::to_wide (build_int_cst (type, 1)),
1965 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1966 TYPE_MAX_VALUE (type),
1967 log_vf)));
1968 }
1969 *niters_vector_ptr = niters_vector;
1970 *step_vector_ptr = step_vector;
1971
1972 return;
1973 }
1974
1975 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1976 loop specified by LOOP_VINFO after vectorization, compute the number
1977 of iterations before vectorization (niters_vector * vf) and store it
1978 to NITERS_VECTOR_MULT_VF_PTR. */
1979
1980 static void
1981 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1982 tree niters_vector,
1983 tree *niters_vector_mult_vf_ptr)
1984 {
1985 /* We should be using a step_vector of VF if VF is variable. */
1986 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1987 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1988 tree type = TREE_TYPE (niters_vector);
1989 tree log_vf = build_int_cst (type, exact_log2 (vf));
1990 basic_block exit_bb = single_exit (loop)->dest;
1991
1992 gcc_assert (niters_vector_mult_vf_ptr != NULL);
1993 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
1994 niters_vector, log_vf);
1995 if (!is_gimple_val (niters_vector_mult_vf))
1996 {
1997 tree var = create_tmp_var (type, "niters_vector_mult_vf");
1998 gimple_seq stmts = NULL;
1999 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2000 &stmts, true, var);
2001 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2002 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2003 }
2004 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2005 }
2006
2007 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2008 from SECOND/FIRST and puts it at the original loop's preheader/exit
2009 edge, the two loops are arranged as below:
2010
2011 preheader_a:
2012 first_loop:
2013 header_a:
2014 i_1 = PHI<i_0, i_2>;
2015 ...
2016 i_2 = i_1 + 1;
2017 if (cond_a)
2018 goto latch_a;
2019 else
2020 goto between_bb;
2021 latch_a:
2022 goto header_a;
2023
2024 between_bb:
2025 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2026
2027 second_loop:
2028 header_b:
2029 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2030 or with i_2 if no LCSSA phi is created
2031 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2032 ...
2033 i_4 = i_3 + 1;
2034 if (cond_b)
2035 goto latch_b;
2036 else
2037 goto exit_bb;
2038 latch_b:
2039 goto header_b;
2040
2041 exit_bb:
2042
2043 This function creates loop closed SSA for the first loop; update the
2044 second loop's PHI nodes by replacing argument on incoming edge with the
2045 result of newly created lcssa PHI nodes. IF CREATE_LCSSA_FOR_IV_PHIS
2046 is false, Loop closed ssa phis will only be created for non-iv phis for
2047 the first loop.
2048
2049 This function assumes exit bb of the first loop is preheader bb of the
2050 second loop, i.e, between_bb in the example code. With PHIs updated,
2051 the second loop will execute rest iterations of the first. */
2052
2053 static void
2054 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2055 class loop *first, class loop *second,
2056 bool create_lcssa_for_iv_phis)
2057 {
2058 gphi_iterator gsi_update, gsi_orig;
2059 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2060
2061 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2062 edge second_preheader_e = loop_preheader_edge (second);
2063 basic_block between_bb = single_exit (first)->dest;
2064
2065 gcc_assert (between_bb == second_preheader_e->src);
2066 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2067 /* Either the first loop or the second is the loop to be vectorized. */
2068 gcc_assert (loop == first || loop == second);
2069
2070 for (gsi_orig = gsi_start_phis (first->header),
2071 gsi_update = gsi_start_phis (second->header);
2072 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2073 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2074 {
2075 gphi *orig_phi = gsi_orig.phi ();
2076 gphi *update_phi = gsi_update.phi ();
2077
2078 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2079 /* Generate lcssa PHI node for the first loop. */
2080 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2081 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2082 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2083 {
2084 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2085 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2086 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2087 arg = new_res;
2088 }
2089
2090 /* Update PHI node in the second loop by replacing arg on the loop's
2091 incoming edge. */
2092 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2093 }
2094 }
2095
2096 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2097 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2098 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2099 appear like below:
2100
2101 guard_bb:
2102 if (cond)
2103 goto merge_bb;
2104 else
2105 goto skip_loop;
2106
2107 skip_loop:
2108 header_a:
2109 i_1 = PHI<i_0, i_2>;
2110 ...
2111 i_2 = i_1 + 1;
2112 if (cond_a)
2113 goto latch_a;
2114 else
2115 goto exit_a;
2116 latch_a:
2117 goto header_a;
2118
2119 exit_a:
2120 i_5 = PHI<i_2>;
2121
2122 merge_bb:
2123 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2124
2125 update_loop:
2126 header_b:
2127 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2128 ...
2129 i_4 = i_3 + 1;
2130 if (cond_b)
2131 goto latch_b;
2132 else
2133 goto exit_bb;
2134 latch_b:
2135 goto header_b;
2136
2137 exit_bb:
2138
2139 This function creates PHI nodes at merge_bb and replaces the use of i_5
2140 in the update_loop's PHI node with the result of new PHI result. */
2141
2142 static void
2143 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2144 class loop *update_loop,
2145 edge guard_edge, edge merge_edge)
2146 {
2147 location_t merge_loc, guard_loc;
2148 edge orig_e = loop_preheader_edge (skip_loop);
2149 edge update_e = loop_preheader_edge (update_loop);
2150 gphi_iterator gsi_orig, gsi_update;
2151
2152 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2153 gsi_update = gsi_start_phis (update_loop->header));
2154 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2155 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2156 {
2157 gphi *orig_phi = gsi_orig.phi ();
2158 gphi *update_phi = gsi_update.phi ();
2159
2160 /* Generate new phi node at merge bb of the guard. */
2161 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2162 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2163
2164 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2165 args in NEW_PHI for these edges. */
2166 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2167 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2168 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2169 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2170 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2171 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2172
2173 /* Update phi in UPDATE_PHI. */
2174 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2175 }
2176 }
2177
2178 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2179 this function searches for the corresponding lcssa phi node in exit
2180 bb of LOOP. If it is found, return the phi result; otherwise return
2181 NULL. */
2182
2183 static tree
2184 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2185 gphi *lcssa_phi)
2186 {
2187 gphi_iterator gsi;
2188 edge e = single_exit (loop);
2189
2190 gcc_assert (single_pred_p (e->dest));
2191 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2192 {
2193 gphi *phi = gsi.phi ();
2194 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2195 PHI_ARG_DEF (lcssa_phi, 0), 0))
2196 return PHI_RESULT (phi);
2197 }
2198 return NULL_TREE;
2199 }
2200
2201 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2202 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2203 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2204 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2205 The CFG looks like:
2206
2207 loop:
2208 header_a:
2209 i_1 = PHI<i_0, i_2>;
2210 ...
2211 i_2 = i_1 + 1;
2212 if (cond_a)
2213 goto latch_a;
2214 else
2215 goto exit_a;
2216 latch_a:
2217 goto header_a;
2218
2219 exit_a:
2220
2221 guard_bb:
2222 if (cond)
2223 goto merge_bb;
2224 else
2225 goto epilog_loop;
2226
2227 ;; fall_through_bb
2228
2229 epilog_loop:
2230 header_b:
2231 i_3 = PHI<i_2, i_4>;
2232 ...
2233 i_4 = i_3 + 1;
2234 if (cond_b)
2235 goto latch_b;
2236 else
2237 goto merge_bb;
2238 latch_b:
2239 goto header_b;
2240
2241 merge_bb:
2242 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2243
2244 exit_bb:
2245 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2246
2247 For each name used out side EPILOG (i.e - for each name that has a lcssa
2248 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2249 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2250 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2251 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2252 in exit_bb will also be updated. */
2253
2254 static void
2255 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2256 edge guard_edge, edge merge_edge)
2257 {
2258 gphi_iterator gsi;
2259 basic_block merge_bb = guard_edge->dest;
2260
2261 gcc_assert (single_succ_p (merge_bb));
2262 edge e = single_succ_edge (merge_bb);
2263 basic_block exit_bb = e->dest;
2264 gcc_assert (single_pred_p (exit_bb));
2265 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2266
2267 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2268 {
2269 gphi *update_phi = gsi.phi ();
2270 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2271 /* This loop-closed-phi actually doesn't represent a use out of the
2272 loop - the phi arg is a constant. */
2273 if (TREE_CODE (old_arg) != SSA_NAME)
2274 continue;
2275
2276 tree merge_arg = get_current_def (old_arg);
2277 if (!merge_arg)
2278 merge_arg = old_arg;
2279
2280 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2281 /* If the var is live after loop but not a reduction, we simply
2282 use the old arg. */
2283 if (!guard_arg)
2284 guard_arg = old_arg;
2285
2286 /* Create new phi node in MERGE_BB: */
2287 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2288 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2289
2290 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2291 the two PHI args in merge_phi for these edges. */
2292 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2293 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2294
2295 /* Update the original phi in exit_bb. */
2296 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2297 }
2298 }
2299
2300 /* EPILOG loop is duplicated from the original loop for vectorizing,
2301 the arg of its loop closed ssa PHI needs to be updated. */
2302
2303 static void
2304 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2305 {
2306 gphi_iterator gsi;
2307 basic_block exit_bb = single_exit (epilog)->dest;
2308
2309 gcc_assert (single_pred_p (exit_bb));
2310 edge e = EDGE_PRED (exit_bb, 0);
2311 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2312 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2313 }
2314
2315 /* Function vect_do_peeling.
2316
2317 Input:
2318 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2319
2320 preheader:
2321 LOOP:
2322 header_bb:
2323 loop_body
2324 if (exit_loop_cond) goto exit_bb
2325 else goto header_bb
2326 exit_bb:
2327
2328 - NITERS: The number of iterations of the loop.
2329 - NITERSM1: The number of iterations of the loop's latch.
2330 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2331 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2332 CHECK_PROFITABILITY is true.
2333 Output:
2334 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2335 iterate after vectorization; see vect_set_loop_condition for details.
2336 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2337 should be set to the number of scalar iterations handled by the
2338 vector loop. The SSA name is only used on exit from the loop.
2339
2340 This function peels prolog and epilog from the loop, adds guards skipping
2341 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2342 would look like:
2343
2344 guard_bb_1:
2345 if (prefer_scalar_loop) goto merge_bb_1
2346 else goto guard_bb_2
2347
2348 guard_bb_2:
2349 if (skip_prolog) goto merge_bb_2
2350 else goto prolog_preheader
2351
2352 prolog_preheader:
2353 PROLOG:
2354 prolog_header_bb:
2355 prolog_body
2356 if (exit_prolog_cond) goto prolog_exit_bb
2357 else goto prolog_header_bb
2358 prolog_exit_bb:
2359
2360 merge_bb_2:
2361
2362 vector_preheader:
2363 VECTOR LOOP:
2364 vector_header_bb:
2365 vector_body
2366 if (exit_vector_cond) goto vector_exit_bb
2367 else goto vector_header_bb
2368 vector_exit_bb:
2369
2370 guard_bb_3:
2371 if (skip_epilog) goto merge_bb_3
2372 else goto epilog_preheader
2373
2374 merge_bb_1:
2375
2376 epilog_preheader:
2377 EPILOG:
2378 epilog_header_bb:
2379 epilog_body
2380 if (exit_epilog_cond) goto merge_bb_3
2381 else goto epilog_header_bb
2382
2383 merge_bb_3:
2384
2385 Note this function peels prolog and epilog only if it's necessary,
2386 as well as guards.
2387 This function returns the epilogue loop if a decision was made to vectorize
2388 it, otherwise NULL.
2389
2390 The analysis resulting in this epilogue loop's loop_vec_info was performed
2391 in the same vect_analyze_loop call as the main loop's. At that time
2392 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2393 vectorization factors than the main loop. This list is stored in the main
2394 loop's loop_vec_info in the 'epilogue_vinfos' member. Everytime we decide to
2395 vectorize the epilogue loop for a lower vectorization factor, the
2396 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2397 updated and linked to the epilogue loop. This is later used to vectorize
2398 the epilogue. The reason the loop_vec_info needs updating is that it was
2399 constructed based on the original main loop, and the epilogue loop is a
2400 copy of this loop, so all links pointing to statements in the original loop
2401 need updating. Furthermore, these loop_vec_infos share the
2402 data_reference's records, which will also need to be updated.
2403
2404 TODO: Guard for prefer_scalar_loop should be emitted along with
2405 versioning conditions if loop versioning is needed. */
2406
2407
2408 class loop *
2409 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2410 tree *niters_vector, tree *step_vector,
2411 tree *niters_vector_mult_vf_var, int th,
2412 bool check_profitability, bool niters_no_overflow,
2413 tree *advance, drs_init_vec &orig_drs_init)
2414 {
2415 edge e, guard_e;
2416 tree type = TREE_TYPE (niters), guard_cond;
2417 basic_block guard_bb, guard_to;
2418 profile_probability prob_prolog, prob_vector, prob_epilog;
2419 int estimated_vf;
2420 int prolog_peeling = 0;
2421 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2422 /* We currently do not support prolog peeling if the target alignment is not
2423 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2424 target alignment being constant. */
2425 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2426 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2427 return NULL;
2428
2429 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2430 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2431
2432 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2433 poly_uint64 bound_epilog = 0;
2434 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2435 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2436 bound_epilog += vf - 1;
2437 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2438 bound_epilog += 1;
2439 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2440 poly_uint64 bound_scalar = bound_epilog;
2441
2442 if (!prolog_peeling && !epilog_peeling)
2443 return NULL;
2444
2445 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2446 estimated_vf = vect_vf_for_cost (loop_vinfo);
2447 if (estimated_vf == 2)
2448 estimated_vf = 3;
2449 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2450 .apply_scale (estimated_vf - 1, estimated_vf);
2451
2452 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2453 class loop *first_loop = loop;
2454 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2455 create_lcssa_for_virtual_phi (loop);
2456 update_ssa (TODO_update_ssa_only_virtuals);
2457
2458 if (MAY_HAVE_DEBUG_BIND_STMTS)
2459 {
2460 gcc_assert (!adjust_vec.exists ());
2461 adjust_vec.create (32);
2462 }
2463 initialize_original_copy_tables ();
2464
2465 /* Record the anchor bb at which the guard should be placed if the scalar
2466 loop might be preferred. */
2467 basic_block anchor = loop_preheader_edge (loop)->src;
2468
2469 /* Generate the number of iterations for the prolog loop. We do this here
2470 so that we can also get the upper bound on the number of iterations. */
2471 tree niters_prolog;
2472 int bound_prolog = 0;
2473 if (prolog_peeling)
2474 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2475 &bound_prolog);
2476 else
2477 niters_prolog = build_int_cst (type, 0);
2478
2479 loop_vec_info epilogue_vinfo = NULL;
2480 if (vect_epilogues)
2481 {
2482 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2483 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2484 }
2485
2486 tree niters_vector_mult_vf = NULL_TREE;
2487 /* Saving NITERs before the loop, as this may be changed by prologue. */
2488 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2489 edge update_e = NULL, skip_e = NULL;
2490 unsigned int lowest_vf = constant_lower_bound (vf);
2491 /* If we know the number of scalar iterations for the main loop we should
2492 check whether after the main loop there are enough iterations left over
2493 for the epilogue. */
2494 if (vect_epilogues
2495 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2496 && prolog_peeling >= 0
2497 && known_eq (vf, lowest_vf))
2498 {
2499 unsigned HOST_WIDE_INT eiters
2500 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2501 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2502
2503 eiters -= prolog_peeling;
2504 eiters
2505 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2506
2507 unsigned int ratio;
2508 while (!(constant_multiple_p (loop_vinfo->vector_size,
2509 epilogue_vinfo->vector_size, &ratio)
2510 && eiters >= lowest_vf / ratio))
2511 {
2512 delete epilogue_vinfo;
2513 epilogue_vinfo = NULL;
2514 if (loop_vinfo->epilogue_vinfos.length () == 0)
2515 {
2516 vect_epilogues = false;
2517 break;
2518 }
2519 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2520 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2521 }
2522 }
2523 /* Prolog loop may be skipped. */
2524 bool skip_prolog = (prolog_peeling != 0);
2525 /* Skip this loop to epilog when there are not enough iterations to enter this
2526 vectorized loop. If true we should perform runtime checks on the NITERS
2527 to check whether we should skip the current vectorized loop. If we know
2528 the number of scalar iterations we may choose to add a runtime check if
2529 this number "maybe" smaller than the number of iterations required
2530 when we know the number of scalar iterations may potentially
2531 be smaller than the number of iterations required to enter this loop, for
2532 this we use the upper bounds on the prolog and epilog peeling. When we
2533 don't know the number of iterations and don't require versioning it is
2534 because we have asserted that there are enough scalar iterations to enter
2535 the main loop, so this skip is not necessary. When we are versioning then
2536 we only add such a skip if we have chosen to vectorize the epilogue. */
2537 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2538 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2539 bound_prolog + bound_epilog)
2540 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2541 || vect_epilogues));
2542 /* Epilog loop must be executed if the number of iterations for epilog
2543 loop is known at compile time, otherwise we need to add a check at
2544 the end of vector loop and skip to the end of epilog loop. */
2545 bool skip_epilog = (prolog_peeling < 0
2546 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2547 || !vf.is_constant ());
2548 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2549 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2550 skip_epilog = false;
2551
2552 if (skip_vector)
2553 {
2554 split_edge (loop_preheader_edge (loop));
2555
2556 /* Due to the order in which we peel prolog and epilog, we first
2557 propagate probability to the whole loop. The purpose is to
2558 avoid adjusting probabilities of both prolog and vector loops
2559 separately. Note in this case, the probability of epilog loop
2560 needs to be scaled back later. */
2561 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2562 if (prob_vector.initialized_p ())
2563 {
2564 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2565 scale_loop_profile (loop, prob_vector, 0);
2566 }
2567 }
2568
2569 dump_user_location_t loop_loc = find_loop_location (loop);
2570 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2571 if (vect_epilogues)
2572 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2573 use the original scalar loop as remaining epilogue if necessary. */
2574 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2575 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2576
2577 if (prolog_peeling)
2578 {
2579 e = loop_preheader_edge (loop);
2580 if (!slpeel_can_duplicate_loop_p (loop, e))
2581 {
2582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2583 "loop can't be duplicated to preheader edge.\n");
2584 gcc_unreachable ();
2585 }
2586 /* Peel prolog and put it on preheader edge of loop. */
2587 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2588 if (!prolog)
2589 {
2590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2591 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2592 gcc_unreachable ();
2593 }
2594 prolog->force_vectorize = false;
2595 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2596 first_loop = prolog;
2597 reset_original_copy_tables ();
2598
2599 /* Update the number of iterations for prolog loop. */
2600 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2601 vect_set_loop_condition (prolog, NULL, niters_prolog,
2602 step_prolog, NULL_TREE, false);
2603
2604 /* Skip the prolog loop. */
2605 if (skip_prolog)
2606 {
2607 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2608 niters_prolog, build_int_cst (type, 0));
2609 guard_bb = loop_preheader_edge (prolog)->src;
2610 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2611 guard_to = split_edge (loop_preheader_edge (loop));
2612 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2613 guard_to, guard_bb,
2614 prob_prolog.invert (),
2615 irred_flag);
2616 e = EDGE_PRED (guard_to, 0);
2617 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2618 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2619
2620 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2621 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2622 }
2623
2624 /* Save original inits for each data_reference before advancing them with
2625 NITERS_PROLOG. */
2626 unsigned int i;
2627 struct data_reference *dr;
2628 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
2629 FOR_EACH_VEC_ELT (datarefs, i, dr)
2630 orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr)));
2631
2632 /* Update init address of DRs. */
2633 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2634 /* Update niters for vector loop. */
2635 LOOP_VINFO_NITERS (loop_vinfo)
2636 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2637 LOOP_VINFO_NITERSM1 (loop_vinfo)
2638 = fold_build2 (MINUS_EXPR, type,
2639 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2640 bool new_var_p = false;
2641 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2642 /* It's guaranteed that vector loop bound before vectorization is at
2643 least VF, so set range information for newly generated var. */
2644 if (new_var_p)
2645 set_range_info (niters, VR_RANGE,
2646 wi::to_wide (build_int_cst (type, vf)),
2647 wi::to_wide (TYPE_MAX_VALUE (type)));
2648
2649 /* Prolog iterates at most bound_prolog times, latch iterates at
2650 most bound_prolog - 1 times. */
2651 record_niter_bound (prolog, bound_prolog - 1, false, true);
2652 delete_update_ssa ();
2653 adjust_vec_debug_stmts ();
2654 scev_reset ();
2655 }
2656
2657 if (epilog_peeling)
2658 {
2659 e = single_exit (loop);
2660 if (!slpeel_can_duplicate_loop_p (loop, e))
2661 {
2662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2663 "loop can't be duplicated to exit edge.\n");
2664 gcc_unreachable ();
2665 }
2666 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2667 said epilog then we should use a copy of the main loop as a starting
2668 point. This loop may have already had some preliminary transformations
2669 to allow for more optimal vectorization, for example if-conversion.
2670 If we are not vectorizing the epilog then we should use the scalar loop
2671 as the transformations mentioned above make less or no sense when not
2672 vectorizing. */
2673 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2674 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2675 if (!epilog)
2676 {
2677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2678 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2679 gcc_unreachable ();
2680 }
2681 epilog->force_vectorize = false;
2682 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2683
2684 /* Scalar version loop may be preferred. In this case, add guard
2685 and skip to epilog. Note this only happens when the number of
2686 iterations of loop is unknown at compile time, otherwise this
2687 won't be vectorized. */
2688 if (skip_vector)
2689 {
2690 /* Additional epilogue iteration is peeled if gap exists. */
2691 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2692 bound_prolog, bound_epilog,
2693 th, &bound_scalar,
2694 check_profitability);
2695 /* Build guard against NITERSM1 since NITERS may overflow. */
2696 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2697 guard_bb = anchor;
2698 guard_to = split_edge (loop_preheader_edge (epilog));
2699 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2700 guard_to, guard_bb,
2701 prob_vector.invert (),
2702 irred_flag);
2703 skip_e = guard_e;
2704 e = EDGE_PRED (guard_to, 0);
2705 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2706 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2707
2708 /* Simply propagate profile info from guard_bb to guard_to which is
2709 a merge point of control flow. */
2710 guard_to->count = guard_bb->count;
2711
2712 /* Scale probability of epilog loop back.
2713 FIXME: We should avoid scaling down and back up. Profile may
2714 get lost if we scale down to 0. */
2715 basic_block *bbs = get_loop_body (epilog);
2716 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2717 bbs[i]->count = bbs[i]->count.apply_scale
2718 (bbs[i]->count,
2719 bbs[i]->count.apply_probability
2720 (prob_vector));
2721 free (bbs);
2722 }
2723
2724 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2725 /* If loop is peeled for non-zero constant times, now niters refers to
2726 orig_niters - prolog_peeling, it won't overflow even the orig_niters
2727 overflows. */
2728 niters_no_overflow |= (prolog_peeling > 0);
2729 vect_gen_vector_loop_niters (loop_vinfo, niters,
2730 niters_vector, step_vector,
2731 niters_no_overflow);
2732 if (!integer_onep (*step_vector))
2733 {
2734 /* On exit from the loop we will have an easy way of calcalating
2735 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2736 until then. */
2737 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2738 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2739 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2740 }
2741 else
2742 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2743 &niters_vector_mult_vf);
2744 /* Update IVs of original loop as if they were advanced by
2745 niters_vector_mult_vf steps. */
2746 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2747 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2748 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2749 update_e);
2750
2751 if (skip_epilog)
2752 {
2753 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2754 niters, niters_vector_mult_vf);
2755 guard_bb = single_exit (loop)->dest;
2756 guard_to = split_edge (single_exit (epilog));
2757 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2758 skip_vector ? anchor : guard_bb,
2759 prob_epilog.invert (),
2760 irred_flag);
2761 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2762 single_exit (epilog));
2763 /* Only need to handle basic block before epilog loop if it's not
2764 the guard_bb, which is the case when skip_vector is true. */
2765 if (guard_bb != bb_before_epilog)
2766 {
2767 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2768
2769 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2770 }
2771 scale_loop_profile (epilog, prob_epilog, 0);
2772 }
2773 else
2774 slpeel_update_phi_nodes_for_lcssa (epilog);
2775
2776 unsigned HOST_WIDE_INT bound;
2777 if (bound_scalar.is_constant (&bound))
2778 {
2779 gcc_assert (bound != 0);
2780 /* -1 to convert loop iterations to latch iterations. */
2781 record_niter_bound (epilog, bound - 1, false, true);
2782 }
2783
2784 delete_update_ssa ();
2785 adjust_vec_debug_stmts ();
2786 scev_reset ();
2787 }
2788
2789 if (vect_epilogues)
2790 {
2791 epilog->aux = epilogue_vinfo;
2792 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2793
2794 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2795
2796 /* We now must calculate the number of NITERS performed by the previous
2797 loop and EPILOGUE_NITERS to be performed by the epilogue. */
2798 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2799 niters_prolog, niters_vector_mult_vf);
2800
2801 /* If skip_vector we may skip the previous loop, we insert a phi-node to
2802 determine whether we are coming from the previous vectorized loop
2803 using the update_e edge or the skip_vector basic block using the
2804 skip_e edge. */
2805 if (skip_vector)
2806 {
2807 gcc_assert (update_e != NULL && skip_e != NULL);
2808 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2809 update_e->dest);
2810 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2811 gimple *stmt = gimple_build_assign (new_ssa, niters);
2812 gimple_stmt_iterator gsi;
2813 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2814 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2815 {
2816 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2817 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2818 }
2819 else
2820 {
2821 gsi = gsi_last_bb (update_e->src);
2822 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2823 }
2824
2825 niters = new_ssa;
2826 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2827 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2828 UNKNOWN_LOCATION);
2829 niters = PHI_RESULT (new_phi);
2830 }
2831
2832 /* Subtract the number of iterations performed by the vectorized loop
2833 from the number of total iterations. */
2834 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2835 before_loop_niters,
2836 niters);
2837
2838 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2839 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2840 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2841 epilogue_niters,
2842 build_one_cst (TREE_TYPE (epilogue_niters)));
2843
2844 /* Set ADVANCE to the number of iterations performed by the previous
2845 loop and its prologue. */
2846 *advance = niters;
2847
2848 /* Redo the peeling for niter analysis as the NITERs and alignment
2849 may have been updated to take the main loop into account. */
2850 determine_peel_for_niter (epilogue_vinfo);
2851 }
2852
2853 adjust_vec.release ();
2854 free_original_copy_tables ();
2855
2856 return vect_epilogues ? epilog : NULL;
2857 }
2858
2859 /* Function vect_create_cond_for_niters_checks.
2860
2861 Create a conditional expression that represents the run-time checks for
2862 loop's niter. The loop is guaranteed to terminate if the run-time
2863 checks hold.
2864
2865 Input:
2866 COND_EXPR - input conditional expression. New conditions will be chained
2867 with logical AND operation. If it is NULL, then the function
2868 is used to return the number of alias checks.
2869 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2870 to be checked.
2871
2872 Output:
2873 COND_EXPR - conditional expression.
2874
2875 The returned COND_EXPR is the conditional expression to be used in the
2876 if statement that controls which version of the loop gets executed at
2877 runtime. */
2878
2879 static void
2880 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2881 {
2882 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2883
2884 if (*cond_expr)
2885 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2886 *cond_expr, part_cond_expr);
2887 else
2888 *cond_expr = part_cond_expr;
2889 }
2890
2891 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2892 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2893
2894 static void
2895 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2896 {
2897 if (*cond_expr)
2898 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2899 *cond_expr, part_cond_expr);
2900 else
2901 *cond_expr = part_cond_expr;
2902 }
2903
2904 /* Function vect_create_cond_for_align_checks.
2905
2906 Create a conditional expression that represents the alignment checks for
2907 all of data references (array element references) whose alignment must be
2908 checked at runtime.
2909
2910 Input:
2911 COND_EXPR - input conditional expression. New conditions will be chained
2912 with logical AND operation.
2913 LOOP_VINFO - two fields of the loop information are used.
2914 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2915 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2916
2917 Output:
2918 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2919 expression.
2920 The returned value is the conditional expression to be used in the if
2921 statement that controls which version of the loop gets executed at runtime.
2922
2923 The algorithm makes two assumptions:
2924 1) The number of bytes "n" in a vector is a power of 2.
2925 2) An address "a" is aligned if a%n is zero and that this
2926 test can be done as a&(n-1) == 0. For example, for 16
2927 byte vectors the test is a&0xf == 0. */
2928
2929 static void
2930 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2931 tree *cond_expr,
2932 gimple_seq *cond_expr_stmt_list)
2933 {
2934 vec<stmt_vec_info> may_misalign_stmts
2935 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2936 stmt_vec_info stmt_info;
2937 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2938 tree mask_cst;
2939 unsigned int i;
2940 tree int_ptrsize_type;
2941 char tmp_name[20];
2942 tree or_tmp_name = NULL_TREE;
2943 tree and_tmp_name;
2944 gimple *and_stmt;
2945 tree ptrsize_zero;
2946 tree part_cond_expr;
2947
2948 /* Check that mask is one less than a power of 2, i.e., mask is
2949 all zeros followed by all ones. */
2950 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2951
2952 int_ptrsize_type = signed_type_for (ptr_type_node);
2953
2954 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
2955 of the first vector of the i'th data reference. */
2956
2957 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2958 {
2959 gimple_seq new_stmt_list = NULL;
2960 tree addr_base;
2961 tree addr_tmp_name;
2962 tree new_or_tmp_name;
2963 gimple *addr_stmt, *or_stmt;
2964 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2965 bool negative = tree_int_cst_compare
2966 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2967 tree offset = negative
2968 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2969
2970 /* create: addr_tmp = (int)(address_of_first_vector) */
2971 addr_base =
2972 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2973 offset);
2974 if (new_stmt_list != NULL)
2975 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2976
2977 sprintf (tmp_name, "addr2int%d", i);
2978 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2979 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2980 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2981
2982 /* The addresses are OR together. */
2983
2984 if (or_tmp_name != NULL_TREE)
2985 {
2986 /* create: or_tmp = or_tmp | addr_tmp */
2987 sprintf (tmp_name, "orptrs%d", i);
2988 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2989 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2990 or_tmp_name, addr_tmp_name);
2991 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2992 or_tmp_name = new_or_tmp_name;
2993 }
2994 else
2995 or_tmp_name = addr_tmp_name;
2996
2997 } /* end for i */
2998
2999 mask_cst = build_int_cst (int_ptrsize_type, mask);
3000
3001 /* create: and_tmp = or_tmp & mask */
3002 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3003
3004 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3005 or_tmp_name, mask_cst);
3006 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3007
3008 /* Make and_tmp the left operand of the conditional test against zero.
3009 if and_tmp has a nonzero bit then some address is unaligned. */
3010 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3011 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3012 and_tmp_name, ptrsize_zero);
3013 chain_cond_expr (cond_expr, part_cond_expr);
3014 }
3015
3016 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3017 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3018 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3019 and this new condition are true. Treat a null *COND_EXPR as "true". */
3020
3021 static void
3022 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3023 {
3024 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3025 unsigned int i;
3026 vec_object_pair *pair;
3027 FOR_EACH_VEC_ELT (pairs, i, pair)
3028 {
3029 tree addr1 = build_fold_addr_expr (pair->first);
3030 tree addr2 = build_fold_addr_expr (pair->second);
3031 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3032 addr1, addr2);
3033 chain_cond_expr (cond_expr, part_cond_expr);
3034 }
3035 }
3036
3037 /* Create an expression that is true when all lower-bound conditions for
3038 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3039
3040 static void
3041 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3042 {
3043 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3044 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3045 {
3046 tree expr = lower_bounds[i].expr;
3047 tree type = unsigned_type_for (TREE_TYPE (expr));
3048 expr = fold_convert (type, expr);
3049 poly_uint64 bound = lower_bounds[i].min_value;
3050 if (!lower_bounds[i].unsigned_p)
3051 {
3052 expr = fold_build2 (PLUS_EXPR, type, expr,
3053 build_int_cstu (type, bound - 1));
3054 bound += bound - 1;
3055 }
3056 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3057 build_int_cstu (type, bound));
3058 chain_cond_expr (cond_expr, part_cond_expr);
3059 }
3060 }
3061
3062 /* Function vect_create_cond_for_alias_checks.
3063
3064 Create a conditional expression that represents the run-time checks for
3065 overlapping of address ranges represented by a list of data references
3066 relations passed as input.
3067
3068 Input:
3069 COND_EXPR - input conditional expression. New conditions will be chained
3070 with logical AND operation. If it is NULL, then the function
3071 is used to return the number of alias checks.
3072 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3073 to be checked.
3074
3075 Output:
3076 COND_EXPR - conditional expression.
3077
3078 The returned COND_EXPR is the conditional expression to be used in the if
3079 statement that controls which version of the loop gets executed at runtime.
3080 */
3081
3082 void
3083 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3084 {
3085 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3086 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3087
3088 if (comp_alias_ddrs.is_empty ())
3089 return;
3090
3091 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3092 &comp_alias_ddrs, cond_expr);
3093 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_NOTE, vect_location,
3095 "created %u versioning for alias checks.\n",
3096 comp_alias_ddrs.length ());
3097 }
3098
3099
3100 /* Function vect_loop_versioning.
3101
3102 If the loop has data references that may or may not be aligned or/and
3103 has data reference relations whose independence was not proven then
3104 two versions of the loop need to be generated, one which is vectorized
3105 and one which isn't. A test is then generated to control which of the
3106 loops is executed. The test checks for the alignment of all of the
3107 data references that may or may not be aligned. An additional
3108 sequence of runtime tests is generated for each pairs of DDRs whose
3109 independence was not proven. The vectorized version of loop is
3110 executed only if both alias and alignment tests are passed.
3111
3112 The test generated to check which version of loop is executed
3113 is modified to also check for profitability as indicated by the
3114 cost model threshold TH.
3115
3116 The versioning precondition(s) are placed in *COND_EXPR and
3117 *COND_EXPR_STMT_LIST. */
3118
3119 class loop *
3120 vect_loop_versioning (loop_vec_info loop_vinfo)
3121 {
3122 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3123 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3124 basic_block condition_bb;
3125 gphi_iterator gsi;
3126 gimple_stmt_iterator cond_exp_gsi;
3127 basic_block merge_bb;
3128 basic_block new_exit_bb;
3129 edge new_exit_e, e;
3130 gphi *orig_phi, *new_phi;
3131 tree cond_expr = NULL_TREE;
3132 gimple_seq cond_expr_stmt_list = NULL;
3133 tree arg;
3134 profile_probability prob = profile_probability::likely ();
3135 gimple_seq gimplify_stmt_list = NULL;
3136 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3137 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3138 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3139 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3140 poly_uint64 versioning_threshold
3141 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3142 tree version_simd_if_cond
3143 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3144 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3145
3146 if (th >= vect_vf_for_cost (loop_vinfo)
3147 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3148 && !ordered_p (th, versioning_threshold))
3149 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3150 build_int_cst (TREE_TYPE (scalar_loop_iters),
3151 th - 1));
3152 if (maybe_ne (versioning_threshold, 0U))
3153 {
3154 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3155 build_int_cst (TREE_TYPE (scalar_loop_iters),
3156 versioning_threshold - 1));
3157 if (cond_expr)
3158 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3159 expr, cond_expr);
3160 else
3161 cond_expr = expr;
3162 }
3163
3164 if (version_niter)
3165 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3166
3167 if (cond_expr)
3168 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3169 &cond_expr_stmt_list,
3170 is_gimple_condexpr, NULL_TREE);
3171
3172 if (version_align)
3173 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3174 &cond_expr_stmt_list);
3175
3176 if (version_alias)
3177 {
3178 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3179 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3180 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3181 }
3182
3183 if (version_simd_if_cond)
3184 {
3185 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3186 if (flag_checking)
3187 if (basic_block bb
3188 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3189 gcc_assert (bb != loop->header
3190 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3191 && (scalar_loop == NULL
3192 || (bb != scalar_loop->header
3193 && dominated_by_p (CDI_DOMINATORS,
3194 scalar_loop->header, bb))));
3195 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3196 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3197 version_simd_if_cond, zero);
3198 if (cond_expr)
3199 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3200 c, cond_expr);
3201 else
3202 cond_expr = c;
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location,
3205 "created versioning for simd if condition check.\n");
3206 }
3207
3208 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3209 &gimplify_stmt_list,
3210 is_gimple_condexpr, NULL_TREE);
3211 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3212
3213 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3214 invariant in. */
3215 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3216 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3217 !gsi_end_p (gsi); gsi_next (&gsi))
3218 {
3219 gimple *stmt = gsi_stmt (gsi);
3220 update_stmt (stmt);
3221 ssa_op_iter iter;
3222 use_operand_p use_p;
3223 basic_block def_bb;
3224 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3225 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3226 && flow_bb_inside_loop_p (outermost, def_bb))
3227 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3228 }
3229
3230 /* Search for the outermost loop we can version. Avoid versioning of
3231 non-perfect nests but allow if-conversion versioned loops inside. */
3232 class loop *loop_to_version = loop;
3233 if (flow_loop_nested_p (outermost, loop))
3234 {
3235 if (dump_enabled_p ())
3236 dump_printf_loc (MSG_NOTE, vect_location,
3237 "trying to apply versioning to outer loop %d\n",
3238 outermost->num);
3239 if (outermost->num == 0)
3240 outermost = superloop_at_depth (loop, 1);
3241 /* And avoid applying versioning on non-perfect nests. */
3242 while (loop_to_version != outermost
3243 && single_exit (loop_outer (loop_to_version))
3244 && (!loop_outer (loop_to_version)->inner->next
3245 || vect_loop_vectorized_call (loop_to_version))
3246 && (!loop_outer (loop_to_version)->inner->next
3247 || !loop_outer (loop_to_version)->inner->next->next))
3248 loop_to_version = loop_outer (loop_to_version);
3249 }
3250
3251 /* Apply versioning. If there is already a scalar version created by
3252 if-conversion re-use that. Note we cannot re-use the copy of
3253 an if-converted outer-loop when vectorizing the inner loop only. */
3254 gcond *cond;
3255 gimple *call;
3256 if ((!loop_to_version->inner || loop == loop_to_version)
3257 && (call = vect_loop_vectorized_call (loop_to_version, &cond)))
3258 {
3259 gcc_assert (scalar_loop);
3260 condition_bb = gimple_bb (cond);
3261 gimple_cond_set_condition_from_tree (cond, cond_expr);
3262 update_stmt (cond);
3263
3264 if (cond_expr_stmt_list)
3265 {
3266 cond_exp_gsi = gsi_for_stmt (call);
3267 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3268 GSI_SAME_STMT);
3269 }
3270
3271 /* if-conversion uses profile_probability::always () for both paths,
3272 reset the paths probabilities appropriately. */
3273 edge te, fe;
3274 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3275 te->probability = prob;
3276 fe->probability = prob.invert ();
3277 /* We can scale loops counts immediately but have to postpone
3278 scaling the scalar loop because we re-use it during peeling. */
3279 scale_loop_frequencies (loop_to_version, te->probability);
3280 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3281
3282 nloop = scalar_loop;
3283 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_NOTE, vect_location,
3285 "reusing %sloop version created by if conversion\n",
3286 loop_to_version != loop ? "outer " : "");
3287 }
3288 else
3289 {
3290 if (loop_to_version != loop
3291 && dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "applying loop versioning to outer loop %d\n",
3294 loop_to_version->num);
3295
3296 initialize_original_copy_tables ();
3297 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3298 prob, prob.invert (), prob, prob.invert (), true);
3299 gcc_assert (nloop);
3300 nloop = get_loop_copy (loop);
3301
3302 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3303 reap those otherwise; they also refer to the original
3304 loops. */
3305 class loop *l = loop;
3306 while (gimple *call = vect_loop_vectorized_call (l))
3307 {
3308 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3309 fold_loop_internal_call (call, boolean_false_node);
3310 l = loop_outer (l);
3311 }
3312 free_original_copy_tables ();
3313
3314 if (cond_expr_stmt_list)
3315 {
3316 cond_exp_gsi = gsi_last_bb (condition_bb);
3317 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3318 GSI_SAME_STMT);
3319 }
3320
3321 /* Loop versioning violates an assumption we try to maintain during
3322 vectorization - that the loop exit block has a single predecessor.
3323 After versioning, the exit block of both loop versions is the same
3324 basic block (i.e. it has two predecessors). Just in order to simplify
3325 following transformations in the vectorizer, we fix this situation
3326 here by adding a new (empty) block on the exit-edge of the loop,
3327 with the proper loop-exit phis to maintain loop-closed-form.
3328 If loop versioning wasn't done from loop, but scalar_loop instead,
3329 merge_bb will have already just a single successor. */
3330
3331 merge_bb = single_exit (loop_to_version)->dest;
3332 if (EDGE_COUNT (merge_bb->preds) >= 2)
3333 {
3334 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3335 new_exit_bb = split_edge (single_exit (loop_to_version));
3336 new_exit_e = single_exit (loop_to_version);
3337 e = EDGE_SUCC (new_exit_bb, 0);
3338
3339 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3340 gsi_next (&gsi))
3341 {
3342 tree new_res;
3343 orig_phi = gsi.phi ();
3344 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3345 new_phi = create_phi_node (new_res, new_exit_bb);
3346 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3347 add_phi_arg (new_phi, arg, new_exit_e,
3348 gimple_phi_arg_location_from_edge (orig_phi, e));
3349 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3350 }
3351 }
3352
3353 update_ssa (TODO_update_ssa);
3354 }
3355
3356 if (version_niter)
3357 {
3358 /* The versioned loop could be infinite, we need to clear existing
3359 niter information which is copied from the original loop. */
3360 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3361 vect_free_loop_info_assumptions (nloop);
3362 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3363 loop_constraint_set (loop, LOOP_C_INFINITE);
3364 }
3365
3366 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3367 && dump_enabled_p ())
3368 {
3369 if (version_alias)
3370 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3371 vect_location,
3372 "loop versioned for vectorization because of "
3373 "possible aliasing\n");
3374 if (version_align)
3375 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3376 vect_location,
3377 "loop versioned for vectorization to enhance "
3378 "alignment\n");
3379
3380 }
3381
3382 return nloop;
3383 }