/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/

/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}

/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has 2 inner loops, allow there to
		     be an extra basic block which decides which of the
		     two loops to use using LOOP_VECTORIZED.  */
		  if (!single_pred_p (e->src)
		      || single_pred (e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}

struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}

/* Adjust debug stmts as scheduled before.  */

void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
	adjust_vec.safe_push (ai);
      else
	adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
			gimple_bb (update_phi));
}

/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
		    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}

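/* As an illustration (sketch only, with made-up SSA names), for a loop
   with preheader edge P and latch edge L, the calls above leave a PHI
   of the form:

       mask_1 = PHI <init_mask_2 (P), next_mask_3 (L)>

   in the loop header, so the mask starts out as INIT_MASK and becomes
   NEXT_MASK on every subsequent iteration.  */
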
/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
			unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}

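/* For example, with an 8-element VECTYPE (NELTS == 8), OFFSET == 0
   encodes the low-half interleave { 0, 8, 1, 9, 2, 10, 3, 11 } and
   OFFSET == 1 the high-half interleave { 4, 12, 5, 13, 6, 14, 7, 15 }:
   two interleaved series starting at BASE and BASE + NELTS.  Only the
   first three elements of each series are pushed; the vec_perm_builder
   encoding extrapolates the rest.  (Illustrative element count.)  */
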
/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (loop_vec_info loop_vinfo, gimple_seq *seq,
			       rgroup_masks *dest_rgm,
			       rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      tree unpack_masktype = vect_halve_mask_nunits (loop_vinfo, src_masktype);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}

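/* A sketch of the two strategies, assuming a single source mask SRC of
   type VNx16BI whose bits must feed two destination masks (the VNx16BI
   and VNx8BI names are purely illustrative).  The unpack path generates,
   on a little-endian target:

       dest[0] = VEC_UNPACK_LO_EXPR <src>;
       dest[1] = VEC_UNPACK_HI_EXPR <src>;

   while the interleave path, used when DEST_MASKTYPE == SRC_MASKTYPE,
   generates:

       dest[0] = VEC_PERM_EXPR <src, src, { 0, 16, 1, 17, ... }>;
       dest[1] = VEC_PERM_EXPR <src, src, { 8, 24, 9, 25, ... }>;  */
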
/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */

static tree
vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
			      gimple_seq *preheader_seq,
			      gimple_stmt_iterator loop_cond_gsi,
			      rgroup_masks *rgm, tree niters, tree niters_skip,
			      bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = build_int_cst (iv_type, vf);
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
	 multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				     nscalars_total, compare_factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
				    nscalars_step, iv_factor);
      if (nscalars_skip)
	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				      nscalars_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[infinite-prec] NSCALARS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[sat] NSCALARS_STEP

	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
				    nscalars_step);
      if (nscalars_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
	{
	  /* The first vector iteration can handle at most NSCALARS_STEP
	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
	     NSCALARS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nscalars_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nscalars_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nscalars_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NSCALARS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
	 next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
	 to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial mask.  First include all scalars that
	 are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of scalars handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this mask.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
	  tmp_stmt = vect_gen_while (init_mask, start, end);
	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
	}

      /* Now AND out the bits that are within the number of skipped
	 scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
						    bias_tree, nscalars_skip);
	  if (init_mask)
	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
				      init_mask, unskipped_mask);
	  else
	    init_mask = unskipped_mask;
	}

      if (!init_mask)
	/* First iteration is full.  */
	init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}

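/* Worked example of the wrap-safe limit above, with illustrative numbers
   only: for a 16-bit unsigned COMPARE_TYPE, NSCALARS_TOTAL == 65535,
   NSCALARS_STEP == 16 and no skipped scalars, TEST_LIMIT becomes
   MAX (65535, 16) - 16 == 65519, and the IV is tested before each
   increment, so the tested value stays within the 16-bit range and the
   comparison cannot be confused by wrap-around.  */
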
/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
				tree niters, tree final_iv,
				bool niters_maybe_zero,
				gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
	 count is zero then it must have wrapped.  Cope with this by
	 subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
			     niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
			     niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);

  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
	/* First try using permutes.  This adds a single vector
	   instruction to the loop for each mask, but needs no extra
	   loop invariants or IVs.  */
	unsigned int nmasks = i + 1;
	if ((nmasks & 1) == 0)
	  {
	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
	    if (!half_rgm->masks.is_empty ()
		&& vect_maybe_permute_loop_masks (loop_vinfo, &header_seq,
						  rgm, half_rgm))
	      continue;
	  }

	/* See whether zero-based IV would ever generate all-false masks
	   before wrapping around.  */
	bool might_wrap_p
	  = (iv_limit == -1
	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
				    UNSIGNED)
		 > compare_precision));

	/* Set up all masks for this group.  */
	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
						  &preheader_seq,
						  loop_cond_gsi, rgm,
						  niters, niters_skip,
						  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
					NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
				       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
				     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

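/* Worked example of the NITERS widening above, with illustrative widths:
   if NITERS has an 8-bit type and has wrapped from 256 to 0, then with a
   16-bit COMPARE_TYPE we compute (0 - 1) == 255 in 8 bits, zero-extend
   that to 255 in 16 bits, and add 1 to recover the intended count of
   256, which a direct zero-extension of 0 would have lost.  */
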
/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (class loop *loop, tree niters,
				  tree step, tree final_iv,
				  bool niters_maybe_zero,
				  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

	 A:
	   x = 0;
	   do
	     {
	       ...
	       x += 1;
	     }
	   while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

	 B:
	   x = 0;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x <= NITERS - STEP);

	 so that the loop continues to iterate if x + STEP - 1 < NITERS
	 but stops if x + STEP - 1 >= NITERS.

	 However, if NITERS is zero, x never hits a value above NITERS - STEP
	 before wrapping around.  There are two obvious ways of dealing with
	 this:

	 - start at STEP - 1 and compare x before incrementing it
	 - start at -1 and compare x after incrementing it

	 The latter is simpler and is what we use.  The loop in this case
	 looks like:

	 C:
	   x = -1;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x < NITERS - STEP);

	 In both cases the loop limit is NITERS - STEP.  */
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
	{
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
      if (niters_maybe_zero)
	{
	  /* Case C.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
	  init = build_all_ones_cst (niters_type);
	}
      else
	{
	  /* Case B.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
	  init = build_zero_cst (niters_type);
	}
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
				    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
				 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
				       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
					     indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

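/* Worked example of case B above, with illustrative numbers only: for
   NITERS == 8 and STEP == 4 the limit is NITERS - STEP == 4, so x takes
   the values 4 and 8 after its two increments; 4 <= 4 continues and
   8 > 4 exits, giving the expected NITERS / STEP == 2 iterations.  In
   case C the same limit is used but x starts at -1, so a NITERS that
   has wrapped to 0 still terminates rather than looping forever.  */
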
/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */

void
vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
			 tree niters, tree step, tree final_iv,
			 bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
						final_iv, niters_maybe_zero,
						loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
						  final_iv, niters_maybe_zero,
						  loop_cond_gsi);

  /* Remove old loop exit test.  */
  stmt_vec_info orig_cond_info;
  if (loop_vinfo
      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
    loop_vinfo->remove_stmt (orig_cond_info);
  else
    gsi_remove (&loop_cond_gsi, true);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
		     cond_stmt);
}

/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For all PHI arguments in FROM->dest and TO->dest from those
   edges ensure that TO->dest PHI arguments have current_def
   to that in from.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
	{
	  gsi_next (&gsi_from);
	  continue;
	}
      if (virtual_operand_p (to_arg))
	{
	  gsi_next (&gsi_to);
	  continue;
	}
      if (TREE_CODE (from_arg) != SSA_NAME)
	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else if (TREE_CODE (to_arg) == SSA_NAME
	       && from_arg != to_arg)
	{
	  if (get_current_def (to_arg) == NULL_TREE)
	    {
	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
					      TREE_TYPE (get_current_def
							   (from_arg))));
	      set_current_def (to_arg, get_current_def (from_arg));
	    }
	}
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}

/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */

class loop *
slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
					class loop *scalar_loop, edge e)
{
  class loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
					  exit_dest) == loop->header ?
		 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
	    &exit, 1, &new_exit, NULL,
	    at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
	 header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
						exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
							   0),
						EDGE_SUCC (loop->latch, 0));
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
	{
	  gphi_iterator gsi;
	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);

	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      gphi *phi = gsi.phi ();
	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
	      location_t orig_locus
		= gimple_phi_arg_location_from_edge (phi, e);

	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
	    }
	}
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
	 one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
			  single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
			       loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
	{
	  /* Remove the non-necessary forwarder of scalar_loop again.  */
	  redirect_edge_pred (single_succ_edge (preheader),
			      single_pred (preheader));
	  delete_basic_block (preheader);
	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
				   loop_preheader_edge (scalar_loop)->src);
	  preheader = split_edge (loop_preheader_edge (loop));
	  entry_e = single_pred_edge (preheader);
	}

      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
	 one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (new_preheader),
			  single_pred (new_preheader));
      delete_basic_block (new_preheader);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
			       loop_preheader_edge (new_loop)->src);
    }

  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);

  if (scalar_loop != loop)
    {
      /* Update new_loop->header PHIs, so that on the preheader
	 edge they are the ones from loop rather than scalar_loop.  */
      gphi_iterator gsi_orig, gsi_new;
      edge orig_e = loop_preheader_edge (loop);
      edge new_e = loop_preheader_edge (new_loop);

      for (gsi_orig = gsi_start_phis (loop->header),
	   gsi_new = gsi_start_phis (new_loop->header);
	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
	{
	  gphi *orig_phi = gsi_orig.phi ();
	  gphi *new_phi = gsi_new.phi ();
	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
	  location_t orig_locus
	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);

	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
	}
    }

  free (new_bbs);
  free (bbs);

  checking_verify_dominators (CDI_DOMINATORS);

  return new_loop;
}

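/* For intuition (sketch of the resulting block order only): peeling an
   epilogue corresponds to E == single_exit (LOOP), which roughly yields

       preheader -> LOOP -> new_preheader -> NEW_LOOP -> exit_dest

   while passing the preheader edge instead places NEW_LOOP before LOOP,
   as used when peeling a prologue.  */
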
/* Given the condition expression COND, put it as the last statement of
   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
   new edge as irreducible if IRREDUCIBLE_P is true.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
		       basic_block guard_to, basic_block dom_bb,
		       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
				 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}

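/* The result is one arm of a diamond (sketch only): GUARD_BB now ends in
   COND_STMT, its true edge (with probability PROBABILITY) skips to
   GUARD_TO, and its false edge falls through to the original successor:

       GUARD_BB --(COND true)---> GUARD_TO
	    \----(COND false)---> original successor  */
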
/* This function verifies that the following restrictions apply to LOOP:
   (1) it consists of exactly 2 basic blocks - header, and an empty latch
       for innermost loop and 5 basic blocks for outer-loop.
   (2) it is single entry, single exit
   (3) its exit condition is the last stmt in the header
   (4) E is the entry/exit edge of LOOP.  */

bool
slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
  unsigned int num_bb = loop->inner ? 5 : 2;

  /* All loops have an outer scope; the only case loop->outer is NULL is for
     the function itself.  */
  if (!loop_outer (loop)
      || loop->num_nodes != num_bb
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}

/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
   in the exit bb and rename all the uses after the loop.  This simplifies
   the *guard[12] routines, which assume loop closed SSA form for all PHIs
   (but normally loop closed SSA form doesn't require virtual PHIs to be
   in the same form).  Doing this early simplifies the checking what
   uses should be renamed.  */

static void
create_lcssa_for_virtual_phi (class loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
	gphi *phi = gsi.phi ();
	for (gsi = gsi_start_phis (exit_e->dest);
	     !gsi_end_p (gsi); gsi_next (&gsi))
	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
	    break;
	if (gsi_end_p (gsi))
	  {
	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
	    imm_use_iterator imm_iter;
	    gimple *stmt;
	    use_operand_p use_p;

	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
	    gimple_phi_set_result (new_phi, new_vop);
	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
	      if (stmt != new_phi
		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		  SET_USE (use_p, new_vop);
	  }
	break;
      }
}

/* Function find_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if it succeeds and an unknown location if not.  */

dump_user_location_t
find_loop_location (class loop *loop)
{
  gimple *stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return dump_user_location_t ();

  stmt = get_loop_exit_condition (loop);

  if (stmt
      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
    return stmt;

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location */

  if (!loop->header)
    return dump_user_location_t ();

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
	return stmt;
    }

  return dump_user_location_t ();
}

/* Return true if the phi described by STMT_INFO defines an IV of the
   loop to be vectorized.  */

static bool
iv_phi_p (stmt_vec_info stmt_info)
{
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (virtual_operand_p (PHI_RESULT (phi)))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    return false;

  return true;
}

/* Function vect_can_advance_ivs_p

   In case the number of iterations that LOOP iterates is unknown at compile
   time, an epilog loop will be generated, and the loop induction variables
   (IVs) will be "advanced" to the value they are supposed to take just before
   the epilog loop.  Here we check that the access function of the loop IVs
   and the expression that represents the loop bound are simple enough.
   These restrictions will be relaxed in the future.  */

bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      tree evolution_part;

      gphi *phi = gsi.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 phi);

      /* Skip virtual phi's. The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

	 Skip reduction phis.  */
      if (!iv_phi_p (phi_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n");
	  continue;
	}

      /* Analyze the evolution function.  */

      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      if (evolution_part == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_MISSED_OPTIMIZATION,
			 "No access function or evolution.\n");
	  return false;
	}

      /* FORNOW: We do not transform initial conditions of IVs
	 which evolution functions are not invariants in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution not invariant in loop.\n");
	  return false;
	}

      /* FORNOW: We do not transform initial conditions of IVs
	 which evolution functions are a polynomial of degree >= 2.  */

      if (tree_is_chrec (evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution is chrec.\n");
	  return false;
	}
    }

  return true;
}

/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized. The last few iterations
     of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized). i.e, the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
     coming out from LOOP on which there are uses of the LOOP ivs
     (this is the path from LOOP->exit to epilog_loop->preheader).

     The new definitions of the ivs are placed in LOOP->exit.
     The phi args associated with the edge UPDATE_E in the bb
     UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.  */

static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
				  tree niters, edge update_e)
{
  gphi_iterator gsi, gsi1;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block update_bb = update_e->dest;
  basic_block exit_bb = single_exit (loop)->dest;

  /* Make sure there exists a single-predecessor exit bb:  */
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_succ_edge (exit_bb) == update_e);

  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
       gsi_next (&gsi), gsi_next (&gsi1))
    {
      tree init_expr;
      tree step_expr, off;
      tree type;
      tree var, ni, ni_name;
      gimple_stmt_iterator last_gsi;

      gphi *phi = gsi.phi ();
      gphi *phi1 = gsi1.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_update_ivs_after_vectorizer: phi: %G", phi);

      /* Skip reduction and virtual phis.  */
      if (!iv_phi_p (phi_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n");
	  continue;
	}

      type = TREE_TYPE (gimple_phi_result (phi));
      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      step_expr = unshare_expr (step_expr);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
	 of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (step_expr));

      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));

      off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
			 fold_convert (TREE_TYPE (step_expr), niters),
			 step_expr);
      if (POINTER_TYPE_P (type))
	ni = fold_build_pointer_plus (init_expr, off);
      else
	ni = fold_build2 (PLUS_EXPR, type,
			  init_expr, fold_convert (type, off));

      var = create_tmp_var (type, "tmp");

      last_gsi = gsi_last_bb (exit_bb);
      gimple_seq new_stmts = NULL;
      ni_name = force_gimple_operand (ni, &new_stmts, false, var);
      /* Exit_bb shouldn't be empty.  */
      if (!gsi_end_p (last_gsi))
	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
      else
	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);

      /* Fix phi expressions in the successor bb.  */
      adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
    }
}

/* Return a gimple value containing the misalignment (measured in vector
   elements) for the loop described by LOOP_VINFO, i.e. how many elements
   it is away from a perfectly aligned address.  Add any new statements
   to SEQ.  */

static tree
get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
  unsigned HOST_WIDE_INT target_align_c;
  tree target_align_minus_1;

  bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					size_zero_node) < 0;
  tree offset = (negative
		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
		 : size_zero_node);
  tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
							  offset);
  tree type = unsigned_type_for (TREE_TYPE (start_addr));
  if (target_align.is_constant (&target_align_c))
    target_align_minus_1 = build_int_cst (type, target_align_c - 1);
  else
    {
      tree vla = build_int_cst (type, target_align);
      tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
				    fold_build2 (MINUS_EXPR, type,
						 build_int_cst (type, 0), vla));
      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
					  build_int_cst (type, 1));
    }

  HOST_WIDE_INT elem_size
    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));

  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
  tree int_start_addr = fold_convert (type, start_addr);
  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
					target_align_minus_1);

  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
					elem_size_log);

  return misalign_in_elems;
}

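/* For example, with 4-byte elements, a 16-byte TARGET_ALIGN and a start
   address of 0x1008, misalign_in_bytes is 0x1008 & 15 == 8 and the
   returned misalign_in_elems is 8 >> 2 == 2.  (Illustrative values.)  */
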
/* Function vect_gen_prolog_loop_niters

   Generate the number of iterations which should be peeled as prolog for the
   loop represented by LOOP_VINFO.  It is calculated as the misalignment of
   DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
   As a result, after the execution of this loop, the data reference DR will
   refer to an aligned location.  The following computation is generated:

   If the misalignment of DR is known at compile time:
     addr_mis = int mis = DR_MISALIGNMENT (dr);
   Else, compute address misalignment in bytes:
     addr_mis = addr & (target_align - 1)

   prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step

   (elem_size = element type size; an element is the scalar element whose type
   is the inner type of the vectype)

   The computations will be emitted at the end of BB.  We also compute and
   store upper bound (included) of the result in BOUND.

   When the step of the data-ref in the loop is not 1 (as in interleaved data
   and SLP), the number of iterations of the prolog must be divided by the step
   (which is equal to the size of interleaved group).

   The above formulas assume that VF == number of elements in the vector. This
   may not hold when there are multiple-types in the loop.
   In this case, for some data-references in the loop the VF does not represent
   the number of elements that fit in the vector.  Therefore, instead of VF we
   use TYPE_VECTOR_SUBPARTS.  */

static tree
vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
			     basic_block bb, int *bound)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  tree var;
  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
  gimple_seq stmts = NULL, new_stmts = NULL;
  tree iters, iters_name;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);

  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "known peeling = %d.\n", npeel);

      iters = build_int_cst (niters_type, npeel);
      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    }
  else
    {
      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
      tree type = TREE_TYPE (misalign_in_elems);
      HOST_WIDE_INT elem_size
	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      /* We only do prolog peeling if the target alignment is known at compile
	 time.  */
      poly_uint64 align_in_elems =
	exact_div (target_align, elem_size);
      tree align_in_elems_minus_1 =
	build_int_cst (type, align_in_elems - 1);
      tree align_in_elems_tree = build_int_cst (type, align_in_elems);

      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
				 & (align_in_elems - 1)).  */
      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					    size_zero_node) < 0;
      if (negative)
	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
			     align_in_elems_tree);
      else
	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
			     misalign_in_elems);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
      iters = fold_convert (niters_type, iters);
      unsigned HOST_WIDE_INT align_in_elems_c;
      if (align_in_elems.is_constant (&align_in_elems_c))
	*bound = align_in_elems_c - 1;
      else
	*bound = -1;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "niters for prolog loop: %T\n", iters);

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  iters_name = force_gimple_operand (iters, &new_stmts, false, var);

  if (new_stmts)
    gimple_seq_add_seq (&stmts, new_stmts);
  if (stmts)
    {
      gcc_assert (single_succ_p (bb));
      gimple_stmt_iterator gsi = gsi_last_bb (bb);
      if (gsi_end_p (gsi))
	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
      else
	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
    }
  return iters_name;
}

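/* For example, if ALIGN_IN_ELEMS == 4 and MISALIGN_IN_ELEMS == 3 for a
   positive-step access, the prolog peels (4 - 3) & 3 == 1 iteration to
   reach an aligned address, and *BOUND is set to 3.  (Illustrative
   values.)  */
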
/* Function vect_update_init_of_dr

   If CODE is PLUS, the vector loop starts NITERS iterations after the
   scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
   iterations before the scalar one (using masking to skip inactive
   elements).  This function updates the information recorded in DR to
   account for the difference.  Specifically, it updates the OFFSET
   field of DR.  */

static void
vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, sizetype,
			fold_convert (sizetype, niters),
			fold_convert (sizetype, DR_STEP (dr)));
  offset = fold_build2 (code, sizetype,
			fold_convert (sizetype, offset), niters);
  DR_OFFSET (dr) = offset;
}

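/* For example, moving the start of the vector loop NITERS == 3 scalar
   iterations past a DR with DR_STEP == 8 bytes adds (or subtracts,
   depending on CODE) 3 * 8 == 24 bytes to DR_OFFSET.  (Illustrative
   values.)  */
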
/* Function vect_update_inits_of_drs

   Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
   CODE and NITERS are as for vect_update_inits_of_dr.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
			  tree_code code)
{
  unsigned int i;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_update_inits_of_dr");

  /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
     here, but since we might use these niters to update the epilogues niters
     and data references we can't insert them here as this definition might not
     always dominate its uses.  */
  if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
    niters = fold_convert (sizetype, niters);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
	vect_update_init_of_dr (dr, niters, code);
    }
}

/* For the information recorded in LOOP_VINFO prepare the loop for peeling
   by masking.  This involves calculating the number of iterations to
   be peeled and then aligning all memory references appropriately.  */

void
vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
{
  tree misalign_in_elems;
  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));

  /* From the information recorded in LOOP_VINFO get the number of iterations
     that need to be skipped via masking.  */
  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
      misalign_in_elems = build_int_cst (type, misalign);
    }
  else
    {
      gimple_seq seq1 = NULL, seq2 = NULL;
      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
      misalign_in_elems = fold_convert (type, misalign_in_elems);
      misalign_in_elems = force_gimple_operand (misalign_in_elems,
						&seq2, true, NULL_TREE);
      gimple_seq_add_seq (&seq1, seq2);
      if (seq1)
	{
	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
	  gcc_assert (!new_bb);
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "misalignment for fully-masked loop: %T\n",
		     misalign_in_elems);

  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;

  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
}

/* This function builds ni_name = number of iterations.  Statements
   are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
   it to TRUE if new ssa_var is generated.  */

tree
vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
{
  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
  if (TREE_CODE (ni) == INTEGER_CST)
    return ni;
  else
    {
      tree ni_name, var;
      gimple_seq stmts = NULL;
      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));

      var = create_tmp_var (TREE_TYPE (ni), "niters");
      ni_name = force_gimple_operand (ni, &stmts, false, var);
      if (stmts)
	{
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	  if (new_var_p != NULL)
	    *new_var_p = true;
	}

      return ni_name;
    }
}

1830 preferred than scalar loop. NITERS_PROLOG is the number of iterations
1831 of prolog loop. If it's integer const, the integer number is also passed
1832 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1833 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1834 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1835 threshold below which the scalar (rather than vectorized) loop will be
1836 executed. This function stores the upper bound (inclusive) of the result
1840 vect_gen_scalar_loop_niters (tree niters_prolog
, int int_niters_prolog
,
1841 int bound_prolog
, poly_int64 bound_epilog
, int th
,
1842 poly_uint64
*bound_scalar
,
1843 bool check_profitability
)
1845 tree type
= TREE_TYPE (niters_prolog
);
1846 tree niters
= fold_build2 (PLUS_EXPR
, type
, niters_prolog
,
1847 build_int_cst (type
, bound_epilog
));
1849 *bound_scalar
= bound_prolog
+ bound_epilog
;
1850 if (check_profitability
)
1852 /* TH indicates the minimum niters of vectorized loop, while we
1853 compute the maximum niters of scalar loop. */
1855 /* Peeling for constant times. */
1856 if (int_niters_prolog
>= 0)
1858 *bound_scalar
= upper_bound (int_niters_prolog
+ bound_epilog
, th
);
1859 return build_int_cst (type
, *bound_scalar
);
1861 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1862 and BOUND_EPILOG are inclusive upper bounds. */
1863 if (known_ge (th
, bound_prolog
+ bound_epilog
))
1866 return build_int_cst (type
, th
);
1868 /* Need to do runtime comparison. */
1869 else if (maybe_gt (th
, bound_epilog
))
1871 *bound_scalar
= upper_bound (*bound_scalar
, th
);
1872 return fold_build2 (MAX_EXPR
, type
,
1873 build_int_cst (type
, th
), niters
);
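
/* Illustrative sketch (not part of GCC): when the prolog count is a
   compile-time constant, the bound computed above reduces to a max of
   the peeled iterations against the profitability threshold.  A minimal
   model, assuming TH has already been decremented as in the code:  */
#if 0
static long long
toy_scalar_niters (long long int_niters_prolog, long long bound_epilog,
		   long long th, int check_profitability)
{
  long long niters = int_niters_prolog + bound_epilog;
  if (check_profitability && th > niters)
    return th;		/* corresponds to the upper_bound/MAX_EXPR cases */
  return niters;
}
#endif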
/* NITERS is the number of times that the original scalar loop executes
   after peeling.  Work out the maximum number of iterations N that can
   be handled by the vectorized form of the loop and then either:

   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:

	niters_vector = N

   b) set *STEP_VECTOR_PTR to one and generate:

	niters_vector = N / vf

   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */

void
vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
			     tree *niters_vector_ptr, tree *step_vector_ptr,
			     bool niters_no_overflow)
{
  tree ni_minus_gap, var;
  tree niters_vector, step_vector, type = TREE_TYPE (niters);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  tree log_vf = NULL_TREE;

  /* If an epilogue loop is required because of data accesses with gaps, we
     subtract one iteration from the total number of iterations here for
     correct calculation of RATIO.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    {
      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
				  build_one_cst (type));
      if (!is_gimple_val (ni_minus_gap))
	{
	  var = create_tmp_var (type, "ni_gap");
	  gimple *stmts = NULL;
	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
					       true, var);
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	}
    }
  else
    ni_minus_gap = niters;

  unsigned HOST_WIDE_INT const_vf;
  if (vf.is_constant (&const_vf)
      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Create: niters >> log2(vf) */
      /* If it's known that niters == number of latch executions + 1 doesn't
	 overflow, we can generate niters >> log2(vf); otherwise we generate
	 (niters - vf) >> log2(vf) + 1 by using the fact that we know the
	 ratio will be at least one.  */
      log_vf = build_int_cst (type, exact_log2 (const_vf));
      if (niters_no_overflow)
	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
      else
	niters_vector
	  = fold_build2 (PLUS_EXPR, type,
			 fold_build2 (RSHIFT_EXPR, type,
				      fold_build2 (MINUS_EXPR, type,
						   ni_minus_gap,
						   build_int_cst (type, vf)),
				      log_vf),
			 build_int_cst (type, 1));
      step_vector = build_one_cst (type);
    }
  else
    {
      niters_vector = ni_minus_gap;
      step_vector = build_int_cst (type, vf);
    }

  if (!is_gimple_val (niters_vector))
    {
      var = create_tmp_var (type, "bnd");
      gimple_seq stmts = NULL;
      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
      /* The peeling algorithm guarantees that the vector loop bound is at
	 least ONE; we set range information to make the niters analyzer's
	 life easier.  */
      if (stmts != NULL && log_vf)
	set_range_info (niters_vector, VR_RANGE,
			wi::to_wide (build_int_cst (type, 1)),
			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
						  TYPE_MAX_VALUE (type),
						  log_vf)));
    }
  *niters_vector_ptr = niters_vector;
  *step_vector_ptr = step_vector;
}
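
/* Illustrative sketch (not part of GCC): the two shift forms built above
   for a constant VF.  When NITERS may wrap (niters == 0 encoding 2^N
   iterations), (niters - vf >> log2(vf)) + 1 still gives the right count
   because the ratio is known to be at least one:  */
#if 0
static unsigned int
toy_niters_vector (unsigned int niters, unsigned int log2_vf,
		   int niters_no_overflow)
{
  if (niters_no_overflow)
    return niters >> log2_vf;			/* niters / vf */
  /* Wrap-safe variant: e.g. niters == 0, vf == 4 yields 2^30.  */
  return ((niters - (1u << log2_vf)) >> log2_vf) + 1;
}
#endif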
/* Given NITERS_VECTOR, which is the number of iterations for the vectorized
   loop specified by LOOP_VINFO after vectorization, compute the number
   of iterations before vectorization (niters_vector * vf) and store it
   to NITERS_VECTOR_MULT_VF_PTR.  */

static void
vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
				     tree niters_vector,
				     tree *niters_vector_mult_vf_ptr)
{
  /* We should be using a step_vector of VF if VF is variable.  */
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree type = TREE_TYPE (niters_vector);
  tree log_vf = build_int_cst (type, exact_log2 (vf));
  basic_block exit_bb = single_exit (loop)->dest;

  gcc_assert (niters_vector_mult_vf_ptr != NULL);
  tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
					    niters_vector, log_vf);
  if (!is_gimple_val (niters_vector_mult_vf))
    {
      tree var = create_tmp_var (type, "niters_vector_mult_vf");
      gimple_seq stmts = NULL;
      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
						    &stmts, true, var);
      gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }
  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
}
/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
   from SECOND/FIRST and puts it at the original loop's preheader/exit
   edge; the two loops are arranged as below:

       preheader_a:
     first_loop:
       header_a:
	 i_1 = PHI<i_0, i_2>;
	 ...
	 i_2 = i_1 + 1;
	 if (cond_a)
	   goto latch_a;
	 else
	   goto between_bb;
       latch_a:
	 goto header_a;

       between_bb:
	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,

     second_loop:
       header_b:
	 i_3 = PHI<i_0, i_4>;  ;; Use of i_0 to be replaced with i_x,
				 or with i_2 if no LCSSA phi is created
				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
	 ...
	 i_4 = i_3 + 1;
	 if (cond_b)
	   goto latch_b;
	 else
	   goto exit_bb;
       latch_b:
	 goto header_b;

       exit_bb:

   This function creates loop closed SSA for the first loop; it updates the
   second loop's PHI nodes by replacing the argument on the incoming edge
   with the result of the newly created lcssa PHI nodes.  If
   CREATE_LCSSA_FOR_IV_PHIS is false, loop closed ssa phis will only be
   created for non-iv phis of the first loop.

   This function assumes the exit bb of the first loop is the preheader bb
   of the second loop, i.e. between_bb in the example code.  With PHIs
   updated, the second loop will execute the remaining iterations of the
   first.  */

static void
slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
				   class loop *first, class loop *second,
				   bool create_lcssa_for_iv_phis)
{
  gphi_iterator gsi_update, gsi_orig;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  edge first_latch_e = EDGE_SUCC (first->latch, 0);
  edge second_preheader_e = loop_preheader_edge (second);
  basic_block between_bb = single_exit (first)->dest;

  gcc_assert (between_bb == second_preheader_e->src);
  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
  /* Either the first loop or the second is the loop to be vectorized.  */
  gcc_assert (loop == first || loop == second);

  for (gsi_orig = gsi_start_phis (first->header),
       gsi_update = gsi_start_phis (second->header);
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
      /* Generate lcssa PHI node for the first loop.  */
      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
	{
	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
	  arg = new_res;
	}

      /* Update PHI node in the second loop by replacing the arg on the
	 loop's incoming edge.  */
      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
    }
}
/* Function slpeel_add_loop_guard adds guard skipping from the beginning
   of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
   are the two pred edges of the merge point before UPDATE_LOOP.  The two
   loops appear as below:

       guard_bb:
	 if (cond)
	   goto merge_bb;
	 else
	   goto skip_loop;

     skip_loop:
       header_a:
	 i_1 = PHI<i_0, i_2>;
	 ...
	 i_2 = i_1 + 1;
	 if (cond_a)
	   goto latch_a;
	 else
	   goto exit_a;
       latch_a:
	 goto header_a;

       exit_a:
	 i_5 = PHI<i_2>;

       merge_bb:
	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.

     update_loop:
       header_b:
	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
	 ...
	 i_4 = i_3 + 1;
	 if (cond_b)
	   goto latch_b;
	 else
	   goto exit_bb;
       latch_b:
	 goto header_b;

       exit_bb:

   This function creates PHI nodes at merge_bb and replaces the use of i_5
   in the update_loop's PHI node with the result of the new PHI node.  */

static void
slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
				    class loop *update_loop,
				    edge guard_edge, edge merge_edge)
{
  location_t merge_loc, guard_loc;
  edge orig_e = loop_preheader_edge (skip_loop);
  edge update_e = loop_preheader_edge (update_loop);
  gphi_iterator gsi_orig, gsi_update;

  for ((gsi_orig = gsi_start_phis (skip_loop->header),
	gsi_update = gsi_start_phis (update_loop->header));
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      /* Generate new phi node at merge bb of the guard.  */
      tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
      gphi *new_phi = create_phi_node (new_res, guard_edge->dest);

      /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
	 args in NEW_PHI for these edges.  */
      tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
      tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
      merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
      guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
      add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
      add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);

      /* Update phi in UPDATE_PHI.  */
      adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
    }
}
/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP;
   this function searches for the corresponding lcssa phi node in the exit
   bb of LOOP.  If it is found, return the phi result; otherwise return
   NULL.  */

static tree
find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
		gphi *lcssa_phi)
{
  gphi_iterator gsi;
  edge e = single_exit (loop);

  gcc_assert (single_pred_p (e->dest));
  for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
			   PHI_ARG_DEF (lcssa_phi, 0), 0))
	return PHI_RESULT (phi);
    }
  return NULL_TREE;
}
/* LOOP and EPILOG are two consecutive loops in CFG, and EPILOG is copied
   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
   The CFG looks like:

     loop:
       header_a:
	 i_1 = PHI<i_0, i_2>;
	 ...
	 i_2 = i_1 + 1;
	 if (cond_a)
	   goto latch_a;
	 else
	   goto exit_a;
       latch_a:
	 goto header_a;

       exit_a:

       guard_bb:
	 if (cond)
	   goto merge_bb;
	 else
	   goto epilog_loop;

       ;; fall_through_bb

     epilog_loop:
       header_b:
	 i_3 = PHI<i_2, i_4>;
	 ...
	 i_4 = i_3 + 1;
	 if (cond_b)
	   goto latch_b;
	 else
	   goto merge_bb;
       latch_b:
	 goto header_b;

       merge_bb:
	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.

       exit_bb:
	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.

   For each name used outside EPILOG (i.e. for each name that has a lcssa
   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
   args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE
   is the arg of the original PHI in exit_bb; the arg for GUARD_EDGE is
   defined by LOOP and is found in the exit bb of LOOP.  The arg of the
   original PHI in exit_bb will also be updated.  */

static void
slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
				    edge guard_edge, edge merge_edge)
{
  gphi_iterator gsi;
  basic_block merge_bb = guard_edge->dest;

  gcc_assert (single_succ_p (merge_bb));
  edge e = single_succ_edge (merge_bb);
  basic_block exit_bb = e->dest;
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);

  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *update_phi = gsi.phi ();
      tree old_arg = PHI_ARG_DEF (update_phi, 0);
      /* This loop-closed-phi actually doesn't represent a use out of the
	 loop - the phi arg is a constant.  */
      if (TREE_CODE (old_arg) != SSA_NAME)
	continue;

      tree merge_arg = get_current_def (old_arg);
      if (!merge_arg)
	merge_arg = old_arg;

      tree guard_arg = find_guard_arg (loop, epilog, update_phi);
      /* If the var is live after loop but not a reduction, we simply
	 use the old arg.  */
      if (!guard_arg)
	guard_arg = old_arg;

      /* Create new phi node in MERGE_BB:  */
      tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
      gphi *merge_phi = create_phi_node (new_res, merge_bb);

      /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
	 the two PHI args in merge_phi for these edges.  */
      add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
      add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);

      /* Update the original phi in exit_bb.  */
      adjust_phi_and_debug_stmts (update_phi, e, new_res);
    }
}
/* The EPILOG loop is duplicated from the original loop for vectorizing;
   the args of its loop closed ssa PHIs need to be updated.  */

static void
slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
{
  gphi_iterator gsi;
  basic_block exit_bb = single_exit (epilog)->dest;

  gcc_assert (single_pred_p (exit_bb));
  edge e = EDGE_PRED (exit_bb, 0);
  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
}
/* Function vect_do_peeling.

   Input:
   - LOOP_VINFO: Represent a loop to be vectorized, which looks like:

       preheader:
     LOOP:
       header_bb:
	 loop_body
	 if (exit_loop_cond) goto exit_bb
	 else                goto header_bb
       exit_bb:

   - NITERS: The number of iterations of the loop.
   - NITERSM1: The number of iterations of the loop's latch.
   - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
   - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
			      CHECK_PROFITABILITY is true.

   Output:
   - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
     iterate after vectorization; see vect_set_loop_condition for details.
   - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
     should be set to the number of scalar iterations handled by the
     vector loop.  The SSA name is only used on exit from the loop.

   This function peels prolog and epilog from the loop, and adds guards
   skipping PROLOG and EPILOG for various conditions.  As a result, the
   changed CFG looks like:

       guard_bb_1:
	 if (prefer_scalar_loop) goto merge_bb_1
	 else                    goto guard_bb_2

       guard_bb_2:
	 if (skip_prolog) goto merge_bb_2
	 else             goto prolog_preheader

       prolog_preheader:
     PROLOG:
       prolog_header_bb:
	 prolog_body
	 if (exit_prolog_cond) goto prolog_exit_bb
	 else                  goto prolog_header_bb
       prolog_exit_bb:

       merge_bb_2:

       vector_preheader:
     VECTOR LOOP:
       vector_header_bb:
	 vector_body
	 if (exit_vector_cond) goto vector_exit_bb
	 else                  goto vector_header_bb
       vector_exit_bb:

       guard_bb_3:
	 if (skip_epilog) goto merge_bb_3
	 else             goto epilog_preheader

       merge_bb_1:

       epilog_preheader:
     EPILOG:
       epilog_header_bb:
	 epilog_body
	 if (exit_epilog_cond) goto merge_bb_3
	 else                  goto epilog_header_bb

       merge_bb_3:

   Note this function peels prolog and epilog, as well as guards, only if
   it's necessary.
   This function returns the epilogue loop if a decision was made to
   vectorize epilogues.

   The analysis resulting in this epilogue loop's loop_vec_info was performed
   in the same vect_analyze_loop call as the main loop's.  At that time
   vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
   vectorization factors than the main loop.  This list is stored in the main
   loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide
   to vectorize the epilogue loop for a lower vectorization factor, the
   loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
   updated and linked to the epilogue loop.  This is later used to vectorize
   the epilogue.  The reason the loop_vec_info needs updating is that it was
   constructed based on the original main loop, and the epilogue loop is a
   copy of this loop, so all links pointing to statements in the original loop
   need updating.  Furthermore, these loop_vec_infos share the
   data_reference's records, which will also need to be updated.

   TODO: Guard for prefer_scalar_loop should be emitted along with
   versioning conditions if loop versioning is needed.  */
class loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
		 tree *niters_vector, tree *step_vector,
		 tree *niters_vector_mult_vf_var, int th,
		 bool check_profitability, bool niters_no_overflow,
		 tree *advance, drs_init_vec &orig_drs_init)
{
  edge e, guard_e;
  tree type = TREE_TYPE (niters), guard_cond;
  basic_block guard_bb, guard_to;
  profile_probability prob_prolog, prob_vector, prob_epilog;
  int estimated_vf;
  int prolog_peeling = 0;
  bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
  /* We currently do not support prolog peeling if the target alignment is not
     known at compile time.  'vect_gen_prolog_loop_niters' depends on the
     target alignment being constant.  */
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
    return NULL;

  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  poly_uint64 bound_epilog = 0;
  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    bound_epilog += vf - 1;
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    bound_epilog += 1;
  bool epilog_peeling = maybe_ne (bound_epilog, 0U);
  poly_uint64 bound_scalar = bound_epilog;

  if (!prolog_peeling && !epilog_peeling)
    return NULL;

  prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
  estimated_vf = vect_vf_for_cost (loop_vinfo);
  if (estimated_vf == 2)
    estimated_vf = 3;
  prob_prolog = prob_epilog = profile_probability::guessed_always ()
    .apply_scale (estimated_vf - 1, estimated_vf);

  class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *first_loop = loop;
  bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
  create_lcssa_for_virtual_phi (loop);
  update_ssa (TODO_update_ssa_only_virtuals);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    {
      gcc_assert (!adjust_vec.exists ());
      adjust_vec.create (32);
    }
  initialize_original_copy_tables ();

  /* Record the anchor bb at which the guard should be placed if the scalar
     loop might be preferred.  */
  basic_block anchor = loop_preheader_edge (loop)->src;

  /* Generate the number of iterations for the prolog loop.  We do this here
     so that we can also get the upper bound on the number of iterations.  */
  tree niters_prolog;
  int bound_prolog = 0;
  if (prolog_peeling)
    niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
						 &bound_prolog);
  else
    niters_prolog = build_int_cst (type, 0);

  loop_vec_info epilogue_vinfo = NULL;
  if (vect_epilogues)
    {
      epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
      loop_vinfo->epilogue_vinfos.ordered_remove (0);
    }

  tree niters_vector_mult_vf = NULL_TREE;
  /* Saving NITERs before the loop, as this may be changed by prologue.  */
  tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
  edge update_e = NULL, skip_e = NULL;
  unsigned int lowest_vf = constant_lower_bound (vf);
  /* If we know the number of scalar iterations for the main loop we should
     check whether after the main loop there are enough iterations left over
     for the epilogue.  */
  if (vect_epilogues
      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && prolog_peeling >= 0
      && known_eq (vf, lowest_vf))
    {
      unsigned HOST_WIDE_INT eiters
	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));

      eiters -= prolog_peeling;
      eiters
	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);

      unsigned int ratio;
      while (!(constant_multiple_p (loop_vinfo->vector_size,
				    epilogue_vinfo->vector_size, &ratio)
	       && eiters >= lowest_vf / ratio))
	{
	  delete epilogue_vinfo;
	  epilogue_vinfo = NULL;
	  if (loop_vinfo->epilogue_vinfos.length () == 0)
	    {
	      vect_epilogues = false;
	      break;
	    }
	  epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
	  loop_vinfo->epilogue_vinfos.ordered_remove (0);
	}
    }

  /* Prolog loop may be skipped.  */
  bool skip_prolog = (prolog_peeling != 0);
  /* Skip this loop to epilog when there are not enough iterations to enter
     this vectorized loop.  If true we should perform a runtime check on
     NITERS to decide whether to skip the current vectorized loop.  We add
     such a check when we know the number of scalar iterations may
     potentially be smaller than the number of iterations required to enter
     this loop; for this we use the upper bounds on the prolog and epilog
     peeling.  When we don't know the number of iterations and don't require
     versioning, it is because we have asserted that there are enough scalar
     iterations to enter the main loop, so this skip is not necessary.  When
     we are versioning, then we only add such a skip if we have chosen to
     vectorize the epilogue.  */
  bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
				  bound_prolog + bound_epilog)
		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
			 || vect_epilogues));
  /* Epilog loop must be executed if the number of iterations for epilog
     loop is known at compile time, otherwise we need to add a check at
     the end of vector loop and skip to the end of epilog loop.  */
  bool skip_epilog = (prolog_peeling < 0
		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      || !vf.is_constant ());
  /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    skip_epilog = false;

  if (skip_vector)
    {
      split_edge (loop_preheader_edge (loop));

      /* Due to the order in which we peel prolog and epilog, we first
	 propagate probability to the whole loop.  The purpose is to
	 avoid adjusting probabilities of both prolog and vector loops
	 separately.  Note in this case, the probability of epilog loop
	 needs to be scaled back later.  */
      basic_block bb_before_loop = loop_preheader_edge (loop)->src;
      if (prob_vector.initialized_p ())
	{
	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
	  scale_loop_profile (loop, prob_vector, 0);
	}
    }

  dump_user_location_t loop_loc = find_loop_location (loop);
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  if (vect_epilogues)
    /* Make sure to set the epilogue's epilogue scalar loop, such that we can
       use the original scalar loop as remaining epilogue if necessary.  */
    LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
      = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);

  if (prolog_peeling)
    {
      e = loop_preheader_edge (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to preheader edge.\n");
	  gcc_unreachable ();
	}
      /* Peel prolog and put it on preheader edge of loop.  */
      prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
      if (!prolog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      prolog->force_vectorize = false;
      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
      first_loop = prolog;
      reset_original_copy_tables ();

      /* Update the number of iterations for prolog loop.  */
      tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
      vect_set_loop_condition (prolog, NULL, niters_prolog,
			       step_prolog, NULL_TREE, false);

      /* Skip the prolog loop.  */
      if (skip_prolog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters_prolog, build_int_cst (type, 0));
	  guard_bb = loop_preheader_edge (prolog)->src;
	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
	  guard_to = split_edge (loop_preheader_edge (loop));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_prolog.invert (),
					   irred_flag);
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);

	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
	}

      /* Save original inits for each data_reference before advancing them
	 with NITERS_PROLOG.  */
      unsigned int i;
      struct data_reference *dr;
      vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr)));

      /* Update init address of DRs.  */
      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
      /* Update niters for vector loop.  */
      LOOP_VINFO_NITERS (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
      LOOP_VINFO_NITERSM1 (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type,
		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
      bool new_var_p = false;
      niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
      /* It's guaranteed that vector loop bound before vectorization is at
	 least VF, so set range information for newly generated var.  */
      if (new_var_p)
	set_range_info (niters, VR_RANGE,
			wi::to_wide (build_int_cst (type, vf)),
			wi::to_wide (TYPE_MAX_VALUE (type)));

      /* Prolog iterates at most bound_prolog times, latch iterates at
	 most bound_prolog - 1 times.  */
      record_niter_bound (prolog, bound_prolog - 1, false, true);
      delete_update_ssa ();
      adjust_vec_debug_stmts ();
      scev_reset ();
    }

  if (epilog_peeling)
    {
      e = single_exit (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to exit edge.\n");
	  gcc_unreachable ();
	}
      /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
	 said epilog then we should use a copy of the main loop as a starting
	 point.  This loop may have already had some preliminary
	 transformations to allow for more optimal vectorization, for example
	 if-conversion.  If we are not vectorizing the epilog then we should
	 use the scalar loop, as the transformations mentioned above make
	 less or no sense when not vectorizing.  */
      epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
      epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
      if (!epilog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      epilog->force_vectorize = false;
      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);

      /* Scalar version loop may be preferred.  In this case, add guard
	 and skip to epilog.  Note this only happens when the number of
	 iterations of loop is unknown at compile time, otherwise this
	 won't be vectorized.  */
      if (skip_vector)
	{
	  /* Additional epilogue iteration is peeled if gap exists.  */
	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
						bound_prolog, bound_epilog,
						th, &bound_scalar,
						check_profitability);
	  /* Build guard against NITERSM1 since NITERS may overflow.  */
	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
	  guard_bb = anchor;
	  guard_to = split_edge (loop_preheader_edge (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_vector.invert (),
					   irred_flag);
	  skip_e = guard_e;
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);

	  /* Simply propagate profile info from guard_bb to guard_to which is
	     a merge point of control flow.  */
	  guard_to->count = guard_bb->count;

	  /* Scale probability of epilog loop back.
	     FIXME: We should avoid scaling down and back up.  Profile may
	     get lost if we scale down to 0.  */
	  basic_block *bbs = get_loop_body (epilog);
	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
	    bbs[i]->count = bbs[i]->count.apply_scale
				 (bbs[i]->count,
				  bbs[i]->count.apply_probability
				    (prob_vector));
	  free (bbs);
	}

      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
      /* If loop is peeled for non-zero constant times, now niters refers to
	 orig_niters - prolog_peeling; it won't overflow even if the
	 orig_niters overflows.  */
      niters_no_overflow |= (prolog_peeling > 0);
      vect_gen_vector_loop_niters (loop_vinfo, niters,
				   niters_vector, step_vector,
				   niters_no_overflow);
      if (!integer_onep (*step_vector))
	{
	  /* On exit from the loop we will have an easy way of calculating
	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
	     until then.  */
	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
	}
      else
	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
					     &niters_vector_mult_vf);
      /* Update IVs of original loop as if they were advanced by
	 niters_vector_mult_vf steps.  */
      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
      update_e = skip_vector ? e : loop_preheader_edge (epilog);
      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
					update_e);

      if (skip_epilog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters, niters_vector_mult_vf);
	  guard_bb = single_exit (loop)->dest;
	  guard_to = split_edge (single_exit (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
					   skip_vector ? anchor : guard_bb,
					   prob_epilog.invert (),
					   irred_flag);
	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
					      single_exit (epilog));
	  /* Only need to handle basic block before epilog loop if it's not
	     the guard_bb, which is the case when skip_vector is true.  */
	  if (guard_bb != bb_before_epilog)
	    {
	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();

	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
	    }
	  scale_loop_profile (epilog, prob_epilog, 0);
	}
      else
	slpeel_update_phi_nodes_for_lcssa (epilog);

      unsigned HOST_WIDE_INT bound;
      if (bound_scalar.is_constant (&bound))
	{
	  gcc_assert (bound != 0);
	  /* -1 to convert loop iterations to latch iterations.  */
	  record_niter_bound (epilog, bound - 1, false, true);
	}

      delete_update_ssa ();
      adjust_vec_debug_stmts ();
      scev_reset ();
    }

  if (vect_epilogues)
    {
      epilog->aux = epilogue_vinfo;
      LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;

      loop_constraint_clear (epilog, LOOP_C_INFINITE);

      /* We now must calculate the number of iterations performed by the
	 previous loop and the EPILOGUE_NITERS to be performed by the
	 epilogue.  */
      tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
				 niters_prolog, niters_vector_mult_vf);

      /* If skip_vector we may skip the previous loop; we insert a phi-node to
	 determine whether we are coming from the previous vectorized loop
	 using the update_e edge or the skip_vector basic block using the
	 skip_e edge.  */
      if (skip_vector)
	{
	  gcc_assert (update_e != NULL && skip_e != NULL);
	  gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
					   update_e->dest);
	  tree new_ssa = make_ssa_name (TREE_TYPE (niters));
	  gimple *stmt = gimple_build_assign (new_ssa, niters);
	  gimple_stmt_iterator gsi;
	  if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
	      && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
	    {
	      gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
	      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
	    }
	  else
	    {
	      gsi = gsi_last_bb (update_e->src);
	      gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
	    }

	  niters = new_ssa;
	  add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
		       UNKNOWN_LOCATION);
	  niters = PHI_RESULT (new_phi);
	}

      /* Subtract the number of iterations performed by the vectorized loop
	 from the number of total iterations.  */
      tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
					  before_loop_niters,
					  niters);

      LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
      LOOP_VINFO_NITERSM1 (epilogue_vinfo)
	= fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
		       epilogue_niters,
		       build_one_cst (TREE_TYPE (epilogue_niters)));

      /* Set ADVANCE to the number of iterations performed by the previous
	 loop and its prologue.  */
      *advance = niters;

      /* Redo the peeling for niter analysis as the NITERs and alignment
	 may have been updated to take the main loop into account.  */
      determine_peel_for_niter (epilogue_vinfo);
    }

  adjust_vec.release ();
  free_original_copy_tables ();

  return vect_epilogues ? epilog : NULL;
}
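
/* Illustrative sketch (not part of GCC): the iteration bookkeeping done
   above when an epilogue is vectorized, in plain integers.  The epilogue
   executes whatever the prolog and the vector loop left over:  */
#if 0
static long long
toy_epilogue_niters (long long total_niters, long long niters_prolog,
		     long long niters_vector, long long vf)
{
  /* niters_vector_mult_vf corresponds to niters_vector << log2(vf).  */
  long long done = niters_prolog + niters_vector * vf;
  return total_niters - done;
}
#endif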
/* Function vect_create_cond_for_niters_checks.

   Create a conditional expression that represents the run-time checks for
   the loop's niter.  The loop is guaranteed to terminate if the run-time
   checks hold.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
		to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the
   if statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
{
  tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);

  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */

static void
chain_cond_expr (tree *cond_expr, tree part_cond_expr)
{
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment
   must be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
		LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
		LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
			 expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero and that this
	test can be done as a&(n-1) == 0.  For example, for 16
	byte vectors the test is a&0xf == 0.  */

static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
				   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  vec<stmt_vec_info> may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  stmt_vec_info stmt_info;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask + 1)) == 0));

  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      bool negative = tree_int_cst_compare
	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
      tree offset = negative
	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
					      offset);
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "addr2int%d", i);
      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
	{
	  /* create: or_tmp = or_tmp | addr_tmp */
	  sprintf (tmp_name, "orptrs%d", i);
	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type,
						NULL, tmp_name);
	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
					 or_tmp_name, addr_tmp_name);
	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
	  or_tmp_name = new_or_tmp_name;
	}
      else
	or_tmp_name = addr_tmp_name;
    }

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
				  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  chain_cond_expr (cond_expr, part_cond_expr);
}
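
/* Illustrative sketch (not part of GCC): the runtime test built above is
   ((addr_1 | ... | addr_n) & (n - 1)) == 0 for an n-byte vector size.
   A minimal standalone version of the same check:  */
#if 0
#include <stdint.h>

/* Return nonzero if every pointer is aligned to VECSIZE bytes.  VECSIZE
   must be a power of two, so the mask is VECSIZE - 1.  */
static int
toy_all_aligned (const void *const *ptrs, int count, uintptr_t vecsize)
{
  uintptr_t or_tmp = 0;
  for (int i = 0; i < count; i++)
    or_tmp |= (uintptr_t) ptrs[i];	/* OR the addresses together */
  return (or_tmp & (vecsize - 1)) == 0;	/* one AND test covers them all */
}
#endif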
/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
   create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
   Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and this new condition are true.  Treat a null *COND_EXPR as "true".  */

static void
vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  unsigned int i;
  vec_object_pair *pair;
  FOR_EACH_VEC_ELT (pairs, i, pair)
    {
      tree addr1 = build_fold_addr_expr (pair->first);
      tree addr2 = build_fold_addr_expr (pair->second);
      tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
					 addr1, addr2);
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
/* Create an expression that is true when all lower-bound conditions for
   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */

static void
vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    {
      tree expr = lower_bounds[i].expr;
      tree type = unsigned_type_for (TREE_TYPE (expr));
      expr = fold_convert (type, expr);
      poly_uint64 bound = lower_bounds[i].min_value;
      if (!lower_bounds[i].unsigned_p)
	{
	  expr = fold_build2 (PLUS_EXPR, type, expr,
			      build_int_cstu (type, bound - 1));
	  bound += bound - 1;
	}
      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
					 build_int_cstu (type, bound));
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
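
/* Illustrative sketch (not part of GCC): for the signed case above the
   test EXPR + (BOUND - 1) >= 2 * BOUND - 1, done in the unsigned type,
   is a single-compare range check; under the reading that the guarded
   condition is |EXPR| >= BOUND, it rejects exactly the values in
   (-BOUND, BOUND):  */
#if 0
/* True iff |expr| >= bound, with one unsigned comparison.  */
static int
toy_lower_bound_check (long expr, unsigned long bound)
{
  return (unsigned long) expr + (bound - 1) >= 2 * bound - 1;
}
/* E.g. bound = 16: 16 and -16 pass; 15 and -15 fail.  */
#endif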
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.  If it is NULL, then the function
		is used to return the number of alias checks.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
		to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
{
  vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);

  if (comp_alias_ddrs.is_empty ())
    return;

  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
			       &comp_alias_ddrs, cond_expr);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "created %u versioning for alias checks.\n",
		     comp_alias_ddrs.length ());
}
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both alias and alignment tests are passed.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   The versioning precondition(s) are placed in *COND_EXPR and
   *COND_EXPR_STMT_LIST.  */

class loop *
vect_loop_versioning (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
  poly_uint64 versioning_threshold
    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
  tree version_simd_if_cond
    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

  if (th >= vect_vf_for_cost (loop_vinfo)
      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
					&cond_expr_stmt_list,
					is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      if (flag_checking)
	if (basic_block bb
	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
	  gcc_assert (bb != loop->header
		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
		      && (scalar_loop == NULL
			  || (bb != scalar_loop->header
			      && dominated_by_p (CDI_DOMINATORS,
						 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
			    version_simd_if_cond, zero);
      if (cond_expr)
	cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
				 c, cond_expr);
      else
	cond_expr = c;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "created versioning for simd if condition check.\n");
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
	    && flow_bb_inside_loop_p (outermost, def_bb))
	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "trying to apply versioning to outer loop %d\n",
			 outermost->num);
      if (outermost->num == 0)
	outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
	     && single_exit (loop_outer (loop_to_version))
	     && (!loop_outer (loop_to_version)->inner->next
		 || vect_loop_vectorized_call (loop_to_version))
	     && (!loop_outer (loop_to_version)->inner->next
		 || !loop_outer (loop_to_version)->inner->next->next))
	loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  gimple *call;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && (call = vect_loop_vectorized_call (loop_to_version, &cond)))
    {
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (cond);
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_for_stmt (call);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* if-conversion uses profile_probability::always () for both paths;
	 reset the paths' probabilities appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale loops counts immediately but have to postpone
	 scaling the scalar loop because we re-use it during peeling.  */
      scale_loop_frequencies (loop_to_version, te->probability);
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;

      nloop = scalar_loop;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reusing %sloop version created by if conversion\n",
			 loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
	  && dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "applying loop versioning to outer loop %d\n",
			 loop_to_version->num);

      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
			    prob, prob.invert (), prob, prob.invert (), true);
      gcc_assert (nloop);
      nloop = get_loop_copy (loop);

      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
	 reap those otherwise; they also refer to the original
	 loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
	{
	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
	  fold_loop_internal_call (call, boolean_false_node);
	  l = loop_outer (l);
	}
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_last_bb (condition_bb);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* Loop versioning violates an assumption we try to maintain during
	 vectorization - that the loop exit block has a single predecessor.
	 After versioning, the exit block of both loop versions is the same
	 basic block (i.e. it has two predecessors).  Just in order to simplify
	 following transformations in the vectorizer, we fix this situation
	 here by adding a new (empty) block on the exit-edge of the loop,
	 with the proper loop-exit phis to maintain loop-closed-form.
	 If loop versioning wasn't done from loop, but scalar_loop instead,
	 merge_bb will have already just a single successor.  */

      merge_bb = single_exit (loop_to_version)->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
	{
	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
	  new_exit_bb = split_edge (single_exit (loop_to_version));
	  new_exit_e = single_exit (loop_to_version);
	  e = EDGE_SUCC (new_exit_bb, 0);

	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      tree new_res;
	      orig_phi = gsi.phi ();
	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	      new_phi = create_phi_node (new_res, new_exit_bb);
	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	      add_phi_arg (new_phi, arg, new_exit_e,
			   gimple_phi_arg_location_from_edge (orig_phi, e));
	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	    }
	}

      update_ssa (TODO_update_ssa);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
	 niter information which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization to enhance "