/*
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "tree-pass.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
69 as if it was manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 for (i=0; i<N/8; i++){
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors, for now will need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
/* Forward declarations for helpers defined later in this file.
   NOTE(review): the extraction dropped source lines here (gaps in the
   embedded numbering); the second declaration is visibly truncated.  */
155 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
156 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Computes the vector types for one statement and
   folds its number of units into the vectorization factor *VF.  */
164 vect_determine_vf_for_stmt_1 (vec_info
*vinfo
, stmt_vec_info stmt_info
,
165 bool vectype_maybe_set_p
,
168 gimple
*stmt
= stmt_info
->stmt
;
/* Irrelevant, dead, or clobber statements need no vectype; skip them.  */
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
171 && !STMT_VINFO_LIVE_P (stmt_info
))
172 || gimple_clobber_p (stmt
))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype
, nunits_vectype
;
180 opt_result res
= vect_get_vector_types_for_stmt (vinfo
, stmt_info
,
188 if (STMT_VINFO_VECTYPE (stmt_info
))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
193 || vectype_maybe_set_p
)
194 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
196 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
/* Fold this statement's number of units into the running VF.  */
200 vect_update_max_nunits (vf
, nunits_vectype
);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Analyzes STMT_INFO, and if it was replaced by a
   pattern, also the pattern statement and its def sequence.  */
211 vect_determine_vf_for_stmt (vec_info
*vinfo
,
212 stmt_vec_info stmt_info
, poly_uint64
*vf
)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
217 opt_result res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, false, vf
);
/* When the statement participates in a pattern, switch to the pattern
   statement and analyze its def stmts as well.  */
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
222 && STMT_VINFO_RELATED_STMT (stmt_info
))
224 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
225 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
229 !gsi_end_p (si
); gsi_next (&si
))
231 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE
, vect_location
,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info
->stmt
);
236 res
= vect_determine_vf_for_stmt_1 (vinfo
, def_stmt_info
, true, vf
);
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE
, vect_location
,
243 "==> examining pattern statement: %G",
245 res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, true, vf
);
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4byte elements,
258 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
   }  */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Walks every phi and statement of every loop BB,
   assigns vector types and accumulates the vectorization factor into
   LOOP_VINFO_VECT_FACTOR.  */
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
281 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
282 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
283 unsigned nbbs
= loop
->num_nodes
;
284 poly_uint64 vectorization_factor
= 1;
285 tree scalar_type
= NULL_TREE
;
288 stmt_vec_info stmt_info
;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i
= 0; i
< nbbs
; i
++)
295 basic_block bb
= bbs
[i
];
/* First pass over the block: the phis.  */
297 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
301 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
306 gcc_assert (stmt_info
);
308 if (STMT_VINFO_RELEVANT_P (stmt_info
)
309 || STMT_VINFO_LIVE_P (stmt_info
))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
312 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE
, vect_location
,
316 "get vectype for scalar type: %T\n",
319 vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
321 return opt_result::failure_at (phi
,
322 "not vectorized: unsupported "
325 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
334 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
335 dump_printf (MSG_NOTE
, "\n");
338 vect_update_max_nunits (&vectorization_factor
, vectype
);
/* Second pass: the non-debug statements of the block.  */
342 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
345 if (is_gimple_debug (gsi_stmt (si
)))
347 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
349 = vect_determine_vf_for_stmt (loop_vinfo
,
350 stmt_info
, &vectorization_factor
);
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
360 dump_dec (MSG_NOTE
, vectorization_factor
);
361 dump_printf (MSG_NOTE
, "\n");
/* A factor known to be <= 1 means nothing was vectorizable.  */
364 if (known_le (vectorization_factor
, 1U))
365 return opt_result::failure_at (vect_location
,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variables in the loop is
375 considered a polynomial evolution. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Checks that ACCESS_FN is a simple (degree-1)
   polynomial evolution in loop LOOP_NB and extracts its init and step.  */
378 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
383 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
386 /* When there is no evolution in this loop, the evolution function
is a constant.  */
388 if (evolution_part
== NULL_TREE
)
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part
))
396 step_expr
= evolution_part
;
397 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
401 step_expr
, init_expr
);
/* Reject steps that are neither constants nor loop-invariant SSA names,
   and floating-point steps unless -fassociative-math allows reordering.  */
406 if (TREE_CODE (step_expr
) != INTEGER_CST
407 && (TREE_CODE (step_expr
) != SSA_NAME
408 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
409 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
412 || !flag_associative_math
)))
413 && (TREE_CODE (step_expr
) != REAL_CST
414 || !flag_associative_math
))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
430 x_1 = PHI <x_4(outer2), ...>;
434 x_2 = PHI <x_1(outer1), ...>;
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Returns whether PHI uses a def already classified
   as vect_double_reduction_def (see the header comment above).  */
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo
, gphi
*phi
)
451 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
452 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
453 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
enclosing LOOP).  */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Classifies each loop-header phi of LOOP as an
   induction, reduction, double reduction, or nested cycle.  */
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
)
468 basic_block bb
= loop
->header
;
470 auto_vec
<stmt_vec_info
, 64> worklist
;
472 bool double_reduc
, reduc_chain
;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
changed.  */
479 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
481 gphi
*phi
= gsi
.phi ();
482 tree access_fn
= NULL
;
483 tree def
= PHI_RESULT (phi
);
484 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def
))
494 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
496 /* Analyze the evolution function. */
497 access_fn
= analyze_scalar_evolution (loop
, def
);
500 STRIP_NOPS (access_fn
);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE
, vect_location
,
503 "Access function of PHI: %T\n", access_fn
);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
505 = initial_condition_in_loop_num (access_fn
, loop
->num
);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
507 = evolution_part_in_loop_num (access_fn
, loop
->num
);
/* Phis that are not simple IVs are queued for reduction analysis below.  */
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo
, phi
)
512 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
513 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
514 && TREE_CODE (step
) != INTEGER_CST
))
516 worklist
.safe_push (stmt_vinfo
);
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist
.length () > 0)
533 stmt_vec_info stmt_vinfo
= worklist
.pop ();
534 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
535 tree def
= PHI_RESULT (phi
);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
540 gcc_assert (!virtual_operand_p (def
)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
,
/* Link the phi and the reduction statement to each other.  */
548 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE
, vect_location
,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
561 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE
, vect_location
,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE
, vect_location
,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
chain.  */
581 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also to its
600 inner-loop, if exists.
601 Examples for scalar cycles:
(examples lost in extraction)  */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Runs cycle analysis on the loop and, per the
   comment below, on its inner loop as well.  */
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
618 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
620 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
632 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
635 /* Transfer group and reduction information from STMT_INFO to its
pattern stmt.  */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Rewires a reduction chain so group links point at
   the related pattern statements instead of the original ones.  */
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
641 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
645 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
/* Walk the chain, pointing each pattern stmt at the new first element.  */
648 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp
)
650 == STMT_VINFO_DEF_TYPE (stmt_info
));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
652 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
654 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
655 = STMT_VINFO_RELATED_STMT (stmt_info
);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  For each recorded reduction chain, either regroup
   it over pattern stmts or dissolve it into a plain reduction.  */
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
670 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
/* Scan the chain checking whether all members agree on pattern-ness
   and have a valid reduction index.  */
673 if ((STMT_VINFO_IN_PATTERN_P (next
)
674 != STMT_VINFO_IN_PATTERN_P (first
))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next
)) == -1)
677 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first
)) != -1)
684 if (STMT_VINFO_IN_PATTERN_P (first
))
686 vect_fixup_reduc_chain (first
);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
688 = STMT_VINFO_RELATED_STMT (first
);
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
696 stmt_vec_info vinfo
= first
;
697 stmt_vec_info last
= NULL
;
700 next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first
))
708 loop_vinfo
->reductions
.safe_push (vect_stmt_to_vectorize (last
));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).unordered_remove (i
);
715 /* Function vect_get_loop_niters.
717 Determine how many iterations the loop is executed and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
722 Return the loop exit condition. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  See the header comment above for the contract.  */
726 vect_get_loop_niters (class loop
*loop
, tree
*assumptions
,
727 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
729 edge exit
= single_exit (loop
);
730 class tree_niter_desc niter_desc
;
731 tree niter_assumptions
, niter
, may_be_zero
;
732 gcond
*cond
= get_loop_exit_condition (loop
);
/* Conservative defaults until analysis succeeds.  */
734 *assumptions
= boolean_true_node
;
735 *number_of_iterationsm1
= chrec_dont_know
;
736 *number_of_iterations
= chrec_dont_know
;
737 DUMP_VECT_SCOPE ("get_loop_niters");
742 may_be_zero
= NULL_TREE
;
743 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
744 || chrec_contains_undetermined (niter_desc
.niter
))
747 niter_assumptions
= niter_desc
.assumptions
;
748 may_be_zero
= niter_desc
.may_be_zero
;
749 niter
= niter_desc
.niter
;
751 if (may_be_zero
&& integer_zerop (may_be_zero
))
752 may_be_zero
= NULL_TREE
;
756 if (COMPARISON_CLASS_P (may_be_zero
))
758 /* Try to combine may_be_zero with assumptions, this can simplify
759 computation of niter expression. */
760 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
761 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
763 fold_build1 (TRUTH_NOT_EXPR
,
/* Fold the may-be-zero condition into the niter expression itself.  */
767 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
768 build_int_cst (TREE_TYPE (niter
), 0),
769 rewrite_to_non_trapping_overflow (niter
));
771 may_be_zero
= NULL_TREE
;
773 else if (integer_nonzerop (may_be_zero
))
775 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
776 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
783 *assumptions
= niter_assumptions
;
784 *number_of_iterationsm1
= niter
;
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter
&& !chrec_contains_undetermined (niter
))
791 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
792 build_int_cst (TREE_TYPE (niter
), 1));
793 *number_of_iterations
= niter
;
798 /* Function bb_in_loop_p
800 Used as predicate for dfs order traversal of the loop bbs. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  DATA is the loop; tests BB membership.  */
803 bb_in_loop_p (const_basic_block bb
, const void *data
)
805 const class loop
*const loop
= (const class loop
*)data
;
806 if (flow_bb_inside_loop_p (loop
, bb
))
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Initializes all fields, orders the BBs, zeroes
   statement UIDs, and records any #pragma omp simd if() condition.  */
815 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
816 : vec_info (vec_info::loop
, init_cost (loop_in
), shared
),
818 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
819 num_itersm1 (NULL_TREE
),
820 num_iters (NULL_TREE
),
821 num_iters_unchanged (NULL_TREE
),
822 num_iters_assumptions (NULL_TREE
),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE
),
828 rgroup_compare_type (NULL_TREE
),
829 simd_if_cond (NULL_TREE
),
831 peeling_for_alignment (0),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage
!= 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
849 orig_loop_info (NULL
)
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
856 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
857 bbs
, loop
->num_nodes
, loop
);
858 gcc_assert (nbbs
== loop
->num_nodes
);
/* Reset statement UIDs so the vectorizer can assign its own.  */
860 for (unsigned int i
= 0; i
< nbbs
; i
++)
862 basic_block bb
= bbs
[i
];
863 gimple_stmt_iterator si
;
865 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
867 gimple
*phi
= gsi_stmt (si
);
868 gimple_set_uid (phi
, 0);
872 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
874 gimple
*stmt
= gsi_stmt (si
);
875 gimple_set_uid (stmt
, 0);
876 if (is_gimple_debug (stmt
))
879 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition, when 0,
881 loop shouldn't be vectorized, when non-zero constant, it should
882 be vectorized normally, otherwise versioned with vectorized loop
883 done if the condition is non-zero at runtime. */
885 && is_gimple_call (stmt
)
886 && gimple_call_internal_p (stmt
)
887 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt
) >= 3
889 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
891 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
893 tree arg
= gimple_call_arg (stmt
, 2);
894 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
897 gcc_assert (integer_nonzerop (arg
));
902 epilogue_vinfos
.create (6);
905 /* Free all levels of rgroup CONTROLS. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Releases each rgroup's control vector, then the
   outer vector itself.  */
908 release_vec_loop_controls (vec
<rgroup_controls
> *controls
)
910 rgroup_controls
*rgc
;
912 FOR_EACH_VEC_ELT (*controls
, i
, rgc
)
913 rgc
->controls
.release ();
914 controls
->release ();
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Releases mask/length rgroups and epilogue infos.  */
920 _loop_vec_info::~_loop_vec_info ()
924 release_vec_loop_controls (&masks
);
925 release_vec_loop_controls (&lens
);
928 epilogue_vinfos
.release ();
933 /* Return an invariant or register for EXPR and emit necessary
934 computations in the LOOP_VINFO loop preheader. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Caches gimplified copies of EXPR in ivexpr_map so
   repeated requests reuse the same preheader computation.  */
937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
939 if (is_gimple_reg (expr
)
940 || is_gimple_min_invariant (expr
))
/* Lazily create the cache map on first use.  */
943 if (! loop_vinfo
->ivexpr_map
)
944 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
945 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
948 gimple_seq stmts
= NULL
;
949 cached
= force_gimple_operand (unshare_expr (expr
),
950 &stmts
, true, NULL_TREE
);
/* Emit the gimplification sequence on the preheader edge.  */
953 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
954 gsi_insert_seq_on_edge_immediate (e
, stmts
);
960 /* Return true if we can use CMP_TYPE as the comparison type to produce
961 all masks required to mask LOOP_VINFO. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Checks IFN_WHILE_ULT support for every recorded
   mask rgroup against CMP_TYPE.  */
964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
966 rgroup_controls
*rgm
;
968 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
969 if (rgm
->type
!= NULL_TREE
970 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
977 /* Calculate the maximum number of scalars per iteration for every
978 rgroup in LOOP_VINFO. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Takes the maximum of max_nscalars_per_iter over
   all mask rgroups, with a floor of 1.  */
981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
983 unsigned int res
= 1;
985 rgroup_controls
*rgm
;
986 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
987 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
991 /* Calculate the minimum precision necessary to represent:
995 as an unsigned integer, where MAX_NITERS is the maximum number of
996 loop header iterations for the original scalar form of LOOP_VINFO. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Bounds MAX_NITERS by the counter type and the
   loop's iteration estimate, then returns the bits for MAX_NITERS*FACTOR.  */
999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo
, unsigned int factor
)
1001 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1003 /* Get the maximum number of iterations that is representable
1004 in the counter type. */
1005 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1006 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1008 /* Get a more refined estimate for the number of iterations. */
1009 widest_int max_back_edges
;
1010 if (max_loop_iterations (loop
, &max_back_edges
))
1011 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1013 /* Work out how many bits we need to represent the limit. */
1014 return wi::min_precision (max_ni
* factor
, UNSIGNED
);
1017 /* True if the loop needs peeling or partial vectors when vectorized. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Decides, from niters/VF/peeling information,
   whether an epilogue (peeling) or partial vectors would be needed.  */
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo
)
1022 unsigned HOST_WIDE_INT const_vf
;
1023 HOST_WIDE_INT max_niter
1024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
/* Fall back to the original loop's threshold when this is an epilogue.  */
1026 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1027 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1028 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1034 /* Work out the (constant) number of iterations that need to be
1035 peeled for reasons other than niters. */
1036 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
1039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1040 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1044 /* ??? When peeling for gaps but not alignment, we could
1045 try to check whether the (variable) niters is known to be
1046 VF * N + 1. That's something of a niche case though. */
1047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1050 < (unsigned) exact_log2 (const_vf
))
1051 /* In case of versioning, check if the maximum number of
1052 iterations is greater than th. If they are identical,
1053 the epilogue is unnecessary. */
1054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1055 || ((unsigned HOST_WIDE_INT
) max_niter
1056 > (th
/ const_vf
) * const_vf
))))
1062 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1063 whether we can actually generate the masks required. Return true if so,
1064 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
/* NOTE(review): lines are missing from this extraction (numbering gaps);
   text kept verbatim.  Searches the integer modes for a WHILE_ULT-capable
   comparison type and records the chosen compare/IV types in LOOP_VINFO.  */
1067 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1069 unsigned int min_ni_width
;
1070 unsigned int max_nscalars_per_iter
1071 = vect_get_max_nscalars_per_iter (loop_vinfo
);
1073 /* Use a normal loop if there are no statements that need masking.
1074 This only happens in rare degenerate cases: it means that the loop
1075 has no loads, no stores, and no live-out values. */
1076 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1079 /* Work out how many bits we need to represent the limit. */
1081 = vect_min_prec_for_max_niters (loop_vinfo
, max_nscalars_per_iter
)
1083 /* Find a scalar mode for which WHILE_ULT is supported. */
1084 opt_scalar_int_mode cmp_mode_iter
;
1085 tree cmp_type
= NULL_TREE
;
1086 tree iv_type
= NULL_TREE
;
1087 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1088 unsigned int iv_precision
= UINT_MAX
;
1091 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
1094 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1096 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1097 if (cmp_bits
>= min_ni_width
1098 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1100 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1102 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1104 /* Although we could stop as soon as we find a valid mode,
1105 there are at least two reasons why that's not always the
1108 - An IV that's Pmode or wider is more likely to be reusable
1109 in address calculations than an IV that's narrower than
1112 - Doing the comparison in IV_PRECISION or wider allows
1113 a natural 0-based IV, whereas using a narrower comparison
1114 type requires mitigations against wrap-around.
1116 Conversely, if the IV limit is variable, doing the comparison
1117 in a wider type than the original type can introduce
1118 unnecessary extensions, so picking the widest valid mode
1119 is not always a good choice either.
1121 Here we prefer the first IV type that's Pmode or wider,
1122 and the first comparison type that's IV_PRECISION or wider.
1123 (The comparison type must be no wider than the IV type,
1124 to avoid extensions in the vector loop.)
1126 ??? We might want to try continuing beyond Pmode for ILP32
1127 targets if CMP_BITS < IV_PRECISION. */
1128 iv_type
= this_type
;
1129 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1130 cmp_type
= this_type
;
1131 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
/* Record the chosen types for the rgroup machinery.  */
1140 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1141 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1145 /* Check whether we can use vector access with length based on precison
1146 comparison. So far, to keep it simple, we only allow the case that the
1147 precision of the target supported length is larger than the precision
1148 required by loop niters. */
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo
)
1153 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
1156 unsigned int max_nitems_per_iter
= 1;
1158 rgroup_controls
*rgl
;
1159 /* Find the maximum number of items per iteration for every rgroup. */
1160 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), i
, rgl
)
1162 unsigned nitems_per_iter
= rgl
->max_nscalars_per_iter
* rgl
->factor
;
1163 max_nitems_per_iter
= MAX (max_nitems_per_iter
, nitems_per_iter
);
1166 /* Work out how many bits we need to represent the length limit. */
1167 unsigned int min_ni_prec
1168 = vect_min_prec_for_max_niters (loop_vinfo
, max_nitems_per_iter
);
1170 /* Now use the maximum of below precisions for one suitable IV type:
1171 - the IV's natural precision
1172 - the precision needed to hold: the maximum number of scalar
1173 iterations multiplied by the scale factor (min_ni_prec above)
1174 - the Pmode precision
1176 If min_ni_prec is less than the precision of the current niters,
1177 we perfer to still use the niters type. Prefer to use Pmode and
1178 wider IV to avoid narrow conversions. */
1180 unsigned int ni_prec
1181 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)));
1182 min_ni_prec
= MAX (min_ni_prec
, ni_prec
);
1183 min_ni_prec
= MAX (min_ni_prec
, GET_MODE_BITSIZE (Pmode
));
1185 tree iv_type
= NULL_TREE
;
1186 opt_scalar_int_mode tmode_iter
;
1187 FOR_EACH_MODE_IN_CLASS (tmode_iter
, MODE_INT
)
1189 scalar_mode tmode
= tmode_iter
.require ();
1190 unsigned int tbits
= GET_MODE_BITSIZE (tmode
);
1192 /* ??? Do we really want to construct one IV whose precision exceeds
1194 if (tbits
> BITS_PER_WORD
)
1197 /* Find the first available standard integral type. */
1198 if (tbits
>= min_ni_prec
&& targetm
.scalar_mode_supported_p (tmode
))
1200 iv_type
= build_nonstandard_integer_type (tbits
, true);
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1209 "can't vectorize with length-based partial vectors"
1210 " because there is no suitable iv type.\n");
1214 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = iv_type
;
1215 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1220 /* Calculate the cost of one scalar iteration of the loop. */
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1224 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1225 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1226 int nbbs
= loop
->num_nodes
, factor
;
1227 int innerloop_iters
, i
;
1229 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1231 /* Gather costs for statements in the scalar loop. */
1234 innerloop_iters
= 1;
1236 innerloop_iters
= 50; /* FIXME */
1238 for (i
= 0; i
< nbbs
; i
++)
1240 gimple_stmt_iterator si
;
1241 basic_block bb
= bbs
[i
];
1243 if (bb
->loop_father
== loop
->inner
)
1244 factor
= innerloop_iters
;
1248 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1250 gimple
*stmt
= gsi_stmt (si
);
1251 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
1253 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1258 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1259 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1260 || !VECTORIZABLE_CYCLE_DEF
1261 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
1264 vect_cost_for_stmt kind
;
1265 if (STMT_VINFO_DATA_REF (stmt_info
))
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1270 kind
= scalar_store
;
1272 else if (vect_nop_conversion_p (stmt_info
))
1277 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1278 factor
, kind
, stmt_info
, 0, vect_prologue
);
1282 /* Now accumulate cost. */
1283 void *target_cost_data
= init_cost (loop
);
1284 stmt_info_for_cost
*si
;
1286 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1288 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, si
->count
,
1289 si
->kind
, si
->stmt_info
, si
->vectype
,
1290 si
->misalign
, vect_body
);
1291 unsigned dummy
, body_cost
= 0;
1292 finish_cost (target_cost_data
, &dummy
, &body_cost
, &dummy
);
1293 destroy_cost_data (target_cost_data
);
1294 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
) = body_cost
;
1298 /* Function vect_analyze_loop_form_1.
1300 Verify that certain CFG restrictions hold, including:
1301 - the loop has a pre-header
1302 - the loop has a single entry and exit
1303 - the loop exit condition is simple enough
1304 - the number of iterations can be analyzed, i.e, a countable loop. The
1305 niter could be analyzed under some assumptions. */
1308 vect_analyze_loop_form_1 (class loop
*loop
, gcond
**loop_cond
,
1309 tree
*assumptions
, tree
*number_of_iterationsm1
,
1310 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1312 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1314 /* Different restrictions apply when we are considering an inner-most loop,
1315 vs. an outer (nested) loop.
1316 (FORNOW. May want to relax some of these restrictions in the future). */
1320 /* Inner-most loop. We currently require that the number of BBs is
1321 exactly 2 (the header and latch). Vectorizable inner-most loops
1332 if (loop
->num_nodes
!= 2)
1333 return opt_result::failure_at (vect_location
,
1335 " control flow in loop.\n");
1337 if (empty_block_p (loop
->header
))
1338 return opt_result::failure_at (vect_location
,
1339 "not vectorized: empty loop.\n");
1343 class loop
*innerloop
= loop
->inner
;
1346 /* Nested loop. We currently require that the loop is doubly-nested,
1347 contains a single inner loop, and the number of BBs is exactly 5.
1348 Vectorizable outer-loops look like this:
1360 The inner-loop has the properties expected of inner-most loops
1361 as described above. */
1363 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1364 return opt_result::failure_at (vect_location
,
1366 " multiple nested loops.\n");
1368 if (loop
->num_nodes
!= 5)
1369 return opt_result::failure_at (vect_location
,
1371 " control flow in loop.\n");
1373 entryedge
= loop_preheader_edge (innerloop
);
1374 if (entryedge
->src
!= loop
->header
1375 || !single_exit (innerloop
)
1376 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1377 return opt_result::failure_at (vect_location
,
1379 " unsupported outerloop form.\n");
1381 /* Analyze the inner-loop. */
1382 tree inner_niterm1
, inner_niter
, inner_assumptions
;
1384 = vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1385 &inner_assumptions
, &inner_niterm1
,
1386 &inner_niter
, NULL
);
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1391 "not vectorized: Bad inner loop.\n");
1395 /* Don't support analyzing niter under assumptions for inner
1397 if (!integer_onep (inner_assumptions
))
1398 return opt_result::failure_at (vect_location
,
1399 "not vectorized: Bad inner loop.\n");
1401 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1402 return opt_result::failure_at (vect_location
,
1403 "not vectorized: inner-loop count not"
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE
, vect_location
,
1408 "Considering outer-loop vectorization.\n");
1411 if (!single_exit (loop
))
1412 return opt_result::failure_at (vect_location
,
1413 "not vectorized: multiple exits.\n");
1414 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1415 return opt_result::failure_at (vect_location
,
1417 " too many incoming edges.\n");
1419 /* We assume that the loop exit condition is at the end of the loop. i.e,
1420 that the loop is represented as a do-while (with a proper if-guard
1421 before the loop if needed), where the loop header contains all the
1422 executable statements, and the latch is empty. */
1423 if (!empty_block_p (loop
->latch
)
1424 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1425 return opt_result::failure_at (vect_location
,
1426 "not vectorized: latch block not empty.\n");
1428 /* Make sure the exit is not abnormal. */
1429 edge e
= single_exit (loop
);
1430 if (e
->flags
& EDGE_ABNORMAL
)
1431 return opt_result::failure_at (vect_location
,
1433 " abnormal loop exit edge.\n");
1435 *loop_cond
= vect_get_loop_niters (loop
, assumptions
, number_of_iterations
,
1436 number_of_iterationsm1
);
1438 return opt_result::failure_at
1440 "not vectorized: complicated exit condition.\n");
1442 if (integer_zerop (*assumptions
)
1443 || !*number_of_iterations
1444 || chrec_contains_undetermined (*number_of_iterations
))
1445 return opt_result::failure_at
1447 "not vectorized: number of iterations cannot be computed.\n");
1449 if (integer_zerop (*number_of_iterations
))
1450 return opt_result::failure_at
1452 "not vectorized: number of iterations = 0.\n");
1454 return opt_result::success ();
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1460 vect_analyze_loop_form (class loop
*loop
, vec_info_shared
*shared
)
1462 tree assumptions
, number_of_iterations
, number_of_iterationsm1
;
1463 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1466 = vect_analyze_loop_form_1 (loop
, &loop_cond
,
1467 &assumptions
, &number_of_iterationsm1
,
1468 &number_of_iterations
, &inner_loop_cond
);
1470 return opt_loop_vec_info::propagate_failure (res
);
1472 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
, shared
);
1473 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1474 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1475 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1476 if (!integer_onep (assumptions
))
1478 /* We consider to vectorize this loop by versioning it under
1479 some assumptions. In order to do this, we need to clear
1480 existing information computed by scev and niter analyzer. */
1482 free_numbers_of_iterations_estimates (loop
);
1483 /* Also set flag for this loop so that following scev and niter
1484 analysis are done under the assumptions. */
1485 loop_constraint_set (loop
, LOOP_C_FINITE
);
1486 /* Also record the assumptions for versioning. */
1487 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = assumptions
;
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1492 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_NOTE
, vect_location
,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1497 dump_printf (MSG_NOTE
, "\n");
1501 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (loop_cond
);
1502 STMT_VINFO_TYPE (loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1503 if (inner_loop_cond
)
1505 stmt_vec_info inner_loop_cond_info
1506 = loop_vinfo
->lookup_stmt (inner_loop_cond
);
1507 STMT_VINFO_TYPE (inner_loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1510 gcc_assert (!loop
->aux
);
1511 loop
->aux
= loop_vinfo
;
1512 return opt_loop_vec_info::success (loop_vinfo
);
1517 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1518 statements update the vectorization factor. */
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1523 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1524 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1525 int nbbs
= loop
->num_nodes
;
1526 poly_uint64 vectorization_factor
;
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1531 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1532 gcc_assert (known_ne (vectorization_factor
, 0U));
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say, that we
1537 perform pure SLP on loop - cross iteration parallelism is not
1539 bool only_slp_in_loop
= true;
1540 for (i
= 0; i
< nbbs
; i
++)
1542 basic_block bb
= bbs
[i
];
1543 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1546 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (si
.phi ());
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1551 && !PURE_SLP_STMT (stmt_info
))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop
= false;
1555 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1558 if (is_gimple_debug (gsi_stmt (si
)))
1560 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
1561 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1564 && !PURE_SLP_STMT (stmt_info
))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop
= false;
1570 if (only_slp_in_loop
)
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE
, vect_location
,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE
, vect_location
,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor
,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1591 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE
, vect_location
,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE
, vectorization_factor
);
1596 dump_printf (MSG_NOTE
, ".\n");
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1605 x_1 = PHI <x_3(outer2), ...>;
1613 x_3 = PHI <x_2(inner)>;
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
1620 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info
));
1626 /* Function vect_analyze_loop_operations.
1628 Scan the loop stmts and make sure they are all vectorizable. */
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1633 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1634 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1635 int nbbs
= loop
->num_nodes
;
1637 stmt_vec_info stmt_info
;
1638 bool need_to_vectorize
= false;
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1643 auto_vec
<stmt_info_for_cost
> cost_vec
;
1645 for (i
= 0; i
< nbbs
; i
++)
1647 basic_block bb
= bbs
[i
];
1649 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1652 gphi
*phi
= si
.phi ();
1655 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: %G", phi
);
1658 if (virtual_operand_p (gimple_phi_result (phi
)))
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb
))
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outerloop (unless it is double reduction,
1667 i.e., this phi is vect_reduction_def), cause this case
1668 requires to actually do something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info
)
1670 && !vect_active_double_reduction_p (stmt_info
))
1671 return opt_result::failure_at (phi
,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1681 if (gimple_phi_num_args (phi
) != 1)
1682 return opt_result::failure_at (phi
, "unsupported phi");
1684 phi_op
= PHI_ARG_DEF (phi
, 0);
1685 stmt_vec_info op_def_info
= loop_vinfo
->lookup_def (phi_op
);
1687 return opt_result::failure_at (phi
, "unsupported phi\n");
1689 if (STMT_VINFO_RELEVANT (op_def_info
) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info
)
1691 != vect_used_in_outer_by_reduction
))
1692 return opt_result::failure_at (phi
, "unsupported phi\n");
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1696 == vect_double_reduction_def
))
1697 && !vectorizable_lc_phi (loop_vinfo
,
1698 stmt_info
, NULL
, NULL
))
1699 return opt_result::failure_at (phi
, "unsupported phi\n");
1705 gcc_assert (stmt_info
);
1707 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info
))
1709 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi
,
1713 " scalar dependence cycle.\n");
1715 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1717 need_to_vectorize
= true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info
))
1720 ok
= vectorizable_induction (loop_vinfo
,
1721 stmt_info
, NULL
, NULL
,
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1725 == vect_double_reduction_def
)
1726 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1727 && ! PURE_SLP_STMT (stmt_info
))
1728 ok
= vectorizable_reduction (loop_vinfo
,
1729 stmt_info
, NULL
, NULL
, &cost_vec
);
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1734 && STMT_VINFO_LIVE_P (stmt_info
)
1735 && !PURE_SLP_STMT (stmt_info
))
1736 ok
= vectorizable_live_operation (loop_vinfo
,
1737 stmt_info
, NULL
, NULL
, NULL
,
1738 -1, false, &cost_vec
);
1741 return opt_result::failure_at (phi
,
1742 "not vectorized: relevant phi not "
1744 static_cast <gimple
*> (phi
));
1747 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1750 gimple
*stmt
= gsi_stmt (si
);
1751 if (!gimple_clobber_p (stmt
)
1752 && !is_gimple_debug (stmt
))
1755 = vect_analyze_stmt (loop_vinfo
,
1756 loop_vinfo
->lookup_stmt (stmt
),
1758 NULL
, NULL
, &cost_vec
);
1765 add_stmt_costs (loop_vinfo
, loop_vinfo
->target_cost_data
, &cost_vec
);
1767 /* All operations in the loop are either irrelevant (deal with loop
1768 control, or dead), or only used outside the loop and can be moved
1769 out of the loop (e.g. invariants, inductions). The loop can be
1770 optimized away by scalar optimizations. We're better off not
1771 touching this loop. */
1772 if (!need_to_vectorize
)
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE
, vect_location
,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1782 return opt_result::success ();
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo
)
1792 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1794 HOST_WIDE_INT max_niter
;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1796 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1798 max_niter
= max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1800 if (max_niter
!= -1 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo
)
1813 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1814 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo
))
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1830 /* If using the "very cheap" model. reject cases in which we'd keep
1831 a copy of the scalar code (even if we might be able to vectorize it). */
1832 if (flag_vect_cost_model
== VECT_COST_MODEL_VERY_CHEAP
1833 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1834 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1835 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)))
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1839 "some scalar iterations would need to be peeled\n");
1843 int min_profitable_iters
, min_profitable_estimate
;
1844 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1845 &min_profitable_estimate
);
1847 if (min_profitable_iters
< 0)
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1851 "not vectorized: vectorization not profitable.\n");
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1854 "not vectorized: vector version will never be "
1859 int min_scalar_loop_bound
= (param_min_vect_loop_bound
1862 /* Use the cost model only if it is more conservative than user specified
1864 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
1865 min_profitable_iters
);
1867 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1869 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1870 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1874 "not vectorized: vectorization not profitable.\n");
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_NOTE
, vect_location
,
1877 "not vectorized: iteration count smaller than user "
1878 "specified loop bound parameter or minimum profitable "
1879 "iterations (whichever is more conservative).\n");
1883 /* The static profitablity threshold min_profitable_estimate includes
1884 the cost of having to check at runtime whether the scalar loop
1885 should be used instead. If it turns out that we don't need or want
1886 such a check, the threshold we should use for the static estimate
1887 is simply the point at which the vector loop becomes more profitable
1888 than the scalar loop. */
1889 if (min_profitable_estimate
> min_profitable_iters
1890 && !LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1891 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
1892 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1893 && !vect_apply_runtime_profitability_check_p (loop_vinfo
))
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE
, vect_location
, "no need for a runtime"
1897 " choice between the scalar and vector loops\n");
1898 min_profitable_estimate
= min_profitable_iters
;
1901 /* If the vector loop needs multiple iterations to be beneficial then
1902 things are probably too close to call, and the conservative thing
1903 would be to stick with the scalar code. */
1904 if (flag_vect_cost_model
== VECT_COST_MODEL_VERY_CHEAP
1905 && min_profitable_estimate
> (int) vect_vf_for_cost (loop_vinfo
))
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1909 "one iteration of the vector loop would be"
1910 " more expensive than the equivalent number of"
1911 " iterations of the scalar loop\n");
1915 HOST_WIDE_INT estimated_niter
;
1917 /* If we are vectorizing an epilogue then we know the maximum number of
1918 scalar iterations it will cover is at least one lower than the
1919 vectorization factor of the main loop. */
1920 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
1922 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
)) - 1;
1925 estimated_niter
= estimated_stmt_executions_int (loop
);
1926 if (estimated_niter
== -1)
1927 estimated_niter
= likely_max_stmt_executions_int (loop
);
1929 if (estimated_niter
!= -1
1930 && ((unsigned HOST_WIDE_INT
) estimated_niter
1931 < MAX (th
, (unsigned) min_profitable_estimate
)))
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1935 "not vectorized: estimated iteration count too "
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE
, vect_location
,
1939 "not vectorized: estimated iteration count smaller "
1940 "than specified loop bound parameter or minimum "
1941 "profitable iterations (whichever is more "
1942 "conservative).\n");
1950 vect_get_datarefs_in_loop (loop_p loop
, basic_block
*bbs
,
1951 vec
<data_reference_p
> *datarefs
,
1952 unsigned int *n_stmts
)
1955 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1956 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
1957 !gsi_end_p (gsi
); gsi_next (&gsi
))
1959 gimple
*stmt
= gsi_stmt (gsi
);
1960 if (is_gimple_debug (stmt
))
1963 opt_result res
= vect_find_stmt_data_reference (loop
, stmt
, datarefs
,
1967 if (is_gimple_call (stmt
) && loop
->safelen
)
1969 tree fndecl
= gimple_call_fndecl (stmt
), op
;
1970 if (fndecl
!= NULL_TREE
)
1972 cgraph_node
*node
= cgraph_node::get (fndecl
);
1973 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
1975 unsigned int j
, n
= gimple_call_num_args (stmt
);
1976 for (j
= 0; j
< n
; j
++)
1978 op
= gimple_call_arg (stmt
, j
);
1980 || (REFERENCE_CLASS_P (op
)
1981 && get_base_address (op
)))
1984 op
= gimple_call_lhs (stmt
);
1985 /* Ignore #pragma omp declare simd functions
1986 if they don't have data references in the
1987 call stmt itself. */
1991 || (REFERENCE_CLASS_P (op
)
1992 && get_base_address (op
)))))
1999 /* If dependence analysis will give up due to the limit on the
2000 number of datarefs stop here and fail fatally. */
2001 if (datarefs
->length ()
2002 > (unsigned)param_loop_max_datarefs_for_datadeps
)
2003 return opt_result::failure_at (stmt
, "exceeded param "
2004 "loop-max-datarefs-for-datadeps\n");
2006 return opt_result::success ();
2009 /* Look for SLP-only access groups and turn each individual access into its own
2012 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo
)
2015 struct data_reference
*dr
;
2017 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2019 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2020 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2022 gcc_assert (DR_REF (dr
));
2023 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (DR_STMT (dr
));
2025 /* Check if the load is a part of an interleaving chain. */
2026 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2028 stmt_vec_info first_element
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
2029 unsigned int group_size
= DR_GROUP_SIZE (first_element
);
2031 /* Check if SLP-only groups. */
2032 if (!STMT_SLP_TYPE (stmt_info
)
2033 && STMT_VINFO_SLP_VECT_ONLY (first_element
))
2035 /* Dissolve the group. */
2036 STMT_VINFO_SLP_VECT_ONLY (first_element
) = false;
2038 stmt_vec_info vinfo
= first_element
;
2041 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (vinfo
);
2042 DR_GROUP_FIRST_ELEMENT (vinfo
) = vinfo
;
2043 DR_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2044 DR_GROUP_SIZE (vinfo
) = 1;
2045 if (STMT_VINFO_STRIDED_P (first_element
))
2046 DR_GROUP_GAP (vinfo
) = 0;
2048 DR_GROUP_GAP (vinfo
) = group_size
- 1;
2056 /* Determine if operating on full vectors for LOOP_VINFO might leave
2057 some scalar iterations still to do. If so, decide how we should
2058 handle those scalar iterations. The possibilities are:
2060 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2063 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2064 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2065 LOOP_VINFO_PEELING_FOR_NITER == false
2067 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2068 to handle the remaining scalar iterations. In this case:
2070 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2071 LOOP_VINFO_PEELING_FOR_NITER == true
2073 There are two choices:
2075 (2a) Consider vectorizing the epilogue loop at the same VF as the
2076 main loop, but using partial vectors instead of full vectors.
2079 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2081 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2084 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2086 When FOR_EPILOGUE_P is true, make this determination based on the
2087 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2088 based on the assumption that LOOP_VINFO is the main loop. The caller
2089 has made sure that the number of iterations is set appropriately for
2090 this value of FOR_EPILOGUE_P. */
2093 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo
,
2094 bool for_epilogue_p
)
2096 /* Determine whether there would be any scalar iterations left over. */
2097 bool need_peeling_or_partial_vectors_p
2098 = vect_need_peeling_or_partial_vectors_p (loop_vinfo
);
2100 /* Decide whether to vectorize the loop with partial vectors. */
2101 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2102 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2103 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2104 && need_peeling_or_partial_vectors_p
)
2106 /* For partial-vector-usage=1, try to push the handling of partial
2107 vectors to the epilogue, with the main loop continuing to operate
2110 ??? We could then end up failing to use partial vectors if we
2111 decide to peel iterations into a prologue, and if the main loop
2112 then ends up processing fewer than VF iterations. */
2113 if (param_vect_partial_vector_usage
== 1
2114 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2115 && !vect_known_niters_smaller_than_vf (loop_vinfo
))
2116 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
2118 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
2121 if (dump_enabled_p ())
2123 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2124 dump_printf_loc (MSG_NOTE
, vect_location
,
2125 "operating on partial vectors%s.\n",
2126 for_epilogue_p
? " for epilogue loop" : "");
2128 dump_printf_loc (MSG_NOTE
, vect_location
,
2129 "operating only on full vectors%s.\n",
2130 for_epilogue_p
? " for epilogue loop" : "");
2135 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2136 gcc_assert (orig_loop_vinfo
);
2137 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2138 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2139 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)));
2142 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2143 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2145 /* Check that the loop processes at least one full vector. */
2146 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2147 tree scalar_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
2148 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2149 return opt_result::failure_at (vect_location
,
2150 "loop does not have enough iterations"
2151 " to support vectorization.\n");
2153 /* If we need to peel an extra epilogue iteration to handle data
2154 accesses with gaps, check that there are enough scalar iterations
2157 The check above is redundant with this one when peeling for gaps,
2158 but the distinction is useful for diagnostics. */
2159 tree scalar_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2160 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2161 && known_lt (wi::to_widest (scalar_nitersm1
), vf
))
2162 return opt_result::failure_at (vect_location
,
2163 "loop does not have enough iterations"
2164 " to support peeling for gaps.\n");
2167 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
2168 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
2169 && need_peeling_or_partial_vectors_p
);
2171 return opt_result::success ();
/* NOTE(review): this chunk is a lossy, line-split extraction of a GCC
   loop-vectorizer source file.  The leading integers on many lines are
   the original file's line numbers, single statements are split across
   several lines, and some original lines (braces, labels, and
   early-return checks such as "if (!ok) return ok;") are absent from
   the extract.  The text below is therefore not compilable as-is;
   restore it against the upstream file before building.  The comments
   added here only annotate the visible structure.  */
2174 /* Function vect_analyze_loop_2.
2176 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2177 for it. The different analyses will record information in the
2178 loop_vec_info struct. */
/* Parameters, as visible below: LOOP_VINFO - the loop being analyzed;
   FATAL - out-flag forwarded to vect_analyze_data_refs and
   vect_mark_stmts_to_be_vectorized (presumably marking unrecoverable
   failures - confirm against upstream); N_STMTS - statement count
   forwarded to vect_analyze_slp.  Returns an opt_result: success, or
   failure_at with a dump diagnostic.  */
2180 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
, unsigned *n_stmts
)
2182 opt_result ok
= opt_result::success ();
2184 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
2185 poly_uint64 min_vf
= 2;
2186 loop_vec_info orig_loop_vinfo
= NULL
;
2188 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2189 loop_vec_info of the first vectorized loop. */
2190 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2191 orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2193 orig_loop_vinfo
= loop_vinfo
;
2194 gcc_assert (orig_loop_vinfo
);
2196 /* The first group of checks is independent of the vector size. */
2199 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
2200 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
2201 return opt_result::failure_at (vect_location
,
2202 "not vectorized: simd if(0)\n");
2204 /* Find all data references in the loop (which correspond to vdefs/vuses)
2205 and analyze their evolution in the loop. */
2207 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2209 /* Gather the data references and count stmts in the loop. */
2210 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
2213 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
2214 &LOOP_VINFO_DATAREFS (loop_vinfo
),
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2220 "not vectorized: loop contains function "
2221 "calls or data references that cannot "
2225 loop_vinfo
->shared
->save_datarefs ();
2228 loop_vinfo
->shared
->check_datarefs ();
2230 /* Analyze the data references and also adjust the minimal
2231 vectorization factor according to the loads and stores. */
2233 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2238 "bad data references.\n");
2242 /* Classify all cross-iteration scalar data-flow cycles.
2243 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2244 vect_analyze_scalar_cycles (loop_vinfo
);
2246 vect_pattern_recog (loop_vinfo
);
2248 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
2250 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2251 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2253 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2258 "bad data access.\n");
2262 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2264 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2269 "unexpected pattern.\n");
2273 /* While the rest of the analysis below depends on it in some way. */
2276 /* Analyze data dependences between the data-refs in the loop
2277 and adjust the maximum vectorization factor according to
2279 FORNOW: fail at the first data dependence that we encounter. */
2281 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2286 "bad data dependence.\n");
/* Dependence analysis may have lowered MAX_VF below the minimum VF the
   loads/stores require; that combination cannot be vectorized.  */
2289 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2290 && maybe_lt (max_vf
, min_vf
))
2291 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2292 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2294 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2299 "can't determine vectorization factor.\n");
2302 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2303 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2304 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2306 /* Compute the scalar iteration cost. */
2307 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
/* Saved so the rollback path at the bottom can undo the SLP-driven VF
   adjustment made by vect_update_vf_for_slp below.  */
2309 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2311 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2312 ok
= vect_analyze_slp (loop_vinfo
, *n_stmts
);
2316 /* If there are any SLP instances mark them as pure_slp. */
2317 bool slp
= vect_make_slp_decision (loop_vinfo
);
2320 /* Find stmts that need to be both vectorized and SLPed. */
2321 vect_detect_hybrid_slp (loop_vinfo
);
2323 /* Update the vectorization factor based on the SLP decision. */
2324 vect_update_vf_for_slp (loop_vinfo
);
2326 /* Optimize the SLP graph with the vectorization factor fixed. */
2327 vect_optimize_slp (loop_vinfo
);
2329 /* Gather the loads reachable from the SLP graph entries. */
2330 vect_gather_slp_loads (loop_vinfo
);
2333 bool saved_can_use_partial_vectors_p
2334 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
);
2336 /* We don't expect to have to roll back to anything other than an empty
2338 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2340 /* This is the point where we can re-start analysis with SLP forced off. */
2343 /* Now the vectorization factor is final. */
2344 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2345 gcc_assert (known_ne (vectorization_factor
, 0U));
2347 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2349 dump_printf_loc (MSG_NOTE
, vect_location
,
2350 "vectorization_factor = ");
2351 dump_dec (MSG_NOTE
, vectorization_factor
);
2352 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2353 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2356 /* Analyze the alignment of the data-refs in the loop.
2357 Fail if a data reference is found that cannot be vectorized. */
2359 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2364 "bad data alignment.\n");
2368 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2369 It is important to call pruning after vect_analyze_data_ref_accesses,
2370 since we use grouping information gathered by interleaving analysis. */
2371 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2375 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2376 vectorization, since we do not want to add extra peeling or
2377 add versioning for alignment. */
2378 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2379 /* This pass will decide on using loop versioning and/or loop peeling in
2380 order to enhance the alignment of data references in the loop. */
2381 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2387 /* Analyze operations in the SLP instances. Note this may
2388 remove unsupported SLP instances which makes the above
2389 SLP kind detection invalid. */
2390 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2391 vect_slp_analyze_operations (loop_vinfo
);
2392 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2394 ok
= opt_result::failure_at (vect_location
,
2395 "unsupported SLP instances\n");
/* Heuristic: if every SLP instance could instead be implemented with
   load/store-lanes instructions, SLP is cancelled further below and
   the analysis is re-tried with SLP disabled.  */
2399 /* Check whether any load in ALL SLP instances is possibly permuted. */
2400 slp_tree load_node
, slp_root
;
2402 slp_instance instance
;
2403 bool can_use_lanes
= true;
2404 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), x
, instance
)
2406 slp_root
= SLP_INSTANCE_TREE (instance
);
2407 int group_size
= SLP_TREE_LANES (slp_root
);
2408 tree vectype
= SLP_TREE_VECTYPE (slp_root
);
2409 bool loads_permuted
= false;
2410 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2412 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2415 stmt_vec_info load_info
;
2416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2417 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2419 loads_permuted
= true;
2424 /* If the loads and stores can be handled with load/store-lane
2425 instructions record it and move on to the next instance. */
2427 && SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2428 && vect_store_lanes_supported (vectype
, group_size
, false))
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2432 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2433 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2434 /* Use SLP for strided accesses (or if we can't
2436 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2437 || ! vect_load_lanes_supported
2438 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2439 DR_GROUP_SIZE (stmt_vinfo
), false))
2444 = can_use_lanes
&& i
== SLP_INSTANCE_LOADS (instance
).length ();
2446 if (can_use_lanes
&& dump_enabled_p ())
2447 dump_printf_loc (MSG_NOTE
, vect_location
,
2448 "SLP instance %p can use load/store-lanes\n",
2453 can_use_lanes
= false;
2458 /* If all SLP instances can use load/store-lanes abort SLP and try again
2459 with SLP disabled. */
2462 ok
= opt_result::failure_at (vect_location
,
2463 "Built SLP cancelled: can use "
2464 "load/store-lanes\n");
2465 if (dump_enabled_p ())
2466 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2467 "Built SLP cancelled: all SLP instances support "
2468 "load/store-lanes\n");
2473 /* Dissolve SLP-only groups. */
2474 vect_dissolve_slp_only_groups (loop_vinfo
);
2476 /* Scan all the remaining operations in the loop that are not subject
2477 to SLP and make sure they are vectorizable. */
2478 ok
= vect_analyze_loop_operations (loop_vinfo
);
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2483 "bad operation or unsupported loop bound.\n");
2487 /* For now, we don't expect to mix both masking and length approaches for one
2488 loop, disable it if both are recorded. */
2489 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2490 && !LOOP_VINFO_MASKS (loop_vinfo
).is_empty ()
2491 && !LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2495 "can't vectorize a loop with partial vectors"
2496 " because we don't expect to mix different"
2497 " approaches with partial vectors for the"
2499 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2502 /* If we still have the option of using partial vectors,
2503 check whether we can generate the necessary loop controls. */
2504 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2505 && !vect_verify_full_masking (loop_vinfo
)
2506 && !vect_verify_loop_lens (loop_vinfo
))
2507 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2509 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2510 to be able to handle fewer than VF scalars, or needs to have a lower VF
2511 than the main loop. */
2512 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2513 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2514 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2515 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)))
2516 return opt_result::failure_at (vect_location
,
2517 "Vectorization factor too high for"
2518 " epilogue loop.\n");
2520 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2521 assuming that the loop will be used as a main loop. We will redo
2522 this analysis later if we instead decide to use the loop as an
2524 ok
= vect_determine_partial_vectors_and_peeling (loop_vinfo
, false);
2528 /* Check the costings of the loop make vectorizing worthwhile. */
2529 res
= vect_analyze_loop_costing (loop_vinfo
);
2532 ok
= opt_result::failure_at (vect_location
,
2533 "Loop costings may not be worthwhile.\n");
2537 return opt_result::failure_at (vect_location
,
2538 "Loop costings not worthwhile.\n");
2540 /* If an epilogue loop is required make sure we can create one. */
2541 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2542 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2546 if (!vect_can_advance_ivs_p (loop_vinfo
)
2547 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2548 single_exit (LOOP_VINFO_LOOP
2551 ok
= opt_result::failure_at (vect_location
,
2552 "not vectorized: can't create required "
2558 /* During peeling, we need to check if number of loop iterations is
2559 enough for both peeled prolog loop and vector loop. This check
2560 can be merged along with threshold check of loop versioning, so
2561 increase threshold for this case if necessary.
2563 If we are analyzing an epilogue we still want to check what its
2564 versioning threshold would be. If we decide to vectorize the epilogues we
2565 will want to use the lowest versioning threshold of all epilogues and main
2566 loop. This will enable us to enter a vectorized epilogue even when
2567 versioning the loop. We can't simply check whether the epilogue requires
2568 versioning though since we may have skipped some versioning checks when
2569 analyzing the epilogue. For instance, checks for alias versioning will be
2570 skipped when dealing with epilogues as we assume we already checked them
2571 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2572 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo
))
2574 poly_uint64 niters_th
= 0;
2575 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2577 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2579 /* Niters for peeled prolog loop. */
2580 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2582 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2583 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2584 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2587 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2590 /* Niters for at least one iteration of vectorized loop. */
2591 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2592 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2593 /* One additional iteration because of peeling for gap. */
2594 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2597 /* Use the same condition as vect_transform_loop to decide when to use
2598 the cost to determine a versioning threshold. */
2599 if (vect_apply_runtime_profitability_check_p (loop_vinfo
)
2600 && ordered_p (th
, niters_th
))
2601 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
2603 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2606 gcc_assert (known_eq (vectorization_factor
,
2607 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2609 /* Ok to vectorize! */
2610 return opt_result::success ();
/* Rollback path: reached when one of the analyses above failed after
   SLP was enabled.  The code below decides whether re-trying without
   SLP can help, then restores the pre-SLP state (vectorization factor,
   SLP instances, per-stmt SLP types, cost data, rgroup controls and
   assorted flags) before re-starting the analysis.  The label for this
   path is among the lines missing from this extract.  */
2613 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2616 /* Try again with SLP forced off but if we didn't do any SLP there is
2617 no point in re-trying. */
2621 /* If there are reduction chains re-trying will fail anyway. */
2622 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2625 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2626 via interleaving or lane instructions. */
2627 slp_instance instance
;
2630 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2632 stmt_vec_info vinfo
;
2633 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2634 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2636 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2637 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2638 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2639 if (! vect_store_lanes_supported (vectype
, size
, false)
2640 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2641 && ! vect_grouped_store_supported (vectype
, size
))
2642 return opt_result::failure_at (vinfo
->stmt
,
2643 "unsupported grouped store\n");
2644 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2646 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2647 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2648 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2649 size
= DR_GROUP_SIZE (vinfo
);
2650 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2651 if (! vect_load_lanes_supported (vectype
, size
, false)
2652 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2654 return opt_result::failure_at (vinfo
->stmt
,
2655 "unsupported grouped load\n");
2659 if (dump_enabled_p ())
2660 dump_printf_loc (MSG_NOTE
, vect_location
,
2661 "re-trying with SLP disabled\n");
2663 /* Roll back state appropriately. No SLP this time. */
2665 /* Restore vectorization factor as it were without SLP. */
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2667 /* Free the SLP instances. */
2668 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2669 vect_free_slp_instance (instance
);
2670 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2671 /* Reset SLP type to loop_vect on all stmts. */
2672 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2674 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2675 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2676 !gsi_end_p (si
); gsi_next (&si
))
2678 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2679 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2680 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2681 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
2683 /* vectorizable_reduction adjusts reduction stmt def-types,
2684 restore them to that of the PHI. */
2685 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
2686 = STMT_VINFO_DEF_TYPE (stmt_info
);
2687 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2688 (STMT_VINFO_REDUC_DEF (stmt_info
)))
2689 = STMT_VINFO_DEF_TYPE (stmt_info
);
2692 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2693 !gsi_end_p (si
); gsi_next (&si
))
2695 if (is_gimple_debug (gsi_stmt (si
)))
2697 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2698 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2699 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2701 stmt_vec_info pattern_stmt_info
2702 = STMT_VINFO_RELATED_STMT (stmt_info
);
2703 if (STMT_VINFO_SLP_VECT_ONLY (pattern_stmt_info
))
2704 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
2706 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2707 STMT_SLP_TYPE (pattern_stmt_info
) = loop_vect
;
2708 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2709 !gsi_end_p (pi
); gsi_next (&pi
))
2710 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2715 /* Free optimized alias test DDRS. */
2716 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2717 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2718 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2719 /* Reset target cost data. */
2720 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2721 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
2722 = init_cost (LOOP_VINFO_LOOP (loop_vinfo
));
2723 /* Reset accumulated rgroup information. */
2724 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
));
2725 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo
));
2726 /* Reset assorted flags. */
2727 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2728 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2729 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2730 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2731 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2732 = saved_can_use_partial_vectors_p
;
/* NOTE(review): lossy line-split extract; the leading integers are
   original line numbers and some lines (e.g. the "return true/false"
   bodies of the comparisons below) are missing.  Restore from the
   upstream file before compiling.  */
2737 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2738 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2739 OLD_LOOP_VINFO is better unless something specifically indicates
2742 Note that this deliberately isn't a partial order. */
/* Heuristic comparison of two candidate vectorization plans for the
   same loop.  Preference order, as visible below: a VF equal to
   loop->simdlen wins outright; then the lower estimated inside-cost
   per scalar iteration (inside_cost / VF, with VFs clamped to the
   likely maximum iteration count); finally the lower outside
   (prologue/epilogue) cost.  */
2745 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo
,
2746 loop_vec_info old_loop_vinfo
)
2748 struct loop
*loop
= LOOP_VINFO_LOOP (new_loop_vinfo
);
2749 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo
) == loop
);
2751 poly_int64 new_vf
= LOOP_VINFO_VECT_FACTOR (new_loop_vinfo
);
2752 poly_int64 old_vf
= LOOP_VINFO_VECT_FACTOR (old_loop_vinfo
);
2754 /* Always prefer a VF of loop->simdlen over any other VF. */
2757 bool new_simdlen_p
= known_eq (new_vf
, loop
->simdlen
);
2758 bool old_simdlen_p
= known_eq (old_vf
, loop
->simdlen
);
2759 if (new_simdlen_p
!= old_simdlen_p
)
2760 return new_simdlen_p
;
2763 /* Limit the VFs to what is likely to be the maximum number of iterations,
2764 to handle cases in which at least one loop_vinfo is fully-masked. */
2765 HOST_WIDE_INT estimated_max_niter
= likely_max_stmt_executions_int (loop
);
2766 if (estimated_max_niter
!= -1)
2768 if (known_le (estimated_max_niter
, new_vf
))
2769 new_vf
= estimated_max_niter
;
2770 if (known_le (estimated_max_niter
, old_vf
))
2771 old_vf
= estimated_max_niter
;
2774 /* Check whether the (fractional) cost per scalar iteration is lower
2775 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
/* Cross-multiplied to avoid division on poly_int values.  */
2776 poly_int64 rel_new
= new_loop_vinfo
->vec_inside_cost
* old_vf
;
2777 poly_int64 rel_old
= old_loop_vinfo
->vec_inside_cost
* new_vf
;
2779 HOST_WIDE_INT est_rel_new_min
2780 = estimated_poly_value (rel_new
, POLY_VALUE_MIN
);
2781 HOST_WIDE_INT est_rel_new_max
2782 = estimated_poly_value (rel_new
, POLY_VALUE_MAX
);
2784 HOST_WIDE_INT est_rel_old_min
2785 = estimated_poly_value (rel_old
, POLY_VALUE_MIN
);
2786 HOST_WIDE_INT est_rel_old_max
2787 = estimated_poly_value (rel_old
, POLY_VALUE_MAX
);
2789 /* Check first if we can make out an unambigous total order from the minimum
2790 and maximum estimates. */
2791 if (est_rel_new_min
< est_rel_old_min
2792 && est_rel_new_max
< est_rel_old_max
)
2794 else if (est_rel_old_min
< est_rel_new_min
2795 && est_rel_old_max
< est_rel_new_max
)
2797 /* When old_loop_vinfo uses a variable vectorization factor,
2798 we know that it has a lower cost for at least one runtime VF.
2799 However, we don't know how likely that VF is.
2801 One option would be to compare the costs for the estimated VFs.
2802 The problem is that that can put too much pressure on the cost
2803 model. E.g. if the estimated VF is also the lowest possible VF,
2804 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2805 for the estimated VF, we'd then choose new_loop_vinfo even
2806 though (a) new_loop_vinfo might not actually be better than
2807 old_loop_vinfo for that VF and (b) it would be significantly
2808 worse at larger VFs.
2810 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2811 no more expensive than old_loop_vinfo even after doubling the
2812 estimated old_loop_vinfo VF. For all but trivial loops, this
2813 ensures that we only pick new_loop_vinfo if it is significantly
2814 better than old_loop_vinfo at the estimated VF. */
2816 if (est_rel_old_min
!= est_rel_new_min
2817 || est_rel_old_max
!= est_rel_new_max
)
2819 HOST_WIDE_INT est_rel_new_likely
2820 = estimated_poly_value (rel_new
, POLY_VALUE_LIKELY
)
;
2821 HOST_WIDE_INT est_rel_old_likely
2822 = estimated_poly_value (rel_old
, POLY_VALUE_LIKELY
);
2824 return est_rel_new_likely
* 2 <= est_rel_old_likely
;
2827 /* If there's nothing to choose between the loop bodies, see whether
2828 there's a difference in the prologue and epilogue costs. */
2829 if (new_loop_vinfo
->vec_outside_cost
!= old_loop_vinfo
->vec_outside_cost
)
2830 return new_loop_vinfo
->vec_outside_cost
< old_loop_vinfo
->vec_outside_cost
;
/* NOTE(review): lossy line-split extract; the return statements of this
   wrapper are among the missing lines.  Restore from upstream before
   compiling.  */
2835 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2836 true if we should. */
/* Thin wrapper around vect_better_loop_vinfo_p that additionally emits
   a dump note naming the preferred vector mode when the replacement
   wins the comparison.  */
2839 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo
,
2840 loop_vec_info old_loop_vinfo
)
2842 if (!vect_better_loop_vinfo_p (new_loop_vinfo
, old_loop_vinfo
))
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_NOTE
, vect_location
,
2847 "***** Preferring vector mode %s to vector mode %s\n",
2848 GET_MODE_NAME (new_loop_vinfo
->vector_mode
),
2849 GET_MODE_NAME (old_loop_vinfo
->vector_mode
));
/* NOTE(review): lossy line-split extract; early-return and
   failure-path lines are partially missing.  Restore from upstream
   before compiling.  */
2853 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2854 try to reanalyze it as a main loop. Return the loop_vinfo on success
2855 and null on failure. */
/* Re-runs loop-form analysis and vect_analyze_loop_2 on the same loop
   with LOOP_VINFO's vector mode, producing a fresh loop_vec_info that
   is not marked as an epilogue.  The new info is deleted on analysis
   failure; on success it is marked vectorizable and returned.  */
2857 static loop_vec_info
2858 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo
, unsigned int *n_stmts
)
2860 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE
, vect_location
,
2865 "***** Reanalyzing as a main loop with vector mode %s\n",
2866 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2868 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2869 vec_info_shared
*shared
= loop_vinfo
->shared
;
2870 opt_loop_vec_info main_loop_vinfo
= vect_analyze_loop_form (loop
, shared
);
2871 gcc_assert (main_loop_vinfo
);
/* Force the re-analysis to use the same vector mode as the original
   attempt so the comparison between the two is meaningful.  */
2873 main_loop_vinfo
->vector_mode
= loop_vinfo
->vector_mode
;
2876 bool res
= vect_analyze_loop_2 (main_loop_vinfo
, fatal
, n_stmts
);
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_NOTE
, vect_location
,
2882 "***** Failed to analyze main loop with vector"
2884 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2885 delete main_loop_vinfo
;
2888 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo
) = 1;
2889 return main_loop_vinfo
;
2892 /* Function vect_analyze_loop.
2894 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2895 for it. The different analyses will record information in the
2896 loop_vec_info struct. */
2898 vect_analyze_loop (class loop
*loop
, vec_info_shared
*shared
)
2900 auto_vector_modes vector_modes
;
2902 /* Autodetect first vector size we try. */
2903 unsigned int autovec_flags
2904 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
2905 loop
->simdlen
!= 0);
2906 unsigned int mode_i
= 0;
2908 DUMP_VECT_SCOPE ("analyze_loop_nest");
2910 if (loop_outer (loop
)
2911 && loop_vec_info_for_loop (loop_outer (loop
))
2912 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2913 return opt_loop_vec_info::failure_at (vect_location
,
2914 "outer-loop already vectorized.\n");
2916 if (!find_loop_nest (loop
, &shared
->loop_nest
))
2917 return opt_loop_vec_info::failure_at
2919 "not vectorized: loop nest containing two or more consecutive inner"
2920 " loops cannot be vectorized\n");
2922 unsigned n_stmts
= 0;
2923 machine_mode autodetected_vector_mode
= VOIDmode
;
2924 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2925 machine_mode next_vector_mode
= VOIDmode
;
2926 poly_uint64 lowest_th
= 0;
2927 unsigned vectorized_loops
= 0;
2928 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
2929 && !unlimited_cost_model (loop
));
2931 bool vect_epilogues
= false;
2932 opt_result res
= opt_result::success ();
2933 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
2936 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2937 opt_loop_vec_info loop_vinfo
= vect_analyze_loop_form (loop
, shared
);
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2942 "bad loop form.\n");
2943 gcc_checking_assert (first_loop_vinfo
== NULL
);
2946 loop_vinfo
->vector_mode
= next_vector_mode
;
2950 /* When pick_lowest_cost_p is true, we should in principle iterate
2951 over all the loop_vec_infos that LOOP_VINFO could replace and
2952 try to vectorize LOOP_VINFO under the same conditions.
2953 E.g. when trying to replace an epilogue loop, we should vectorize
2954 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2955 to replace the main loop, we should vectorize LOOP_VINFO as a main
2958 However, autovectorize_vector_modes is usually sorted as follows:
2960 - Modes that naturally produce lower VFs usually follow modes that
2961 naturally produce higher VFs.
2963 - When modes naturally produce the same VF, maskable modes
2964 usually follow unmaskable ones, so that the maskable mode
2965 can be used to vectorize the epilogue of the unmaskable mode.
2967 This order is preferred because it leads to the maximum
2968 epilogue vectorization opportunities. Targets should only use
2969 a different order if they want to make wide modes available while
2970 disparaging them relative to earlier, smaller modes. The assumption
2971 in that case is that the wider modes are more expensive in some
2972 way that isn't reflected directly in the costs.
2974 There should therefore be few interesting cases in which
2975 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2976 treated as a standalone loop, and ends up being genuinely cheaper
2977 than FIRST_LOOP_VINFO. */
2979 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = first_loop_vinfo
;
2981 res
= vect_analyze_loop_2 (loop_vinfo
, fatal
, &n_stmts
);
2983 autodetected_vector_mode
= loop_vinfo
->vector_mode
;
2984 if (dump_enabled_p ())
2987 dump_printf_loc (MSG_NOTE
, vect_location
,
2988 "***** Analysis succeeded with vector mode %s\n",
2989 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2991 dump_printf_loc (MSG_NOTE
, vect_location
,
2992 "***** Analysis failed with vector mode %s\n",
2993 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2999 while (mode_i
< vector_modes
.length ()
3000 && vect_chooses_same_modes_p (loop_vinfo
, vector_modes
[mode_i
]))
3002 if (dump_enabled_p ())
3003 dump_printf_loc (MSG_NOTE
, vect_location
,
3004 "***** The result for vector mode %s would"
3006 GET_MODE_NAME (vector_modes
[mode_i
]));
3012 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
3015 /* Once we hit the desired simdlen for the first time,
3016 discard any previous attempts. */
3018 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3020 delete first_loop_vinfo
;
3021 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3022 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = NULL
;
3025 else if (pick_lowest_cost_p
&& first_loop_vinfo
)
3027 /* Keep trying to roll back vectorization attempts while the
3028 loop_vec_infos they produced were worse than this one. */
3029 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3030 while (!vinfos
.is_empty ()
3031 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3033 gcc_assert (vect_epilogues
);
3034 delete vinfos
.pop ();
3036 if (vinfos
.is_empty ()
3037 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3039 loop_vec_info main_loop_vinfo
3040 = vect_reanalyze_as_main_loop (loop_vinfo
, &n_stmts
);
3041 if (main_loop_vinfo
== loop_vinfo
)
3043 delete first_loop_vinfo
;
3044 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3046 else if (main_loop_vinfo
3047 && vect_joust_loop_vinfos (main_loop_vinfo
,
3050 delete first_loop_vinfo
;
3051 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3054 = opt_loop_vec_info::success (main_loop_vinfo
);
3057 delete main_loop_vinfo
;
3061 if (first_loop_vinfo
== NULL
)
3063 first_loop_vinfo
= loop_vinfo
;
3064 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3066 else if (vect_epilogues
3067 /* For now only allow one epilogue loop. */
3068 && first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3070 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3071 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3072 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3073 || maybe_ne (lowest_th
, 0U));
3074 /* Keep track of the known smallest versioning
3076 if (ordered_p (lowest_th
, th
))
3077 lowest_th
= ordered_min (lowest_th
, th
);
3082 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3085 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3086 enabled, SIMDUID is not set, it is the innermost loop and we have
3087 either already found the loop's SIMDLEN or there was no SIMDLEN to
3089 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3090 vect_epilogues
= (!simdlen
3091 && loop
->inner
== NULL
3092 && param_vect_epilogues_nomask
3093 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3095 /* For now only allow one epilogue loop, but allow
3096 pick_lowest_cost_p to replace it. */
3097 && (first_loop_vinfo
->epilogue_vinfos
.is_empty ()
3098 || pick_lowest_cost_p
));
3100 /* Commit to first_loop_vinfo if we have no reason to try
3102 if (!simdlen
&& !vect_epilogues
&& !pick_lowest_cost_p
)
3108 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3111 gcc_checking_assert (first_loop_vinfo
== NULL
);
3116 /* Handle the case that the original loop can use partial
3117 vectorization, but want to only adopt it for the epilogue.
3118 The retry should be in the same mode as original. */
3121 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
))
3123 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
3124 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
));
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE
, vect_location
,
3127 "***** Re-trying analysis with same vector mode"
3128 " %s for epilogue with partial vectors.\n",
3129 GET_MODE_NAME (loop_vinfo
->vector_mode
));
3133 if (mode_i
< vector_modes
.length ()
3134 && VECTOR_MODE_P (autodetected_vector_mode
)
3135 && (related_vector_mode (vector_modes
[mode_i
],
3136 GET_MODE_INNER (autodetected_vector_mode
))
3137 == autodetected_vector_mode
)
3138 && (related_vector_mode (autodetected_vector_mode
,
3139 GET_MODE_INNER (vector_modes
[mode_i
]))
3140 == vector_modes
[mode_i
]))
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_NOTE
, vect_location
,
3144 "***** Skipping vector mode %s, which would"
3145 " repeat the analysis for %s\n",
3146 GET_MODE_NAME (vector_modes
[mode_i
]),
3147 GET_MODE_NAME (autodetected_vector_mode
));
3151 if (mode_i
== vector_modes
.length ()
3152 || autodetected_vector_mode
== VOIDmode
)
3155 /* Try the next biggest vector size. */
3156 next_vector_mode
= vector_modes
[mode_i
++];
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_NOTE
, vect_location
,
3159 "***** Re-trying analysis with vector mode %s\n",
3160 GET_MODE_NAME (next_vector_mode
));
3163 if (first_loop_vinfo
)
3165 loop
->aux
= (loop_vec_info
) first_loop_vinfo
;
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE
, vect_location
,
3168 "***** Choosing vector mode %s\n",
3169 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
3170 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3171 return first_loop_vinfo
;
3174 return opt_loop_vec_info::propagate_failure (res
);
3177 /* Return true if there is an in-order reduction function for CODE, storing
3178 it in *REDUC_FN if so. */
3181 fold_left_reduction_fn (tree_code code
, internal_fn
*reduc_fn
)
3186 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
3194 /* Function reduction_fn_for_scalar_code
3197 CODE - tree_code of a reduction operations.
3200 REDUC_FN - the corresponding internal function to be used to reduce the
3201 vector of partial results into a single scalar result, or IFN_LAST
3202 if the operation is a supported reduction operation, but does not have
3203 such an internal function.
3205 Return FALSE if CODE currently cannot be vectorized as reduction. */
3208 reduction_fn_for_scalar_code (enum tree_code code
, internal_fn
*reduc_fn
)
3213 *reduc_fn
= IFN_REDUC_MAX
;
3217 *reduc_fn
= IFN_REDUC_MIN
;
3221 *reduc_fn
= IFN_REDUC_PLUS
;
3225 *reduc_fn
= IFN_REDUC_AND
;
3229 *reduc_fn
= IFN_REDUC_IOR
;
3233 *reduc_fn
= IFN_REDUC_XOR
;
3238 *reduc_fn
= IFN_LAST
;
3246 /* If there is a neutral value X such that SLP reduction NODE would not
3247 be affected by the introduction of additional X elements, return that X,
3248 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3249 is the vector type that would hold element X. REDUC_CHAIN is true if
3250 the SLP statements perform a single reduction, false if each statement
3251 performs an independent reduction. */
3254 neutral_op_for_slp_reduction (slp_tree slp_node
, tree vector_type
,
3255 tree_code code
, bool reduc_chain
)
3257 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
3258 stmt_vec_info stmt_vinfo
= stmts
[0];
3259 tree scalar_type
= TREE_TYPE (vector_type
);
3260 class loop
*loop
= gimple_bb (stmt_vinfo
->stmt
)->loop_father
;
3265 case WIDEN_SUM_EXPR
:
3272 return build_zero_cst (scalar_type
);
3275 return build_one_cst (scalar_type
);
3278 return build_all_ones_cst (scalar_type
);
3282 /* For MIN/MAX the initial values are neutral. A reduction chain
3283 has only a single initial value, so that value is neutral for
3286 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
,
3287 loop_preheader_edge (loop
));
3295 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3296 STMT is printed with a message MSG. */
3299 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3301 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
3304 /* Return true if we need an in-order reduction for operation CODE
3305 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3306 overflow must wrap. */
3309 needs_fold_left_reduction_p (tree type
, tree_code code
)
3311 /* CHECKME: check for !flag_finite_math_only too? */
3312 if (SCALAR_FLOAT_TYPE_P (type
))
3320 return !flag_associative_math
;
3323 if (INTEGRAL_TYPE_P (type
))
3325 if (!operation_no_trapping_overflow (type
, code
))
3330 if (SAT_FIXED_POINT_TYPE_P (type
))
3336 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3337 has a handled computation expression. Store the main reduction
3338 operation in *CODE. */
3341 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3342 tree loop_arg
, enum tree_code
*code
,
3343 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
3345 auto_bitmap visited
;
3346 tree lookfor
= PHI_RESULT (phi
);
3348 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
3349 while (USE_FROM_PTR (curr
) != loop_arg
)
3350 curr
= op_iter_next_use (&curri
);
3351 curri
.i
= curri
.numops
;
3354 path
.safe_push (std::make_pair (curri
, curr
));
3355 tree use
= USE_FROM_PTR (curr
);
3358 gimple
*def
= SSA_NAME_DEF_STMT (use
);
3359 if (gimple_nop_p (def
)
3360 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
3365 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
3369 curr
= op_iter_next_use (&curri
);
3370 /* Skip already visited or non-SSA operands (from iterating
3372 while (curr
!= NULL_USE_OPERAND_P
3373 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3374 || ! bitmap_set_bit (visited
,
3376 (USE_FROM_PTR (curr
)))));
3378 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
3379 if (curr
== NULL_USE_OPERAND_P
)
3384 if (gimple_code (def
) == GIMPLE_PHI
)
3385 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
3387 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
3388 while (curr
!= NULL_USE_OPERAND_P
3389 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3390 || ! bitmap_set_bit (visited
,
3392 (USE_FROM_PTR (curr
)))))
3393 curr
= op_iter_next_use (&curri
);
3394 if (curr
== NULL_USE_OPERAND_P
)
3399 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3401 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
3403 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
3404 FOR_EACH_VEC_ELT (path
, i
, x
)
3405 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
3406 dump_printf (MSG_NOTE
, "\n");
3409 /* Check whether the reduction path detected is valid. */
3410 bool fail
= path
.length () == 0;
3414 for (unsigned i
= 1; i
< path
.length (); ++i
)
3416 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
3417 tree op
= USE_FROM_PTR (path
[i
].second
);
3418 if (! is_gimple_assign (use_stmt
)
3419 /* The following make sure we can compute the operand index
3420 easily plus it mostly disallows chaining via COND_EXPR condition
3422 || (gimple_assign_rhs1_ptr (use_stmt
) != path
[i
].second
->use
3423 && (gimple_num_ops (use_stmt
) <= 2
3424 || gimple_assign_rhs2_ptr (use_stmt
) != path
[i
].second
->use
)
3425 && (gimple_num_ops (use_stmt
) <= 3
3426 || gimple_assign_rhs3_ptr (use_stmt
) != path
[i
].second
->use
)))
3431 /* Check there's only a single stmt the op is used on. For the
3432 not value-changing tail and the last stmt allow out-of-loop uses.
3433 ??? We could relax this and handle arbitrary live stmts by
3434 forcing a scalar epilogue for example. */
3435 imm_use_iterator imm_iter
;
3436 gimple
*op_use_stmt
;
3438 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
)
3439 if (!is_gimple_debug (op_use_stmt
)
3440 && (*code
!= ERROR_MARK
3441 || flow_bb_inside_loop_p (loop
, gimple_bb (op_use_stmt
))))
3443 /* We want to allow x + x but not x < 1 ? x : 2. */
3444 if (is_gimple_assign (op_use_stmt
)
3445 && gimple_assign_rhs_code (op_use_stmt
) == COND_EXPR
)
3447 use_operand_p use_p
;
3448 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
3459 tree_code use_code
= gimple_assign_rhs_code (use_stmt
);
3460 if (use_code
== MINUS_EXPR
)
3462 use_code
= PLUS_EXPR
;
3463 /* Track whether we negate the reduction value each iteration. */
3464 if (gimple_assign_rhs2 (use_stmt
) == op
)
3467 if (CONVERT_EXPR_CODE_P (use_code
)
3468 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt
)),
3469 TREE_TYPE (gimple_assign_rhs1 (use_stmt
))))
3471 else if (*code
== ERROR_MARK
)
3474 sign
= TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt
)));
3476 else if (use_code
!= *code
)
3481 else if ((use_code
== MIN_EXPR
3482 || use_code
== MAX_EXPR
)
3483 && sign
!= TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt
))))
3489 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
3493 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3494 tree loop_arg
, enum tree_code code
)
3496 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3497 enum tree_code code_
;
3498 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
3504 /* Function vect_is_simple_reduction
3506 (1) Detect a cross-iteration def-use cycle that represents a simple
3507 reduction computation. We look for the following pattern:
3512 a2 = operation (a3, a1)
3519 a2 = operation (a3, a1)
3522 1. operation is commutative and associative and it is safe to
3523 change the order of the computation
3524 2. no uses for a2 in the loop (a2 is used out of the loop)
3525 3. no uses of a1 in the loop besides the reduction operation
3526 4. no uses of a1 outside the loop.
3528 Conditions 1,4 are tested here.
3529 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3531 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3534 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3538 inner loop (def of a3)
3541 (4) Detect condition expressions, ie:
3542 for (int i = 0; i < N; i++)
3548 static stmt_vec_info
3549 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3550 bool *double_reduc
, bool *reduc_chain_p
)
3552 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
3553 gimple
*phi_use_stmt
= NULL
;
3554 imm_use_iterator imm_iter
;
3555 use_operand_p use_p
;
3557 *double_reduc
= false;
3558 *reduc_chain_p
= false;
3559 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
3561 tree phi_name
= PHI_RESULT (phi
);
3562 /* ??? If there are no uses of the PHI result the inner loop reduction
3563 won't be detected as possibly double-reduction by vectorizable_reduction
3564 because that tries to walk the PHI arg from the preheader edge which
3565 can be constant. See PR60382. */
3566 if (has_zero_uses (phi_name
))
3568 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
3569 unsigned nphi_def_loop_uses
= 0;
3570 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3572 gimple
*use_stmt
= USE_STMT (use_p
);
3573 if (is_gimple_debug (use_stmt
))
3576 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3580 "intermediate value used outside loop.\n");
3585 nphi_def_loop_uses
++;
3586 phi_use_stmt
= use_stmt
;
3589 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
3590 if (TREE_CODE (latch_def
) != SSA_NAME
)
3592 if (dump_enabled_p ())
3593 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3594 "reduction: not ssa_name: %T\n", latch_def
);
3598 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
3600 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
3603 bool nested_in_vect_loop
3604 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
3605 unsigned nlatch_def_loop_uses
= 0;
3606 auto_vec
<gphi
*, 3> lcphis
;
3607 bool inner_loop_of_double_reduc
= false;
3608 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
3610 gimple
*use_stmt
= USE_STMT (use_p
);
3611 if (is_gimple_debug (use_stmt
))
3613 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3614 nlatch_def_loop_uses
++;
3617 /* We can have more than one loop-closed PHI. */
3618 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
3619 if (nested_in_vect_loop
3620 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
3621 == vect_double_reduction_def
))
3622 inner_loop_of_double_reduc
= true;
3626 /* If we are vectorizing an inner reduction we are executing that
3627 in the original order only in case we are not dealing with a
3628 double reduction. */
3629 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
3631 if (dump_enabled_p ())
3632 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
3633 "detected nested cycle: ");
3634 return def_stmt_info
;
3637 /* If this isn't a nested cycle or if the nested cycle reduction value
3638 is used ouside of the inner loop we cannot handle uses of the reduction
3640 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
3642 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3644 "reduction used in loop.\n");
3648 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3649 defined in the inner loop. */
3650 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
3652 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
3653 if (gimple_phi_num_args (def_stmt
) != 1
3654 || TREE_CODE (op1
) != SSA_NAME
)
3656 if (dump_enabled_p ())
3657 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3658 "unsupported phi node definition.\n");
3663 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
3664 if (gimple_bb (def1
)
3665 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3667 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
3668 && is_gimple_assign (def1
)
3669 && is_a
<gphi
*> (phi_use_stmt
)
3670 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
3672 if (dump_enabled_p ())
3673 report_vect_op (MSG_NOTE
, def_stmt
,
3674 "detected double reduction: ");
3676 *double_reduc
= true;
3677 return def_stmt_info
;
3683 /* Look for the expression computing latch_def from then loop PHI result. */
3684 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3685 enum tree_code code
;
3686 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
3689 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
3690 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
3691 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
3693 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3694 reduction chain for which the additional restriction is that
3695 all operations in the chain are the same. */
3696 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
3698 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
3699 for (i
= path
.length () - 1; i
>= 1; --i
)
3701 gimple
*stmt
= USE_STMT (path
[i
].second
);
3702 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
3703 STMT_VINFO_REDUC_IDX (stmt_info
)
3704 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (stmt
);
3705 enum tree_code stmt_code
= gimple_assign_rhs_code (stmt
);
3706 bool leading_conversion
= (CONVERT_EXPR_CODE_P (stmt_code
)
3707 && (i
== 1 || i
== path
.length () - 1));
3708 if ((stmt_code
!= code
&& !leading_conversion
)
3709 /* We can only handle the final value in epilogue
3710 generation for reduction chains. */
3711 || (i
!= 1 && !has_single_use (gimple_assign_lhs (stmt
))))
3712 is_slp_reduc
= false;
3713 /* For reduction chains we support a trailing/leading
3714 conversions. We do not store those in the actual chain. */
3715 if (leading_conversion
)
3717 reduc_chain
.safe_push (stmt_info
);
3719 if (is_slp_reduc
&& reduc_chain
.length () > 1)
3721 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
3723 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
3724 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
3726 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
3727 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
3729 /* Save the chain for further analysis in SLP detection. */
3730 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
3731 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
3733 *reduc_chain_p
= true;
3734 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_NOTE
, vect_location
,
3736 "reduction: detected reduction chain\n");
3738 else if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE
, vect_location
,
3740 "reduction: detected reduction\n");
3742 return def_stmt_info
;
3745 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_NOTE
, vect_location
,
3747 "reduction: unknown pattern\n");
3752 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3753 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3754 or -1 if not known. */
3757 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo
, int peel_iters_prologue
)
3759 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3760 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) || peel_iters_prologue
== -1)
3762 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_NOTE
, vect_location
,
3764 "cost model: epilogue peel iters set to vf/2 "
3765 "because loop iterations are unknown .\n");
3766 return assumed_vf
/ 2;
3770 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
3771 peel_iters_prologue
= MIN (niters
, peel_iters_prologue
);
3772 int peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
3773 /* If we need to peel for gaps, but no peeling is required, we have to
3774 peel VF iterations. */
3775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !peel_iters_epilogue
)
3776 peel_iters_epilogue
= assumed_vf
;
3777 return peel_iters_epilogue
;
3781 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3783 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
3784 int *peel_iters_epilogue
,
3785 stmt_vector_for_cost
*scalar_cost_vec
,
3786 stmt_vector_for_cost
*prologue_cost_vec
,
3787 stmt_vector_for_cost
*epilogue_cost_vec
)
3791 *peel_iters_epilogue
3792 = vect_get_peel_iters_epilogue (loop_vinfo
, peel_iters_prologue
);
3794 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
3796 /* If peeled iterations are known but number of scalar loop
3797 iterations are unknown, count a taken branch per peeled loop. */
3798 if (peel_iters_prologue
> 0)
3799 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3800 NULL
, NULL_TREE
, 0, vect_prologue
);
3801 if (*peel_iters_epilogue
> 0)
3802 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
3803 NULL
, NULL_TREE
, 0, vect_epilogue
);
3806 stmt_info_for_cost
*si
;
3808 if (peel_iters_prologue
)
3809 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3810 retval
+= record_stmt_cost (prologue_cost_vec
,
3811 si
->count
* peel_iters_prologue
,
3812 si
->kind
, si
->stmt_info
, si
->misalign
,
3814 if (*peel_iters_epilogue
)
3815 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3816 retval
+= record_stmt_cost (epilogue_cost_vec
,
3817 si
->count
* *peel_iters_epilogue
,
3818 si
->kind
, si
->stmt_info
, si
->misalign
,
3824 /* Function vect_estimate_min_profitable_iters
3826 Return the number of iterations required for the vector version of the
3827 loop to be profitable relative to the cost of the scalar version of the
3830 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3831 of iterations for vectorization. -1 value means loop vectorization
3832 is not profitable. This returned value may be used for dynamic
3833 profitability check.
3835 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3836 for static check against estimated number of iterations. */
3839 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
3840 int *ret_min_profitable_niters
,
3841 int *ret_min_profitable_estimate
)
3843 int min_profitable_iters
;
3844 int min_profitable_estimate
;
3845 int peel_iters_prologue
;
3846 int peel_iters_epilogue
;
3847 unsigned vec_inside_cost
= 0;
3848 int vec_outside_cost
= 0;
3849 unsigned vec_prologue_cost
= 0;
3850 unsigned vec_epilogue_cost
= 0;
3851 int scalar_single_iter_cost
= 0;
3852 int scalar_outside_cost
= 0;
3853 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3854 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3855 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3857 /* Cost model disabled. */
3858 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
3860 if (dump_enabled_p ())
3861 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
3862 *ret_min_profitable_niters
= 0;
3863 *ret_min_profitable_estimate
= 0;
3867 /* Requires loop versioning tests to handle misalignment. */
3868 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
3870 /* FIXME: Make cost depend on complexity of individual check. */
3871 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
3872 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
, vector_stmt
,
3873 NULL
, NULL_TREE
, 0, vect_prologue
);
3874 if (dump_enabled_p ())
3875 dump_printf (MSG_NOTE
,
3876 "cost model: Adding cost of checks for loop "
3877 "versioning to treat misalignment.\n");
3880 /* Requires loop versioning with alias checks. */
3881 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3883 /* FIXME: Make cost depend on complexity of individual check. */
3884 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
3885 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
, vector_stmt
,
3886 NULL
, NULL_TREE
, 0, vect_prologue
);
3887 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
3889 /* Count LEN - 1 ANDs and LEN comparisons. */
3890 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
* 2 - 1,
3891 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
3892 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
3895 /* Count LEN - 1 ANDs and LEN comparisons. */
3896 unsigned int nstmts
= len
* 2 - 1;
3897 /* +1 for each bias that needs adding. */
3898 for (unsigned int i
= 0; i
< len
; ++i
)
3899 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
3901 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, nstmts
,
3902 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
3904 if (dump_enabled_p ())
3905 dump_printf (MSG_NOTE
,
3906 "cost model: Adding cost of checks for loop "
3907 "versioning aliasing.\n");
3910 /* Requires loop versioning with niter checks. */
3911 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
3913 /* FIXME: Make cost depend on complexity of individual check. */
3914 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, vector_stmt
,
3915 NULL
, NULL_TREE
, 0, vect_prologue
);
3916 if (dump_enabled_p ())
3917 dump_printf (MSG_NOTE
,
3918 "cost model: Adding cost of checks for loop "
3919 "versioning niters.\n");
3922 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3923 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
3924 NULL
, NULL_TREE
, 0, vect_prologue
);
3926 /* Count statements in scalar loop. Using this as scalar cost for a single
3929 TODO: Add outer loop support.
3931 TODO: Consider assigning different costs to different scalar
3934 scalar_single_iter_cost
3935 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
3937 /* Add additional cost for the peeled instructions in prologue and epilogue
3938 loop. (For fully-masked loops there will be no peeling.)
3940 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3941 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3943 TODO: Build an expression that represents peel_iters for prologue and
3944 epilogue to be used in a run-time test. */
3946 bool prologue_need_br_taken_cost
= false;
3947 bool prologue_need_br_not_taken_cost
= false;
3949 /* Calculate peel_iters_prologue. */
3950 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
3951 peel_iters_prologue
= 0;
3954 peel_iters_prologue
= assumed_vf
/ 2;
3955 if (dump_enabled_p ())
3956 dump_printf (MSG_NOTE
, "cost model: "
3957 "prologue peel iters set to vf/2.\n");
3959 /* If peeled iterations are unknown, count a taken branch and a not taken
3960 branch per peeled loop. Even if scalar loop iterations are known,
3961 vector iterations are not known since peeled prologue iterations are
3962 not known. Hence guards remain the same. */
3963 prologue_need_br_taken_cost
= true;
3964 prologue_need_br_not_taken_cost
= true;
3968 peel_iters_prologue
= npeel
;
3969 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_prologue
> 0)
3970 /* If peeled iterations are known but number of scalar loop
3971 iterations are unknown, count a taken branch per peeled loop. */
3972 prologue_need_br_taken_cost
= true;
3975 bool epilogue_need_br_taken_cost
= false;
3976 bool epilogue_need_br_not_taken_cost
= false;
3978 /* Calculate peel_iters_epilogue. */
3979 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
3980 /* We need to peel exactly one iteration for gaps. */
3981 peel_iters_epilogue
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
3984 /* If peeling for alignment is unknown, loop bound of main loop
3986 peel_iters_epilogue
= assumed_vf
/ 2;
3987 if (dump_enabled_p ())
3988 dump_printf (MSG_NOTE
, "cost model: "
3989 "epilogue peel iters set to vf/2 because "
3990 "peeling for alignment is unknown.\n");
3992 /* See the same reason above in peel_iters_prologue calculation. */
3993 epilogue_need_br_taken_cost
= true;
3994 epilogue_need_br_not_taken_cost
= true;
3998 peel_iters_epilogue
= vect_get_peel_iters_epilogue (loop_vinfo
, npeel
);
3999 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_epilogue
> 0)
4000 /* If peeled iterations are known but number of scalar loop
4001 iterations are unknown, count a taken branch per peeled loop. */
4002 epilogue_need_br_taken_cost
= true;
4005 stmt_info_for_cost
*si
;
4007 /* Add costs associated with peel_iters_prologue. */
4008 if (peel_iters_prologue
)
4009 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4011 (void) add_stmt_cost (loop_vinfo
, target_cost_data
,
4012 si
->count
* peel_iters_prologue
, si
->kind
,
4013 si
->stmt_info
, si
->vectype
, si
->misalign
,
4017 /* Add costs associated with peel_iters_epilogue. */
4018 if (peel_iters_epilogue
)
4019 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4021 (void) add_stmt_cost (loop_vinfo
, target_cost_data
,
4022 si
->count
* peel_iters_epilogue
, si
->kind
,
4023 si
->stmt_info
, si
->vectype
, si
->misalign
,
4027 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4029 if (prologue_need_br_taken_cost
)
4030 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
4031 NULL
, NULL_TREE
, 0, vect_prologue
);
4033 if (prologue_need_br_not_taken_cost
)
4034 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1,
4035 cond_branch_not_taken
, NULL
, NULL_TREE
, 0,
4038 if (epilogue_need_br_taken_cost
)
4039 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
4040 NULL
, NULL_TREE
, 0, vect_epilogue
);
4042 if (epilogue_need_br_not_taken_cost
)
4043 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1,
4044 cond_branch_not_taken
, NULL
, NULL_TREE
, 0,
4047 /* Take care of special costs for rgroup controls of partial vectors. */
4048 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
4050 /* Calculate how many masks we need to generate. */
4051 unsigned int num_masks
= 0;
4052 rgroup_controls
*rgm
;
4053 unsigned int num_vectors_m1
;
4054 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), num_vectors_m1
, rgm
)
4056 num_masks
+= num_vectors_m1
+ 1;
4057 gcc_assert (num_masks
> 0);
4059 /* In the worst case, we need to generate each mask in the prologue
4060 and in the loop body. One of the loop body mask instructions
4061 replaces the comparison in the scalar loop, and since we don't
4062 count the scalar comparison against the scalar body, we shouldn't
4063 count that vector instruction against the vector body either.
4065 Sometimes we can use unpacks instead of generating prologue
4066 masks and sometimes the prologue mask will fold to a constant,
4067 so the actual prologue cost might be smaller. However, it's
4068 simpler and safer to use the worst-case cost; if this ends up
4069 being the tie-breaker between vectorizing or not, then it's
4070 probably better not to vectorize. */
4071 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, num_masks
,
4072 vector_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
4073 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, num_masks
- 1,
4074 vector_stmt
, NULL
, NULL_TREE
, 0, vect_body
);
4076 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
4078 /* Referring to the functions vect_set_loop_condition_partial_vectors
4079 and vect_set_loop_controls_directly, we need to generate each
4080 length in the prologue and in the loop body if required. Although
4081 there are some possible optimizations, we consider the worst case
4084 bool niters_known_p
= LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
);
4086 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
4087 && !vect_known_niters_smaller_than_vf (loop_vinfo
));
4089 /* Calculate how many statements to be added. */
4090 unsigned int prologue_stmts
= 0;
4091 unsigned int body_stmts
= 0;
4093 rgroup_controls
*rgc
;
4094 unsigned int num_vectors_m1
;
4095 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), num_vectors_m1
, rgc
)
4098 /* May need one SHIFT for nitems_total computation. */
4099 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
4100 if (nitems
!= 1 && !niters_known_p
)
4101 prologue_stmts
+= 1;
4103 /* May need one MAX and one MINUS for wrap around. */
4104 if (vect_rgroup_iv_might_wrap_p (loop_vinfo
, rgc
))
4105 prologue_stmts
+= 2;
4107 /* Need one MAX and one MINUS for each batch limit excepting for
4109 prologue_stmts
+= num_vectors_m1
* 2;
4111 unsigned int num_vectors
= num_vectors_m1
+ 1;
4113 /* Need to set up lengths in prologue, only one MIN required
4114 for each since start index is zero. */
4115 prologue_stmts
+= num_vectors
;
4117 /* Each may need two MINs and one MINUS to update lengths in body
4118 for next iteration. */
4120 body_stmts
+= 3 * num_vectors
;
4123 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, prologue_stmts
,
4124 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
4125 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, body_stmts
,
4126 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_body
);
4129 /* FORNOW: The scalar outside cost is incremented in one of the
4132 1. The vectorizer checks for alignment and aliasing and generates
4133 a condition that allows dynamic vectorization. A cost model
4134 check is ANDED with the versioning condition. Hence scalar code
4135 path now has the added cost of the versioning check.
4137 if (cost > th & versioning_check)
4140 Hence run-time scalar is incremented by not-taken branch cost.
4142 2. The vectorizer then checks if a prologue is required. If the
4143 cost model check was not done before during versioning, it has to
4144 be done before the prologue check.
4147 prologue = scalar_iters
4152 if (prologue == num_iters)
4155 Hence the run-time scalar cost is incremented by a taken branch,
4156 plus a not-taken branch, plus a taken branch cost.
4158 3. The vectorizer then checks if an epilogue is required. If the
4159 cost model check was not done before during prologue check, it
4160 has to be done with the epilogue check.
4166 if (prologue == num_iters)
4169 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4172 Hence the run-time scalar cost should be incremented by 2 taken
4175 TODO: The back end may reorder the BBS's differently and reverse
4176 conditions/branch directions. Change the estimates below to
4177 something more reasonable. */
4179 /* If the number of iterations is known and we do not do versioning, we can
4180 decide whether to vectorize at compile time. Hence the scalar version
4181 do not carry cost model guard costs. */
4182 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
4183 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4185 /* Cost model check occurs at versioning. */
4186 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4187 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
4190 /* Cost model check occurs at prologue generation. */
4191 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
4192 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
4193 + vect_get_stmt_cost (cond_branch_not_taken
);
4194 /* Cost model check occurs at epilogue generation. */
4196 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
4200 /* Complete the target-specific cost calculations. */
4201 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
4202 &vec_inside_cost
, &vec_epilogue_cost
);
4204 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
4206 /* Stash the costs so that we can compare two loop_vec_infos. */
4207 loop_vinfo
->vec_inside_cost
= vec_inside_cost
;
4208 loop_vinfo
->vec_outside_cost
= vec_outside_cost
;
4210 if (dump_enabled_p ())
4212 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4213 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
4215 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
4217 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
4219 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
4220 scalar_single_iter_cost
);
4221 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
4222 scalar_outside_cost
);
4223 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
4225 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
4226 peel_iters_prologue
);
4227 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
4228 peel_iters_epilogue
);
4231 /* Calculate number of iterations required to make the vector version
4232 profitable, relative to the loop bodies only. The following condition
4234 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4236 SIC = scalar iteration cost, VIC = vector iteration cost,
4237 VOC = vector outside cost, VF = vectorization factor,
4238 NPEEL = prologue iterations + epilogue iterations,
4239 SOC = scalar outside cost for run time cost model check. */
4241 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
4243 if (saving_per_viter
<= 0)
4245 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
4246 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
4247 "vectorization did not happen for a simd loop");
4249 if (dump_enabled_p ())
4250 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4251 "cost model: the vector iteration cost = %d "
4252 "divided by the scalar iteration cost = %d "
4253 "is greater or equal to the vectorization factor = %d"
4255 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
4256 *ret_min_profitable_niters
= -1;
4257 *ret_min_profitable_estimate
= -1;
4261 /* ??? The "if" arm is written to handle all cases; see below for what
4262 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4263 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4265 /* Rewriting the condition above in terms of the number of
4266 vector iterations (vniters) rather than the number of
4267 scalar iterations (niters) gives:
4269 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4271 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4273 For integer N, X and Y when X > 0:
4275 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4276 int outside_overhead
= (vec_outside_cost
4277 - scalar_single_iter_cost
* peel_iters_prologue
4278 - scalar_single_iter_cost
* peel_iters_epilogue
4279 - scalar_outside_cost
);
4280 /* We're only interested in cases that require at least one
4281 vector iteration. */
4282 int min_vec_niters
= 1;
4283 if (outside_overhead
> 0)
4284 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
4286 if (dump_enabled_p ())
4287 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
4290 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4292 /* Now that we know the minimum number of vector iterations,
4293 find the minimum niters for which the scalar cost is larger:
4295 SIC * niters > VIC * vniters + VOC - SOC
4297 We know that the minimum niters is no more than
4298 vniters * VF + NPEEL, but it might be (and often is) less
4299 than that if a partial vector iteration is cheaper than the
4300 equivalent scalar code. */
4301 int threshold
= (vec_inside_cost
* min_vec_niters
4303 - scalar_outside_cost
);
4305 min_profitable_iters
= 1;
4307 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
4310 /* Convert the number of vector iterations into a number of
4311 scalar iterations. */
4312 min_profitable_iters
= (min_vec_niters
* assumed_vf
4313 + peel_iters_prologue
4314 + peel_iters_epilogue
);
4318 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
4320 - vec_inside_cost
* peel_iters_prologue
4321 - vec_inside_cost
* peel_iters_epilogue
);
4322 if (min_profitable_iters
<= 0)
4323 min_profitable_iters
= 0;
4326 min_profitable_iters
/= saving_per_viter
;
4328 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
4329 <= (((int) vec_inside_cost
* min_profitable_iters
)
4330 + (((int) vec_outside_cost
- scalar_outside_cost
)
4332 min_profitable_iters
++;
4336 if (dump_enabled_p ())
4337 dump_printf (MSG_NOTE
,
4338 " Calculated minimum iters for profitability: %d\n",
4339 min_profitable_iters
);
4341 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
4342 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
4343 /* We want the vectorized loop to execute at least once. */
4344 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
4345 else if (min_profitable_iters
< peel_iters_prologue
)
4346 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4347 vectorized loop executes at least once. */
4348 min_profitable_iters
= peel_iters_prologue
;
4350 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE
, vect_location
,
4352 " Runtime profitability threshold = %d\n",
4353 min_profitable_iters
);
4355 *ret_min_profitable_niters
= min_profitable_iters
;
4357 /* Calculate number of iterations required to make the vector version
4358 profitable, relative to the loop bodies only.
4360 Non-vectorized variant is SIC * niters and it must win over vector
4361 variant on the expected loop trip count. The following condition must hold true:
4362 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4364 if (vec_outside_cost
<= 0)
4365 min_profitable_estimate
= 0;
4366 /* ??? This "else if" arm is written to handle all cases; see below for
4367 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4368 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4370 /* This is a repeat of the code above, but with + SOC rather
4372 int outside_overhead
= (vec_outside_cost
4373 - scalar_single_iter_cost
* peel_iters_prologue
4374 - scalar_single_iter_cost
* peel_iters_epilogue
4375 + scalar_outside_cost
);
4376 int min_vec_niters
= 1;
4377 if (outside_overhead
> 0)
4378 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
4380 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4382 int threshold
= (vec_inside_cost
* min_vec_niters
4384 + scalar_outside_cost
);
4385 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
4388 min_profitable_estimate
= (min_vec_niters
* assumed_vf
4389 + peel_iters_prologue
4390 + peel_iters_epilogue
);
4394 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
4396 - vec_inside_cost
* peel_iters_prologue
4397 - vec_inside_cost
* peel_iters_epilogue
)
4398 / ((scalar_single_iter_cost
* assumed_vf
)
4401 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
4402 if (dump_enabled_p ())
4403 dump_printf_loc (MSG_NOTE
, vect_location
,
4404 " Static estimate profitability threshold = %d\n",
4405 min_profitable_estimate
);
4407 *ret_min_profitable_estimate
= min_profitable_estimate
;
4410 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4411 vector elements (not bits) for a vector with NELT elements. */
4413 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
4414 vec_perm_builder
*sel
)
4416 /* The encoding is a single stepped pattern. Any wrap-around is handled
4417 by vec_perm_indices. */
4418 sel
->new_vector (nelt
, 1, 3);
4419 for (unsigned int i
= 0; i
< 3; i
++)
4420 sel
->quick_push (i
+ offset
);
4423 /* Checks whether the target supports whole-vector shifts for vectors of mode
4424 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4425 it supports vec_perm_const with masks for all necessary shift amounts. */
4427 have_whole_vector_shift (machine_mode mode
)
4429 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
4432 /* Variable-length vectors should be handled via the optab. */
4434 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
4437 vec_perm_builder sel
;
4438 vec_perm_indices indices
;
4439 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
4441 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
4442 indices
.new_vector (sel
, 2, nelt
);
4443 if (!can_vec_perm_const_p (mode
, indices
, false))
4449 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4450 functions. Design better to avoid maintenance issues. */
4452 /* Function vect_model_reduction_cost.
4454 Models cost for a reduction operation, including the vector ops
4455 generated within the strip-mine loop in some cases, the initial
4456 definition before the loop, and the epilogue code that must be generated. */
4459 vect_model_reduction_cost (loop_vec_info loop_vinfo
,
4460 stmt_vec_info stmt_info
, internal_fn reduc_fn
,
4461 vect_reduction_type reduction_type
,
4462 int ncopies
, stmt_vector_for_cost
*cost_vec
)
4464 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
;
4465 enum tree_code code
;
4469 class loop
*loop
= NULL
;
4472 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4474 /* Condition reductions generate two reductions in the loop. */
4475 if (reduction_type
== COND_REDUCTION
)
4478 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4479 mode
= TYPE_MODE (vectype
);
4480 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4482 code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
4484 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
4485 /* No extra instructions are needed in the prologue. The loop body
4486 operations are costed in vectorizable_condition. */
4488 else if (reduction_type
== FOLD_LEFT_REDUCTION
)
4490 /* No extra instructions needed in the prologue. */
4493 if (reduc_fn
!= IFN_LAST
)
4494 /* Count one reduction-like operation per vector. */
4495 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
4496 stmt_info
, 0, vect_body
);
4499 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4500 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
4501 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
4502 vec_to_scalar
, stmt_info
, 0,
4504 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
4505 scalar_stmt
, stmt_info
, 0,
4511 /* Add in cost for initial definition.
4512 For cond reduction we have four vectors: initial index, step,
4513 initial result of the data reduction, initial value of the index
4515 int prologue_stmts
= reduction_type
== COND_REDUCTION
? 4 : 1;
4516 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
4517 scalar_to_vec
, stmt_info
, 0,
4521 /* Determine cost of epilogue code.
4523 We have a reduction operator that will reduce the vector in one statement.
4524 Also requires scalar extract. */
4526 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
4528 if (reduc_fn
!= IFN_LAST
)
4530 if (reduction_type
== COND_REDUCTION
)
4532 /* An EQ stmt and an COND_EXPR stmt. */
4533 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4534 vector_stmt
, stmt_info
, 0,
4536 /* Reduction of the max index and a reduction of the found
4538 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4539 vec_to_scalar
, stmt_info
, 0,
4541 /* A broadcast of the max value. */
4542 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4543 scalar_to_vec
, stmt_info
, 0,
4548 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
4549 stmt_info
, 0, vect_epilogue
);
4550 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4551 vec_to_scalar
, stmt_info
, 0,
4555 else if (reduction_type
== COND_REDUCTION
)
4557 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
4558 /* Extraction of scalar elements. */
4559 epilogue_cost
+= record_stmt_cost (cost_vec
,
4560 2 * estimated_nunits
,
4561 vec_to_scalar
, stmt_info
, 0,
4563 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4564 epilogue_cost
+= record_stmt_cost (cost_vec
,
4565 2 * estimated_nunits
- 3,
4566 scalar_stmt
, stmt_info
, 0,
4569 else if (reduction_type
== EXTRACT_LAST_REDUCTION
4570 || reduction_type
== FOLD_LEFT_REDUCTION
)
4571 /* No extra instructions need in the epilogue. */
4575 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4577 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info
->stmt
)));
4578 int element_bitsize
= tree_to_uhwi (bitsize
);
4579 int nelements
= vec_size_in_bits
/ element_bitsize
;
4581 if (code
== COND_EXPR
)
4584 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4586 /* We have a whole vector shift available. */
4587 if (optab
!= unknown_optab
4588 && VECTOR_MODE_P (mode
)
4589 && optab_handler (optab
, mode
) != CODE_FOR_nothing
4590 && have_whole_vector_shift (mode
))
4592 /* Final reduction via vector shifts and the reduction operator.
4593 Also requires scalar extract. */
4594 epilogue_cost
+= record_stmt_cost (cost_vec
,
4595 exact_log2 (nelements
) * 2,
4596 vector_stmt
, stmt_info
, 0,
4598 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4599 vec_to_scalar
, stmt_info
, 0,
4603 /* Use extracts and reduction op for final reduction. For N
4604 elements, we have N extracts and N-1 reduction ops. */
4605 epilogue_cost
+= record_stmt_cost (cost_vec
,
4606 nelements
+ nelements
- 1,
4607 vector_stmt
, stmt_info
, 0,
4612 if (dump_enabled_p ())
4613 dump_printf (MSG_NOTE
,
4614 "vect_model_reduction_cost: inside_cost = %d, "
4615 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
4616 prologue_cost
, epilogue_cost
);
/* Function get_initial_def_for_reduction

   Input:
   LOOP_VINFO - the loop being vectorized.
   STMT_VINFO - a stmt that performs a reduction operation in the loop.
   CODE - the reduction operation's tree code.
   INIT_VAL - the initial value of the reduction variable

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
	of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that
	STMT_VINFO performs.  This vector will be used as the initial value
	of the vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add/bit or/xor:    [0,0,...,0,0]
     mult/bit and:      [1,1,...,1,1]
     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add/bit or/xor:    [init_val,0,0,...,0]
     mult/bit and:      [init_val,1,1,...,1]
     min/max/cond_expr: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries), if
   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.

   A cost model should help decide between these two schemes.  */

static tree
get_initial_def_for_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_vinfo,
			       enum tree_code code, tree init_val,
			       tree *adjustment_def)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  tree def_for_init;
  tree init_def;
  /* Identity element of the reduction: dconst0/0 by default, overridden
     below for MULT_EXPR (1) and BIT_AND_EXPR (-1, all ones).  */
  REAL_VALUE_TYPE real_init_val = dconst0;
  int int_init_val = 0;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
	      || SCALAR_FLOAT_TYPE_P (scalar_type));

  /* The reduction stmt is either in LOOP itself or, for nested/double
     reductions, in a loop nested inside LOOP.  */
  gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);

  /* ADJUSTMENT_DEF is NULL when called from
     vect_create_epilog_for_reduction to vectorize double reduction.  */
  if (adjustment_def)
    *adjustment_def = NULL;

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case SAD_EXPR:
    case PLUS_EXPR:
    case MINUS_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case MULT_EXPR:
    case BIT_AND_EXPR:
      {
	if (code == MULT_EXPR)
	  {
	    real_init_val = dconst1;
	    int_init_val = 1;
	  }

	if (code == BIT_AND_EXPR)
	  int_init_val = -1;

	if (SCALAR_FLOAT_TYPE_P (scalar_type))
	  def_for_init = build_real (scalar_type, real_init_val);
	else
	  def_for_init = build_int_cst (scalar_type, int_init_val);

	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
	  {
	    /* Option1: the first element is '0' or '1' as well.  */
	    if (!operand_equal_p (def_for_init, init_val, 0))
	      *adjustment_def = init_val;
	    init_def = gimple_build_vector_from_val (&stmts, vectype,
						     def_for_init);
	  }
	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
	  {
	    /* Option2 (variable length): the first element is INIT_VAL.
	       Splat the identity and shift INIT_VAL into lane 0.  */
	    init_def = gimple_build_vector_from_val (&stmts, vectype,
						     def_for_init);
	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
				     vectype, init_def, init_val);
	  }
	else
	  {
	    /* Option2: the first element is INIT_VAL.  */
	    tree_vector_builder elts (vectype, 1, 2);
	    elts.quick_push (init_val);
	    elts.quick_push (def_for_init);
	    init_def = gimple_build_vector (&stmts, &elts);
	  }
      }
      break;

    case MIN_EXPR:
    case MAX_EXPR:
    case COND_EXPR:
      {
	/* INIT_VAL is its own identity for these: splat it.  */
	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
      }
      break;

    default:
      gcc_unreachable ();
    }

  /* Emit any computation of the initial vector on the preheader edge so
     it dominates the loop.  */
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

  return init_def;
}
/* Get at the initial defs for the reduction PHIs in SLP_NODE.
   NUMBER_OF_VECTORS is the number of vector defs to create.
   If NEUTRAL_OP is nonnull, introducing extra elements of that
   value will not change the result.  The created vector defs are
   pushed onto VEC_OPRNDS; REDUC_CHAIN indicates a reduction chain
   (single initial value, remaining lanes padded with NEUTRAL_OP).  */

static void
get_initial_defs_for_reduction (vec_info *vinfo,
				slp_tree slp_node,
				vec<tree> *vec_oprnds,
				unsigned int number_of_vectors,
				bool reduc_chain, tree neutral_op)
{
  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  stmt_vec_info stmt_vinfo = stmts[0];
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  unsigned int group_size = stmts.length ();
  unsigned int i;
  class loop *loop;

  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);

  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
  gcc_assert (loop);
  edge pe = loop_preheader_edge (loop);

  gcc_assert (!reduc_chain || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* For variable-length vectors build GROUP_SIZE-element pieces; they are
     expanded by duplicate_and_interleave below.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;
      stmt_vinfo = stmts[i];

      /* Get the def before the loop.  In reduction chain we have only
	 one initial value.  Else we have as many as PHIs in the group.  */
      if (reduc_chain)
	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
      else if (((vec_oprnds->length () + 1) * nunits
		- number_of_places_left_in_vector >= group_size)
	       && neutral_op)
	/* Every initial value has been consumed already; pad the
	   remaining lanes with the neutral element.  */
	op = neutral_op;
      else
	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
	constant_p = false;

      if (number_of_places_left_in_vector == 0)
	{
	  tree init;
	  if (constant_p && !neutral_op
	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
	    /* Build the vector directly from ELTS.  */
	    init = gimple_build_vector (&ctor_seq, &elts);
	  else if (neutral_op)
	    {
	      /* Build a vector of the neutral value and shift the
		 other elements into place.  */
	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
						   neutral_op);
	      int k = nunits;
	      /* Trailing lanes equal to the neutral value are already in
		 place after the splat.  */
	      while (k > 0 && elts[k - 1] == neutral_op)
		k -= 1;
	      while (k > 0)
		{
		  k -= 1;
		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
				       vector_type, init, elts[k]);
		}
	    }
	  else
	    {
	      /* First time round, duplicate ELTS to fill the
		 required number of vectors.  */
	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
					number_of_vectors, *vec_oprnds);
	      break;
	    }
	  vec_oprnds->quick_push (init);

	  /* Start collecting lanes for the next vector.  */
	  number_of_places_left_in_vector = nunits;
	  elts.new_vector (vector_type, nunits, 1);
	  elts.quick_grow (nunits);
	  constant_p = true;
	}
    }

  /* Emit any stmts computing the initial defs on the preheader edge so
     they dominate the loop.  */
  if (ctor_seq != NULL)
    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
}
/* For a statement STMT_INFO taking part in a reduction operation return
   the stmt_vec_info the meta information is stored on.  */

stmt_vec_info
info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
{
  stmt_info = vect_orig_stmt (stmt_info);
  gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
  /* The meta information lives on a reduction PHI; if STMT_INFO is not
     a PHI of a vectorizable cycle, step to its recorded reduction def.  */
  if (!is_a <gphi *> (stmt_info->stmt)
      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
    stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      /* A single-argument PHI here is presumably the loop-closed PHI of
	 the inner loop; follow its reduction def one step further —
	 TODO confirm against vect_create_epilog_for_reduction.  */
      if (gimple_phi_num_args (phi) == 1)
	stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
    }
  else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      /* For a nested cycle, look at the def flowing in over the
	 preheader edge; if that is a double-reduction PHI it is the
	 one carrying the meta information.  */
      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
      stmt_vec_info info
	= vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
      if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
	stmt_info = info;
    }
  return stmt_info;
}
4913 /* Function vect_create_epilog_for_reduction
4915 Create code at the loop-epilog to finalize the result of a reduction
4918 STMT_INFO is the scalar reduction stmt that is being vectorized.
4919 SLP_NODE is an SLP node containing a group of reduction statements. The
4920 first one in this group is STMT_INFO.
4921 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4922 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4926 1. Completes the reduction def-use cycles.
4927 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4928 by calling the function specified by REDUC_FN if available, or by
4929 other means (whole-vector shifts or a scalar loop).
4930 The function also creates a new phi node at the loop exit to preserve
4931 loop-closed form, as illustrated below.
4933 The flow at the entry to this function:
4936 vec_def = phi <vec_init, null> # REDUCTION_PHI
4937 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4938 s_loop = scalar_stmt # (scalar) STMT_INFO
4940 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4944 The above is transformed by this function into:
4947 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4948 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4949 s_loop = scalar_stmt # (scalar) STMT_INFO
4951 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4952 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4953 v_out2 = reduce <v_out1>
4954 s_out3 = extract_field <v_out2, 0>
4955 s_out4 = adjust_result <s_out3>
4961 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo
,
4962 stmt_vec_info stmt_info
,
4964 slp_instance slp_node_instance
)
4966 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
4967 gcc_assert (reduc_info
->is_reduc_info
);
4968 /* For double reductions we need to get at the inner loop reduction
4969 stmt which has the meta info attached. Our stmt_info is that of the
4970 loop-closed PHI of the inner loop which we remember as
4971 def for the reduction PHI generation. */
4972 bool double_reduc
= false;
4973 stmt_vec_info rdef_info
= stmt_info
;
4974 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
4976 gcc_assert (!slp_node
);
4977 double_reduc
= true;
4978 stmt_info
= loop_vinfo
->lookup_def (gimple_phi_arg_def
4979 (stmt_info
->stmt
, 0));
4980 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
4982 gphi
*reduc_def_stmt
4983 = as_a
<gphi
*> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))->stmt
);
4984 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
4985 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
4988 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4989 basic_block exit_bb
;
4992 gimple
*new_phi
= NULL
, *phi
;
4993 gimple_stmt_iterator exit_gsi
;
4994 tree new_temp
= NULL_TREE
, new_name
, new_scalar_dest
;
4995 gimple
*epilog_stmt
= NULL
;
4999 tree orig_name
, scalar_result
;
5000 imm_use_iterator imm_iter
, phi_imm_iter
;
5001 use_operand_p use_p
, phi_use_p
;
5003 bool nested_in_vect_loop
= false;
5004 auto_vec
<gimple
*> new_phis
;
5006 auto_vec
<tree
> scalar_results
;
5007 unsigned int group_size
= 1, k
;
5008 auto_vec
<gimple
*> phis
;
5009 bool slp_reduc
= false;
5010 bool direct_slp_reduc
;
5011 tree new_phi_result
;
5012 tree induction_index
= NULL_TREE
;
5015 group_size
= SLP_TREE_LANES (slp_node
);
5017 if (nested_in_vect_loop_p (loop
, stmt_info
))
5021 nested_in_vect_loop
= true;
5022 gcc_assert (!slp_node
);
5024 gcc_assert (!nested_in_vect_loop
|| double_reduc
);
5026 vectype
= STMT_VINFO_REDUC_VECTYPE (reduc_info
);
5027 gcc_assert (vectype
);
5028 mode
= TYPE_MODE (vectype
);
5030 tree initial_def
= NULL
;
5031 tree induc_val
= NULL_TREE
;
5032 tree adjustment_def
= NULL
;
5037 /* Get at the scalar def before the loop, that defines the initial value
5038 of the reduction variable. */
5039 initial_def
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
5040 loop_preheader_edge (loop
));
5041 /* Optimize: for induction condition reduction, if we can't use zero
5042 for induc_val, use initial_def. */
5043 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5044 induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
5045 else if (double_reduc
)
5047 else if (nested_in_vect_loop
)
5050 adjustment_def
= STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
);
5057 vec_num
= SLP_TREE_VEC_STMTS (slp_node_instance
->reduc_phis
).length ();
5062 stmt_vec_info reduc_info
= loop_vinfo
->lookup_stmt (reduc_def_stmt
);
5064 ncopies
= STMT_VINFO_VEC_STMTS (reduc_info
).length ();
5067 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5068 which is updated with the current index of the loop for every match of
5069 the original loop's cond_expr (VEC_STMT). This results in a vector
5070 containing the last time the condition passed for that vector lane.
5071 The first match will be a 1 to allow 0 to be used for non-matching
5072 indexes. If there are no matches at all then the vector will be all
5075 PR92772: This algorithm is broken for architectures that support
5076 masked vectors, but do not provide fold_extract_last. */
5077 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
5079 auto_vec
<std::pair
<tree
, bool>, 2> ccompares
;
5080 stmt_vec_info cond_info
= STMT_VINFO_REDUC_DEF (reduc_info
);
5081 cond_info
= vect_stmt_to_vectorize (cond_info
);
5082 while (cond_info
!= reduc_info
)
5084 if (gimple_assign_rhs_code (cond_info
->stmt
) == COND_EXPR
)
5086 gimple
*vec_stmt
= STMT_VINFO_VEC_STMTS (cond_info
)[0];
5087 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
5089 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt
)),
5090 STMT_VINFO_REDUC_IDX (cond_info
) == 2));
5093 = loop_vinfo
->lookup_def (gimple_op (cond_info
->stmt
,
5094 1 + STMT_VINFO_REDUC_IDX
5096 cond_info
= vect_stmt_to_vectorize (cond_info
);
5098 gcc_assert (ccompares
.length () != 0);
5100 tree indx_before_incr
, indx_after_incr
;
5101 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
5102 int scalar_precision
5103 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
5104 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
5105 tree cr_index_vector_type
= get_related_vectype_for_scalar_type
5106 (TYPE_MODE (vectype
), cr_index_scalar_type
,
5107 TYPE_VECTOR_SUBPARTS (vectype
));
5109 /* First we create a simple vector induction variable which starts
5110 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5111 vector size (STEP). */
5113 /* Create a {1,2,3,...} vector. */
5114 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
5116 /* Create a vector of the step value. */
5117 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
5118 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
5120 /* Create an induction variable. */
5121 gimple_stmt_iterator incr_gsi
;
5123 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5124 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
5125 insert_after
, &indx_before_incr
, &indx_after_incr
);
5127 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5128 filled with zeros (VEC_ZERO). */
5130 /* Create a vector of 0s. */
5131 tree zero
= build_zero_cst (cr_index_scalar_type
);
5132 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
5134 /* Create a vector phi node. */
5135 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
5136 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
5137 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
5138 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
5140 /* Now take the condition from the loops original cond_exprs
5141 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5142 every match uses values from the induction variable
5143 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5145 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5146 the new cond_expr (INDEX_COND_EXPR). */
5147 gimple_seq stmts
= NULL
;
5148 for (int i
= ccompares
.length () - 1; i
!= -1; --i
)
5150 tree ccompare
= ccompares
[i
].first
;
5151 if (ccompares
[i
].second
)
5152 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5153 cr_index_vector_type
,
5155 indx_before_incr
, new_phi_tree
);
5157 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5158 cr_index_vector_type
,
5160 new_phi_tree
, indx_before_incr
);
5162 gsi_insert_seq_before (&incr_gsi
, stmts
, GSI_SAME_STMT
);
5164 /* Update the phi with the vec cond. */
5165 induction_index
= new_phi_tree
;
5166 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
5167 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
5170 /* 2. Create epilog code.
5171 The reduction epilog code operates across the elements of the vector
5172 of partial results computed by the vectorized loop.
5173 The reduction epilog code consists of:
5175 step 1: compute the scalar result in a vector (v_out2)
5176 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5177 step 3: adjust the scalar result (s_out3) if needed.
5179 Step 1 can be accomplished using one the following three schemes:
5180 (scheme 1) using reduc_fn, if available.
5181 (scheme 2) using whole-vector shifts, if available.
5182 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5185 The overall epilog code looks like this:
5187 s_out0 = phi <s_loop> # original EXIT_PHI
5188 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5189 v_out2 = reduce <v_out1> # step 1
5190 s_out3 = extract_field <v_out2, 0> # step 2
5191 s_out4 = adjust_result <s_out3> # step 3
5193 (step 3 is optional, and steps 1 and 2 may be combined).
5194 Lastly, the uses of s_out0 are replaced by s_out4. */
5197 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5198 v_out1 = phi <VECT_DEF>
5199 Store them in NEW_PHIS. */
5202 exit_bb
= single_exit (loop
)->dest
;
5203 new_phis
.create (slp_node
? vec_num
: ncopies
);
5204 for (unsigned i
= 0; i
< vec_num
; i
++)
5207 def
= vect_get_slp_vect_def (slp_node
, i
);
5209 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[0]);
5210 for (j
= 0; j
< ncopies
; j
++)
5212 tree new_def
= copy_ssa_name (def
);
5213 phi
= create_phi_node (new_def
, exit_bb
);
5215 new_phis
.quick_push (phi
);
5218 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[j
]);
5219 new_phis
.quick_push (phi
);
5222 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
5226 exit_gsi
= gsi_after_labels (exit_bb
);
5228 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5229 (i.e. when reduc_fn is not available) and in the final adjustment
5230 code (if needed). Also get the original scalar reduction variable as
5231 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5232 represents a reduction pattern), the tree-code and scalar-def are
5233 taken from the original stmt that the pattern-stmt (STMT) replaces.
5234 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5235 are taken from STMT. */
5237 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
5238 if (orig_stmt_info
!= stmt_info
)
5240 /* Reduction pattern */
5241 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
5242 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
5245 scalar_dest
= gimple_assign_lhs (orig_stmt_info
->stmt
);
5246 scalar_type
= TREE_TYPE (scalar_dest
);
5247 scalar_results
.create (group_size
);
5248 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
5249 bitsize
= TYPE_SIZE (scalar_type
);
5251 /* SLP reduction without reduction chain, e.g.,
5255 b2 = operation (b1) */
5256 slp_reduc
= (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
5258 /* True if we should implement SLP_REDUC using native reduction operations
5259 instead of scalar operations. */
5260 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
5262 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
5264 /* In case of reduction chain, e.g.,
5267 a3 = operation (a2),
5269 we may end up with more than one vector result. Here we reduce them to
5271 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) || direct_slp_reduc
)
5273 gimple_seq stmts
= NULL
;
5274 tree first_vect
= PHI_RESULT (new_phis
[0]);
5275 first_vect
= gimple_convert (&stmts
, vectype
, first_vect
);
5276 for (k
= 1; k
< new_phis
.length (); k
++)
5278 gimple
*next_phi
= new_phis
[k
];
5279 tree second_vect
= PHI_RESULT (next_phi
);
5280 second_vect
= gimple_convert (&stmts
, vectype
, second_vect
);
5281 first_vect
= gimple_build (&stmts
, code
, vectype
,
5282 first_vect
, second_vect
);
5284 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5286 new_phi_result
= first_vect
;
5287 new_phis
.truncate (0);
5288 new_phis
.safe_push (SSA_NAME_DEF_STMT (first_vect
));
5290 /* Likewise if we couldn't use a single defuse cycle. */
5291 else if (ncopies
> 1)
5293 gimple_seq stmts
= NULL
;
5294 tree first_vect
= PHI_RESULT (new_phis
[0]);
5295 first_vect
= gimple_convert (&stmts
, vectype
, first_vect
);
5296 for (int k
= 1; k
< ncopies
; ++k
)
5298 tree second_vect
= PHI_RESULT (new_phis
[k
]);
5299 second_vect
= gimple_convert (&stmts
, vectype
, second_vect
);
5300 first_vect
= gimple_build (&stmts
, code
, vectype
,
5301 first_vect
, second_vect
);
5303 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5304 new_phi_result
= first_vect
;
5305 new_phis
.truncate (0);
5306 new_phis
.safe_push (SSA_NAME_DEF_STMT (first_vect
));
5309 new_phi_result
= PHI_RESULT (new_phis
[0]);
5311 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5312 && reduc_fn
!= IFN_LAST
)
5314 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5315 various data values where the condition matched and another vector
5316 (INDUCTION_INDEX) containing all the indexes of those matches. We
5317 need to extract the last matching index (which will be the index with
5318 highest value) and use this to index into the data vector.
5319 For the case where there were no matches, the data vector will contain
5320 all default values and the index vector will be all zeros. */
5322 /* Get various versions of the type of the vector of indexes. */
5323 tree index_vec_type
= TREE_TYPE (induction_index
);
5324 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
5325 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
5326 tree index_vec_cmp_type
= truth_type_for (index_vec_type
);
5328 /* Get an unsigned integer version of the type of the data vector. */
5329 int scalar_precision
5330 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
5331 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
5332 tree vectype_unsigned
= get_same_sized_vectype (scalar_type_unsigned
,
5335 /* First we need to create a vector (ZERO_VEC) of zeros and another
5336 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5337 can create using a MAX reduction and then expanding.
5338 In the case where the loop never made any matches, the max index will
5341 /* Vector of {0, 0, 0,...}. */
5342 tree zero_vec
= build_zero_cst (vectype
);
5344 gimple_seq stmts
= NULL
;
5345 new_phi_result
= gimple_convert (&stmts
, vectype
, new_phi_result
);
5346 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5348 /* Find maximum value from the vector of found indexes. */
5349 tree max_index
= make_ssa_name (index_scalar_type
);
5350 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5351 1, induction_index
);
5352 gimple_call_set_lhs (max_index_stmt
, max_index
);
5353 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
5355 /* Vector of {max_index, max_index, max_index,...}. */
5356 tree max_index_vec
= make_ssa_name (index_vec_type
);
5357 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
5359 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
5361 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
5363 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5364 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5365 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5366 otherwise. Only one value should match, resulting in a vector
5367 (VEC_COND) with one data value and the rest zeros.
5368 In the case where the loop never made any matches, every index will
5369 match, resulting in a vector with all data values (which will all be
5370 the default value). */
5372 /* Compare the max index vector to the vector of found indexes to find
5373 the position of the max value. */
5374 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
5375 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
5378 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
5380 /* Use the compare to choose either values from the data vector or
5382 tree vec_cond
= make_ssa_name (vectype
);
5383 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
5384 vec_compare
, new_phi_result
,
5386 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
5388 /* Finally we need to extract the data value from the vector (VEC_COND)
5389 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5390 reduction, but because this doesn't exist, we can use a MAX reduction
5391 instead. The data value might be signed or a float so we need to cast
5393 In the case where the loop never made any matches, the data values are
5394 all identical, and so will reduce down correctly. */
5396 /* Make the matched data values unsigned. */
5397 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
5398 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
5400 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
5403 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
5405 /* Reduce down to a scalar value. */
5406 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
5407 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5409 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
5410 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
5412 /* Convert the reduced value back to the result type and set as the
5415 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
5417 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5418 scalar_results
.safe_push (new_temp
);
5420 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5421 && reduc_fn
== IFN_LAST
)
5423 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5425 idx_val = induction_index[0];
5426 val = data_reduc[0];
5427 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5428 if (induction_index[i] > idx_val)
5429 val = data_reduc[i], idx_val = induction_index[i];
5432 tree data_eltype
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5433 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
5434 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
5435 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
5436 /* Enforced by vectorizable_reduction, which ensures we have target
5437 support before allowing a conditional reduction on variable-length
5439 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
5440 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
5441 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
5443 tree old_idx_val
= idx_val
;
5445 idx_val
= make_ssa_name (idx_eltype
);
5446 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
5447 build3 (BIT_FIELD_REF
, idx_eltype
,
5449 bitsize_int (el_size
),
5450 bitsize_int (off
)));
5451 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5452 val
= make_ssa_name (data_eltype
);
5453 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
5454 build3 (BIT_FIELD_REF
,
5457 bitsize_int (el_size
),
5458 bitsize_int (off
)));
5459 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5462 tree new_idx_val
= idx_val
;
5463 if (off
!= v_size
- el_size
)
5465 new_idx_val
= make_ssa_name (idx_eltype
);
5466 epilog_stmt
= gimple_build_assign (new_idx_val
,
5469 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5471 tree new_val
= make_ssa_name (data_eltype
);
5472 epilog_stmt
= gimple_build_assign (new_val
,
5479 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5480 idx_val
= new_idx_val
;
5484 /* Convert the reduced value back to the result type and set as the
5486 gimple_seq stmts
= NULL
;
5487 val
= gimple_convert (&stmts
, scalar_type
, val
);
5488 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5489 scalar_results
.safe_push (val
);
5492 /* 2.3 Create the reduction code, using one of the three schemes described
5493 above. In SLP we simply need to extract all the elements from the
5494 vector (without reducing them), so we use scalar shifts. */
5495 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
5501 v_out2 = reduc_expr <v_out1> */
5503 if (dump_enabled_p ())
5504 dump_printf_loc (MSG_NOTE
, vect_location
,
5505 "Reduce using direct vector reduction.\n");
5507 gimple_seq stmts
= NULL
;
5508 new_phi_result
= gimple_convert (&stmts
, vectype
, new_phi_result
);
5509 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5510 new_temp
= gimple_build (&stmts
, as_combined_fn (reduc_fn
),
5511 vec_elem_type
, new_phi_result
);
5512 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5513 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5515 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5518 /* Earlier we set the initial value to be a vector if induc_val
5519 values. Check the result and if it is induc_val then replace
5520 with the original initial value, unless induc_val is
5521 the same as initial_def already. */
5522 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5525 tmp
= make_ssa_name (new_scalar_dest
);
5526 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5527 initial_def
, new_temp
);
5528 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5532 scalar_results
.safe_push (new_temp
);
5534 else if (direct_slp_reduc
)
5536 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5537 with the elements for other SLP statements replaced with the
5538 neutral value. We can then do a normal reduction on each vector. */
5540 /* Enforced by vectorizable_reduction. */
5541 gcc_assert (new_phis
.length () == 1);
5542 gcc_assert (pow2p_hwi (group_size
));
5544 slp_tree orig_phis_slp_node
= slp_node_instance
->reduc_phis
;
5545 vec
<stmt_vec_info
> orig_phis
5546 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node
);
5547 gimple_seq seq
= NULL
;
5549 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5550 and the same element size as VECTYPE. */
5551 tree index
= build_index_vector (vectype
, 0, 1);
5552 tree index_type
= TREE_TYPE (index
);
5553 tree index_elt_type
= TREE_TYPE (index_type
);
5554 tree mask_type
= truth_type_for (index_type
);
5556 /* Create a vector that, for each element, identifies which of
5557 the REDUC_GROUP_SIZE results should use it. */
5558 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
5559 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
5560 build_vector_from_val (index_type
, index_mask
));
5562 /* Get a neutral vector value. This is simply a splat of the neutral
5563 scalar value if we have one, otherwise the initial scalar value
5564 is itself a neutral value. */
5565 tree vector_identity
= NULL_TREE
;
5566 tree neutral_op
= NULL_TREE
;
5569 stmt_vec_info first
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
5571 = neutral_op_for_slp_reduction (slp_node_instance
->reduc_phis
,
5572 vectype
, code
, first
!= NULL
);
5575 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5577 for (unsigned int i
= 0; i
< group_size
; ++i
)
5579 /* If there's no univeral neutral value, we can use the
5580 initial scalar value from the original PHI. This is used
5581 for MIN and MAX reduction, for example. */
5585 = PHI_ARG_DEF_FROM_EDGE (orig_phis
[i
]->stmt
,
5586 loop_preheader_edge (loop
));
5587 scalar_value
= gimple_convert (&seq
, TREE_TYPE (vectype
),
5589 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5593 /* Calculate the equivalent of:
5595 sel[j] = (index[j] == i);
5597 which selects the elements of NEW_PHI_RESULT that should
5598 be included in the result. */
5599 tree compare_val
= build_int_cst (index_elt_type
, i
);
5600 compare_val
= build_vector_from_val (index_type
, compare_val
);
5601 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
5602 index
, compare_val
);
5604 /* Calculate the equivalent of:
5606 vec = seq ? new_phi_result : vector_identity;
5608 VEC is now suitable for a full vector reduction. */
5609 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
5610 sel
, new_phi_result
, vector_identity
);
5612 /* Do the reduction and convert it to the appropriate type. */
5613 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
5614 TREE_TYPE (vectype
), vec
);
5615 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
5616 scalar_results
.safe_push (scalar
);
5618 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
5622 bool reduce_with_shift
;
5625 gcc_assert (slp_reduc
|| new_phis
.length () == 1);
5627 /* See if the target wants to do the final (shift) reduction
5628 in a vector mode of smaller size and first reduce upper/lower
5629 halves against each other. */
5630 enum machine_mode mode1
= mode
;
5631 tree stype
= TREE_TYPE (vectype
);
5632 unsigned nunits
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5633 unsigned nunits1
= nunits
;
5634 if ((mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
5635 && new_phis
.length () == 1)
5637 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
5638 /* For SLP reductions we have to make sure lanes match up, but
5639 since we're doing individual element final reduction reducing
5640 vector width here is even more important.
5641 ??? We can also separate lanes with permutes, for the common
5642 case of power-of-two group-size odd/even extracts would work. */
5643 if (slp_reduc
&& nunits
!= nunits1
)
5645 nunits1
= least_common_multiple (nunits1
, group_size
);
5646 gcc_assert (exact_log2 (nunits1
) != -1 && nunits1
<= nunits
);
5650 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
5651 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
5653 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5655 reduce_with_shift
= have_whole_vector_shift (mode1
);
5656 if (!VECTOR_MODE_P (mode1
))
5657 reduce_with_shift
= false;
5660 optab optab
= optab_for_tree_code (code
, vectype1
, optab_default
);
5661 if (optab_handler (optab
, mode1
) == CODE_FOR_nothing
)
5662 reduce_with_shift
= false;
5665 /* First reduce the vector to the desired vector size we should
5666 do shift reduction on by combining upper and lower halves. */
5667 new_temp
= new_phi_result
;
5668 while (nunits
> nunits1
)
5671 vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5673 unsigned int bitsize
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5675 /* The target has to make sure we support lowpart/highpart
5676 extraction, either via direct vector extract or through
5677 an integer mode punning. */
5679 if (convert_optab_handler (vec_extract_optab
,
5680 TYPE_MODE (TREE_TYPE (new_temp
)),
5681 TYPE_MODE (vectype1
))
5682 != CODE_FOR_nothing
)
5684 /* Extract sub-vectors directly once vec_extract becomes
5685 a conversion optab. */
5686 dst1
= make_ssa_name (vectype1
);
5688 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
5689 build3 (BIT_FIELD_REF
, vectype1
,
5690 new_temp
, TYPE_SIZE (vectype1
),
5692 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5693 dst2
= make_ssa_name (vectype1
);
5695 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
5696 build3 (BIT_FIELD_REF
, vectype1
,
5697 new_temp
, TYPE_SIZE (vectype1
),
5698 bitsize_int (bitsize
)));
5699 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5703 /* Extract via punning to appropriately sized integer mode
5705 tree eltype
= build_nonstandard_integer_type (bitsize
, 1);
5706 tree etype
= build_vector_type (eltype
, 2);
5707 gcc_assert (convert_optab_handler (vec_extract_optab
,
5710 != CODE_FOR_nothing
);
5711 tree tem
= make_ssa_name (etype
);
5712 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
5713 build1 (VIEW_CONVERT_EXPR
,
5715 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5717 tem
= make_ssa_name (eltype
);
5719 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5720 build3 (BIT_FIELD_REF
, eltype
,
5721 new_temp
, TYPE_SIZE (eltype
),
5723 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5724 dst1
= make_ssa_name (vectype1
);
5725 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
5726 build1 (VIEW_CONVERT_EXPR
,
5728 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5729 tem
= make_ssa_name (eltype
);
5731 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5732 build3 (BIT_FIELD_REF
, eltype
,
5733 new_temp
, TYPE_SIZE (eltype
),
5734 bitsize_int (bitsize
)));
5735 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5736 dst2
= make_ssa_name (vectype1
);
5737 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
5738 build1 (VIEW_CONVERT_EXPR
,
5740 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5743 new_temp
= make_ssa_name (vectype1
);
5744 epilog_stmt
= gimple_build_assign (new_temp
, code
, dst1
, dst2
);
5745 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5746 new_phis
[0] = epilog_stmt
;
5749 if (reduce_with_shift
&& !slp_reduc
)
5751 int element_bitsize
= tree_to_uhwi (bitsize
);
5752 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5753 for variable-length vectors and also requires direct target support
5754 for loop reductions. */
5755 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5756 int nelements
= vec_size_in_bits
/ element_bitsize
;
5757 vec_perm_builder sel
;
5758 vec_perm_indices indices
;
5762 tree zero_vec
= build_zero_cst (vectype1
);
5764 for (offset = nelements/2; offset >= 1; offset/=2)
5766 Create: va' = vec_shift <va, offset>
5767 Create: va = vop <va, va'>
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_NOTE
, vect_location
,
5774 "Reduce using vector shifts\n");
5776 gimple_seq stmts
= NULL
;
5777 new_temp
= gimple_convert (&stmts
, vectype1
, new_temp
);
5778 for (elt_offset
= nelements
/ 2;
5782 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
5783 indices
.new_vector (sel
, 2, nelements
);
5784 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
5785 new_name
= gimple_build (&stmts
, VEC_PERM_EXPR
, vectype1
,
5786 new_temp
, zero_vec
, mask
);
5787 new_temp
= gimple_build (&stmts
, code
,
5788 vectype1
, new_name
, new_temp
);
5790 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5792 /* 2.4 Extract the final scalar result. Create:
5793 s_out3 = extract_field <v_out2, bitpos> */
5795 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_NOTE
, vect_location
,
5797 "extract scalar result\n");
5799 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
5800 bitsize
, bitsize_zero_node
);
5801 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5802 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5803 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5804 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5805 scalar_results
.safe_push (new_temp
);
5810 s = extract_field <v_out2, 0>
5811 for (offset = element_size;
5812 offset < vector_size;
5813 offset += element_size;)
5815 Create: s' = extract_field <v_out2, offset>
5816 Create: s = op <s, s'> // For non SLP cases
5819 if (dump_enabled_p ())
5820 dump_printf_loc (MSG_NOTE
, vect_location
,
5821 "Reduce using scalar code.\n");
5823 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5824 int element_bitsize
= tree_to_uhwi (bitsize
);
5825 tree compute_type
= TREE_TYPE (vectype
);
5826 gimple_seq stmts
= NULL
;
5827 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
5830 if (gimple_code (new_phi
) == GIMPLE_PHI
)
5831 vec_temp
= PHI_RESULT (new_phi
);
5833 vec_temp
= gimple_assign_lhs (new_phi
);
5834 new_temp
= gimple_build (&stmts
, BIT_FIELD_REF
, compute_type
,
5835 vec_temp
, bitsize
, bitsize_zero_node
);
5837 /* In SLP we don't need to apply reduction operation, so we just
5838 collect s' values in SCALAR_RESULTS. */
5840 scalar_results
.safe_push (new_temp
);
5842 for (bit_offset
= element_bitsize
;
5843 bit_offset
< vec_size_in_bits
;
5844 bit_offset
+= element_bitsize
)
5846 tree bitpos
= bitsize_int (bit_offset
);
5847 new_name
= gimple_build (&stmts
, BIT_FIELD_REF
,
5848 compute_type
, vec_temp
,
5852 /* In SLP we don't need to apply reduction operation, so
5853 we just collect s' values in SCALAR_RESULTS. */
5854 new_temp
= new_name
;
5855 scalar_results
.safe_push (new_name
);
5858 new_temp
= gimple_build (&stmts
, code
, compute_type
,
5859 new_name
, new_temp
);
5863 /* The only case where we need to reduce scalar results in SLP, is
5864 unrolling. If the size of SCALAR_RESULTS is greater than
5865 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5866 REDUC_GROUP_SIZE. */
5869 tree res
, first_res
, new_res
;
5871 /* Reduce multiple scalar results in case of SLP unrolling. */
5872 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
5875 first_res
= scalar_results
[j
% group_size
];
5876 new_res
= gimple_build (&stmts
, code
, compute_type
,
5878 scalar_results
[j
% group_size
] = new_res
;
5880 for (k
= 0; k
< group_size
; k
++)
5881 scalar_results
[k
] = gimple_convert (&stmts
, scalar_type
,
5886 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5887 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5888 scalar_results
.safe_push (new_temp
);
5891 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5894 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5897 /* Earlier we set the initial value to be a vector if induc_val
5898 values. Check the result and if it is induc_val then replace
5899 with the original initial value, unless induc_val is
5900 the same as initial_def already. */
5901 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5904 tree tmp
= make_ssa_name (new_scalar_dest
);
5905 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5906 initial_def
, new_temp
);
5907 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5908 scalar_results
[0] = tmp
;
5912 /* 2.5 Adjust the final result by the initial value of the reduction
5913 variable. (When such adjustment is not needed, then
5914 'adjustment_def' is zero). For example, if code is PLUS we create:
5915 new_temp = loop_exit_def + adjustment_def */
5919 gcc_assert (!slp_reduc
);
5920 gimple_seq stmts
= NULL
;
5921 if (nested_in_vect_loop
)
5923 new_phi
= new_phis
[0];
5924 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def
)));
5925 adjustment_def
= gimple_convert (&stmts
, vectype
, adjustment_def
);
5926 new_temp
= gimple_build (&stmts
, code
, vectype
,
5927 PHI_RESULT (new_phi
), adjustment_def
);
5931 new_temp
= scalar_results
[0];
5932 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
5933 adjustment_def
= gimple_convert (&stmts
, scalar_type
, adjustment_def
);
5934 new_temp
= gimple_build (&stmts
, code
, scalar_type
,
5935 new_temp
, adjustment_def
);
5938 epilog_stmt
= gimple_seq_last_stmt (stmts
);
5939 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5940 if (nested_in_vect_loop
)
5943 scalar_results
.quick_push (new_temp
);
5945 scalar_results
[0] = new_temp
;
5948 scalar_results
[0] = new_temp
;
5950 new_phis
[0] = epilog_stmt
;
5956 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5957 phis with new adjusted scalar results, i.e., replace use <s_out0>
5962 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5963 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5964 v_out2 = reduce <v_out1>
5965 s_out3 = extract_field <v_out2, 0>
5966 s_out4 = adjust_result <s_out3>
5973 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5974 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5975 v_out2 = reduce <v_out1>
5976 s_out3 = extract_field <v_out2, 0>
5977 s_out4 = adjust_result <s_out3>
5982 /* In SLP reduction chain we reduce vector results into one vector if
5983 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5984 LHS of the last stmt in the reduction chain, since we are looking for
5985 the loop exit phi node. */
5986 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5988 stmt_vec_info dest_stmt_info
5989 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
5990 scalar_dest
= gimple_assign_lhs (dest_stmt_info
->stmt
);
5994 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5995 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5996 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5997 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5998 correspond to the first vector stmt, etc.
5999 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
6000 if (group_size
> new_phis
.length ())
6001 gcc_assert (!(group_size
% new_phis
.length ()));
6003 for (k
= 0; k
< group_size
; k
++)
6007 stmt_vec_info scalar_stmt_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
6009 orig_stmt_info
= STMT_VINFO_RELATED_STMT (scalar_stmt_info
);
6010 /* SLP statements can't participate in patterns. */
6011 gcc_assert (!orig_stmt_info
);
6012 scalar_dest
= gimple_assign_lhs (scalar_stmt_info
->stmt
);
6015 if (nested_in_vect_loop
)
6024 /* Find the loop-closed-use at the loop exit of the original scalar
6025 result. (The reduction result is expected to have two immediate uses,
6026 one at the latch block, and one at the loop exit). For double
6027 reductions we are looking for exit phis of the outer loop. */
6028 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
6030 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
6032 if (!is_gimple_debug (USE_STMT (use_p
)))
6033 phis
.safe_push (USE_STMT (use_p
));
6037 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
6039 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
6041 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
6043 if (!flow_bb_inside_loop_p (loop
,
6044 gimple_bb (USE_STMT (phi_use_p
)))
6045 && !is_gimple_debug (USE_STMT (phi_use_p
)))
6046 phis
.safe_push (USE_STMT (phi_use_p
));
6052 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
6054 /* Replace the uses: */
6055 orig_name
= PHI_RESULT (exit_phi
);
6056 scalar_result
= scalar_results
[k
];
6057 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
6059 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
6060 SET_USE (use_p
, scalar_result
);
6061 update_stmt (use_stmt
);
6069 /* Return a vector of type VECTYPE that is equal to the vector select
6070 operation "MASK ? VEC : IDENTITY". Insert the select statements
6074 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
6075 tree vec
, tree identity
)
6077 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
6078 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
6079 mask
, vec
, identity
);
6080 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6084 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6085 order, starting with LHS. Insert the extraction statements before GSI and
6086 associate the new scalar SSA names with variable SCALAR_DEST.
6087 Return the SSA name for the result. */
6090 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
6091 tree_code code
, tree lhs
, tree vector_rhs
)
6093 tree vectype
= TREE_TYPE (vector_rhs
);
6094 tree scalar_type
= TREE_TYPE (vectype
);
6095 tree bitsize
= TYPE_SIZE (scalar_type
);
6096 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
6097 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
6099 for (unsigned HOST_WIDE_INT bit_offset
= 0;
6100 bit_offset
< vec_size_in_bits
;
6101 bit_offset
+= element_bitsize
)
6103 tree bitpos
= bitsize_int (bit_offset
);
6104 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
6107 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
6108 rhs
= make_ssa_name (scalar_dest
, stmt
);
6109 gimple_assign_set_lhs (stmt
, rhs
);
6110 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6112 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
6113 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
6114 gimple_assign_set_lhs (stmt
, new_name
);
6115 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6121 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6122 type of the vector input. */
6125 get_masked_reduction_fn (internal_fn reduc_fn
, tree vectype_in
)
6127 internal_fn mask_reduc_fn
;
6131 case IFN_FOLD_LEFT_PLUS
:
6132 mask_reduc_fn
= IFN_MASK_FOLD_LEFT_PLUS
;
6139 if (direct_internal_fn_supported_p (mask_reduc_fn
, vectype_in
,
6140 OPTIMIZE_FOR_SPEED
))
6141 return mask_reduc_fn
;
6145 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6146 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6147 statement. CODE is the operation performed by STMT_INFO and OPS are
6148 its scalar operands. REDUC_INDEX is the index of the operand in
6149 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6150 implements in-order reduction, or IFN_LAST if we should open-code it.
6151 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6152 that should be used to control the operation in a fully-masked loop. */
6155 vectorize_fold_left_reduction (loop_vec_info loop_vinfo
,
6156 stmt_vec_info stmt_info
,
6157 gimple_stmt_iterator
*gsi
,
6158 gimple
**vec_stmt
, slp_tree slp_node
,
6159 gimple
*reduc_def_stmt
,
6160 tree_code code
, internal_fn reduc_fn
,
6161 tree ops
[3], tree vectype_in
,
6162 int reduc_index
, vec_loop_masks
*masks
)
6164 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6165 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6166 internal_fn mask_reduc_fn
= get_masked_reduction_fn (reduc_fn
, vectype_in
);
6172 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6174 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
6175 gcc_assert (ncopies
== 1);
6176 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
6179 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
6180 TYPE_VECTOR_SUBPARTS (vectype_in
)));
6182 tree op0
= ops
[1 - reduc_index
];
6185 stmt_vec_info scalar_dest_def_info
;
6186 auto_vec
<tree
> vec_oprnds0
;
6189 auto_vec
<vec
<tree
> > vec_defs (2);
6190 vect_get_slp_defs (loop_vinfo
, slp_node
, &vec_defs
);
6191 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
6192 vec_defs
[0].release ();
6193 vec_defs
[1].release ();
6194 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
6195 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
6199 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
6201 scalar_dest_def_info
= stmt_info
;
6204 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def_info
->stmt
);
6205 tree scalar_type
= TREE_TYPE (scalar_dest
);
6206 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
6208 int vec_num
= vec_oprnds0
.length ();
6209 gcc_assert (vec_num
== 1 || slp_node
);
6210 tree vec_elem_type
= TREE_TYPE (vectype_out
);
6211 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
6213 tree vector_identity
= NULL_TREE
;
6214 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6215 vector_identity
= build_zero_cst (vectype_out
);
6217 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
6220 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
6223 tree mask
= NULL_TREE
;
6224 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6225 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
6227 /* Handle MINUS by adding the negative. */
6228 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
6230 tree negated
= make_ssa_name (vectype_out
);
6231 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
6232 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6236 if (mask
&& mask_reduc_fn
== IFN_LAST
)
6237 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
6240 /* On the first iteration the input is simply the scalar phi
6241 result, and for subsequent iterations it is the output of
6242 the preceding operation. */
6243 if (reduc_fn
!= IFN_LAST
|| (mask
&& mask_reduc_fn
!= IFN_LAST
))
6245 if (mask
&& mask_reduc_fn
!= IFN_LAST
)
6246 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 3, reduc_var
,
6249 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
,
6251 /* For chained SLP reductions the output of the previous reduction
6252 operation serves as the input of the next. For the final statement
6253 the output cannot be a temporary - we reuse the original
6254 scalar destination of the last statement. */
6255 if (i
!= vec_num
- 1)
6257 gimple_set_lhs (new_stmt
, scalar_dest_var
);
6258 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
6259 gimple_set_lhs (new_stmt
, reduc_var
);
6264 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
6266 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
6267 /* Remove the statement, so that we can use the same code paths
6268 as for statements that we've just created. */
6269 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
6270 gsi_remove (&tmp_gsi
, true);
6273 if (i
== vec_num
- 1)
6275 gimple_set_lhs (new_stmt
, scalar_dest
);
6276 vect_finish_replace_stmt (loop_vinfo
,
6277 scalar_dest_def_info
,
6281 vect_finish_stmt_generation (loop_vinfo
,
6282 scalar_dest_def_info
,
6286 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
6289 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
6290 *vec_stmt
= new_stmt
;
6297 /* Function is_nonwrapping_integer_induction.
6299 Check if STMT_VINO (which is part of loop LOOP) both increments and
6300 does not cause overflow. */
6303 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, class loop
*loop
)
6305 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
6306 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
6307 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
6308 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
6309 widest_int ni
, max_loop_value
, lhs_max
;
6310 wi::overflow_type overflow
= wi::OVF_NONE
;
6312 /* Make sure the loop is integer based. */
6313 if (TREE_CODE (base
) != INTEGER_CST
6314 || TREE_CODE (step
) != INTEGER_CST
)
6317 /* Check that the max size of the loop will not wrap. */
6319 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
6322 if (! max_stmt_executions (loop
, &ni
))
6325 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
6330 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
6331 TYPE_SIGN (lhs_type
), &overflow
);
6335 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
6336 <= TYPE_PRECISION (lhs_type
));
6339 /* Check if masking can be supported by inserting a conditional expression.
6340 CODE is the code for the operation. COND_FN is the conditional internal
6341 function, if it exists. VECTYPE_IN is the type of the vector input. */
6343 use_mask_by_cond_expr_p (enum tree_code code
, internal_fn cond_fn
,
6346 if (cond_fn
!= IFN_LAST
6347 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
6348 OPTIMIZE_FOR_SPEED
))
6362 /* Insert a conditional expression to enable masked vectorization. CODE is the
6363 code for the operation. VOP is the array of operands. MASK is the loop
6364 mask. GSI is a statement iterator used to place the new conditional
6367 build_vect_cond_expr (enum tree_code code
, tree vop
[3], tree mask
,
6368 gimple_stmt_iterator
*gsi
)
6374 tree vectype
= TREE_TYPE (vop
[1]);
6375 tree zero
= build_zero_cst (vectype
);
6376 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6377 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6378 mask
, vop
[1], zero
);
6379 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6380 vop
[1] = masked_op1
;
6386 tree vectype
= TREE_TYPE (vop
[1]);
6387 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6388 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6389 mask
, vop
[1], vop
[0]);
6390 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6391 vop
[1] = masked_op1
;
6400 /* Function vectorizable_reduction.
6402 Check if STMT_INFO performs a reduction operation that can be vectorized.
6403 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6404 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6405 Return true if STMT_INFO is vectorizable in this way.
6407 This function also handles reduction idioms (patterns) that have been
6408 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6409 may be of this form:
6410 X = pattern_expr (arg0, arg1, ..., X)
6411 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6412 sequence that had been detected and replaced by the pattern-stmt
6415 This function also handles reduction of condition expressions, for example:
6416 for (int i = 0; i < N; i++)
6419 This is handled by vectorising the loop and creating an additional vector
6420 containing the loop indexes for which "a[i] < value" was true. In the
6421 function epilogue this is reduced to a single max value and then used to
6422 index into the vector of results.
6424 In some cases of reduction patterns, the type of the reduction variable X is
6425 different than the type of the other arguments of STMT_INFO.
6426 In such cases, the vectype that is used when transforming STMT_INFO into
6427 a vector stmt is different than the vectype that is used to determine the
6428 vectorization factor, because it consists of a different number of elements
6429 than the actual number of elements that are being operated upon in parallel.
6431 For example, consider an accumulation of shorts into an int accumulator.
6432 On some targets it's possible to vectorize this pattern operating on 8
6433 shorts at a time (hence, the vectype for purposes of determining the
6434 vectorization factor should be V8HI); on the other hand, the vectype that
6435 is used to create the vector form is actually V4SI (the type of the result).
6437 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6438 indicates what is the actual level of parallelism (V8HI in the example), so
6439 that the right vectorization factor would be derived. This vectype
6440 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6441 be used to create the vectorized stmt. The right vectype for the vectorized
6442 stmt is obtained from the type of the result X:
6443 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6445 This means that, contrary to "regular" reductions (or "regular" stmts in
6446 general), the following equation:
6447 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6448 does *NOT* necessarily hold for reduction patterns. */
6451 vectorizable_reduction (loop_vec_info loop_vinfo
,
6452 stmt_vec_info stmt_info
, slp_tree slp_node
,
6453 slp_instance slp_node_instance
,
6454 stmt_vector_for_cost
*cost_vec
)
6457 tree vectype_in
= NULL_TREE
;
6458 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6459 enum vect_def_type cond_reduc_dt
= vect_unknown_def_type
;
6460 stmt_vec_info cond_stmt_vinfo
= NULL
;
6464 bool single_defuse_cycle
= false;
6465 bool nested_cycle
= false;
6466 bool double_reduc
= false;
6469 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
6470 tree cond_reduc_val
= NULL_TREE
;
6472 /* Make sure it was already recognized as a reduction computation. */
6473 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
6474 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
6475 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
6478 /* The stmt we store reduction analysis meta on. */
6479 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6480 reduc_info
->is_reduc_info
= true;
6482 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
6484 if (is_a
<gphi
*> (stmt_info
->stmt
))
6488 /* We eventually need to set a vector type on invariant
6492 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
6493 if (!vect_maybe_update_slp_op_vectype
6494 (child
, SLP_TREE_VECTYPE (slp_node
)))
6496 if (dump_enabled_p ())
6497 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6498 "incompatible vector types for "
6503 /* Analysis for double-reduction is done on the outer
6504 loop PHI, nested cycles have no further restrictions. */
6505 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
6508 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6512 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
6513 stmt_vec_info phi_info
= stmt_info
;
6514 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
6515 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
6517 if (!is_a
<gphi
*> (stmt_info
->stmt
))
6519 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6524 slp_node_instance
->reduc_phis
= slp_node
;
6525 /* ??? We're leaving slp_node to point to the PHIs, we only
6526 need it to get at the number of vector stmts which wasn't
6527 yet initialized for the instance root. */
6529 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
6530 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info
));
6531 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6533 use_operand_p use_p
;
6535 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
6538 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
6539 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
6543 /* PHIs should not participate in patterns. */
6544 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6545 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
6547 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6548 and compute the reduction chain length. Discover the real
6549 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6551 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6553 (gimple_bb (reduc_def_phi
)->loop_father
));
6554 unsigned reduc_chain_length
= 0;
6555 bool only_slp_reduc_chain
= true;
6557 slp_tree slp_for_stmt_info
= slp_node
? slp_node_instance
->root
: NULL
;
6558 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
6560 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
6561 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
6562 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
6564 if (dump_enabled_p ())
6565 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6566 "reduction chain broken by patterns.\n");
6569 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
6570 only_slp_reduc_chain
= false;
6571 /* ??? For epilogue generation live members of the chain need
6572 to point back to the PHI via their original stmt for
6573 info_for_reduction to work. */
6574 if (STMT_VINFO_LIVE_P (vdef
))
6575 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
6576 gassign
*assign
= dyn_cast
<gassign
*> (vdef
->stmt
);
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6581 "reduction chain includes calls.\n");
6584 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
6586 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign
)),
6587 TREE_TYPE (gimple_assign_rhs1 (assign
))))
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6591 "conversion in the reduction chain.\n");
6595 else if (!stmt_info
)
6596 /* First non-conversion stmt. */
6598 reduc_def
= gimple_op (vdef
->stmt
, 1 + STMT_VINFO_REDUC_IDX (vdef
));
6599 reduc_chain_length
++;
6600 if (!stmt_info
&& slp_node
)
6601 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
6603 /* PHIs should not participate in patterns. */
6604 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6606 if (nested_in_vect_loop_p (loop
, stmt_info
))
6609 nested_cycle
= true;
6612 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6614 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6616 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
6617 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
6619 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6620 gcc_assert (slp_node
6621 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
6623 /* 1. Is vectorizable reduction? */
6624 /* Not supportable if the reduction variable is used in the loop, unless
6625 it's a reduction chain. */
6626 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6627 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6630 /* Reductions that are not used even in an enclosing outer-loop,
6631 are expected to be "live" (used out of the loop). */
6632 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6633 && !STMT_VINFO_LIVE_P (stmt_info
))
6636 /* 2. Has this been recognized as a reduction pattern?
6638 Check if STMT represents a pattern that has been recognized
6639 in earlier analysis stages. For stmts that represent a pattern,
6640 the STMT_VINFO_RELATED_STMT field records the last stmt in
6641 the original sequence that constitutes the pattern. */
6643 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6646 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6647 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
6650 /* 3. Check the operands of the operation. The first operands are defined
6651 inside the loop body. The last operand is the reduction variable,
6652 which is defined by the loop-header-phi. */
6654 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6655 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
6656 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6657 enum tree_code code
= gimple_assign_rhs_code (stmt
);
6658 bool lane_reduc_code_p
6659 = (code
== DOT_PROD_EXPR
|| code
== WIDEN_SUM_EXPR
|| code
== SAD_EXPR
);
6660 int op_type
= TREE_CODE_LENGTH (code
);
6662 scalar_dest
= gimple_assign_lhs (stmt
);
6663 scalar_type
= TREE_TYPE (scalar_dest
);
6664 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
6665 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
6668 /* Do not try to vectorize bit-precision reductions. */
6669 if (!type_has_mode_precision_p (scalar_type
))
6672 /* For lane-reducing ops we're reducing the number of reduction PHIs
6673 which means the only use of that may be in the lane-reducing operation. */
6674 if (lane_reduc_code_p
6675 && reduc_chain_length
!= 1
6676 && !only_slp_reduc_chain
)
6678 if (dump_enabled_p ())
6679 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6680 "lane-reducing reduction with extra stmts.\n");
6684 /* All uses but the last are expected to be defined in the loop.
6685 The last use is the reduction variable. In case of nested cycle this
6686 assumption is not true: we use reduc_index to record the index of the
6687 reduction variable. */
6688 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op_type
);
6689 /* We need to skip an extra operand for COND_EXPRs with embedded
6691 unsigned opno_adjust
= 0;
6692 if (code
== COND_EXPR
6693 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
6695 for (i
= 0; i
< op_type
; i
++)
6697 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6698 if (i
== 0 && code
== COND_EXPR
)
6701 stmt_vec_info def_stmt_info
;
6702 enum vect_def_type dt
;
6704 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
6705 i
+ opno_adjust
, &op
, &slp_op
[i
], &dt
, &tem
,
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6710 "use not simple.\n");
6713 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
6716 /* There should be only one cycle def in the stmt, the one
6717 leading to reduc_def. */
6718 if (VECTORIZABLE_CYCLE_DEF (dt
))
6721 /* To properly compute ncopies we are interested in the widest
6722 non-reduction input type in case we're looking at a widening
6723 accumulation that we later handle in vect_transform_reduction. */
6724 if (lane_reduc_code_p
6727 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6728 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
))))))
6731 if (code
== COND_EXPR
)
6733 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6734 if (dt
== vect_constant_def
)
6737 cond_reduc_val
= op
;
6739 if (dt
== vect_induction_def
6741 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
6744 cond_stmt_vinfo
= def_stmt_info
;
6749 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
6750 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
6752 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
6753 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
6754 /* If we have a condition reduction, see if we can simplify it further. */
6755 if (v_reduc_type
== COND_REDUCTION
)
6760 /* When the condition uses the reduction value in the condition, fail. */
6761 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6765 "condition depends on previous iteration\n");
6769 if (reduc_chain_length
== 1
6770 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
6771 vectype_in
, OPTIMIZE_FOR_SPEED
))
6773 if (dump_enabled_p ())
6774 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6775 "optimizing condition reduction with"
6776 " FOLD_EXTRACT_LAST.\n");
6777 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
6779 else if (cond_reduc_dt
== vect_induction_def
)
6782 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
6783 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
6785 gcc_assert (TREE_CODE (base
) == INTEGER_CST
6786 && TREE_CODE (step
) == INTEGER_CST
);
6787 cond_reduc_val
= NULL_TREE
;
6788 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
6789 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
6790 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
6792 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6793 above base; punt if base is the minimum value of the type for
6794 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6795 else if (tree_int_cst_sgn (step
) == -1)
6797 cond_reduc_op_code
= MIN_EXPR
;
6798 if (tree_int_cst_sgn (base
) == -1)
6799 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6800 else if (tree_int_cst_lt (base
,
6801 TYPE_MAX_VALUE (TREE_TYPE (base
))))
6803 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
6807 cond_reduc_op_code
= MAX_EXPR
;
6808 if (tree_int_cst_sgn (base
) == 1)
6809 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6810 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
6813 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
6817 if (dump_enabled_p ())
6818 dump_printf_loc (MSG_NOTE
, vect_location
,
6819 "condition expression based on "
6820 "integer induction.\n");
6821 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
6822 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
6824 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
6827 else if (cond_reduc_dt
== vect_constant_def
)
6829 enum vect_def_type cond_initial_dt
;
6830 tree cond_initial_val
6831 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
, loop_preheader_edge (loop
));
6833 gcc_assert (cond_reduc_val
!= NULL_TREE
);
6834 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
6835 if (cond_initial_dt
== vect_constant_def
6836 && types_compatible_p (TREE_TYPE (cond_initial_val
),
6837 TREE_TYPE (cond_reduc_val
)))
6839 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
6840 cond_initial_val
, cond_reduc_val
);
6841 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_NOTE
, vect_location
,
6845 "condition expression based on "
6846 "compile time constant.\n");
6847 /* Record reduction code at analysis stage. */
6848 STMT_VINFO_REDUC_CODE (reduc_info
)
6849 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6850 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
6856 if (STMT_VINFO_LIVE_P (phi_info
))
6862 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6864 gcc_assert (ncopies
>= 1);
6866 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6870 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
6871 == vect_double_reduction_def
);
6872 double_reduc
= true;
6875 /* 4.2. Check support for the epilog operation.
6877 If STMT represents a reduction pattern, then the type of the
6878 reduction variable may be different than the type of the rest
6879 of the arguments. For example, consider the case of accumulation
6880 of shorts into an int accumulator; The original code:
6881 S1: int_a = (int) short_a;
6882 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6885 STMT: int_acc = widen_sum <short_a, int_acc>
6888 1. The tree-code that is used to create the vector operation in the
6889 epilog code (that reduces the partial results) is not the
6890 tree-code of STMT, but is rather the tree-code of the original
6891 stmt from the pattern that STMT is replacing. I.e, in the example
6892 above we want to use 'widen_sum' in the loop, but 'plus' in the
6894 2. The type (mode) we use to check available target support
6895 for the vector operation to be created in the *epilog*, is
6896 determined by the type of the reduction variable (in the example
6897 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6898 However the type (mode) we use to check available target support
6899 for the vector operation to be created *inside the loop*, is
6900 determined by the type of the other arguments to STMT (in the
6901 example we'd check this: optab_handler (widen_sum_optab,
6904 This is contrary to "regular" reductions, in which the types of all
6905 the arguments are the same as the type of the reduction variable.
6906 For "regular" reductions we can therefore use the same vector type
6907 (and also the same tree-code) when generating the epilog code and
6908 when generating the code inside the loop. */
6910 enum tree_code orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
6911 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
6913 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
6914 if (reduction_type
== TREE_CODE_REDUCTION
)
6916 /* Check whether it's ok to change the order of the computation.
6917 Generally, when vectorizing a reduction we change the order of the
6918 computation. This may change the behavior of the program in some
6919 cases, so we need to check that this is ok. One exception is when
6920 vectorizing an outer-loop: the inner-loop is executed sequentially,
6921 and therefore vectorizing reductions in the inner-loop during
6922 outer-loop vectorization is safe. Likewise when we are vectorizing
6923 a series of reductions using SLP and the VF is one the reductions
6924 are performed in scalar order. */
6926 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
6927 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1u))
6929 else if (needs_fold_left_reduction_p (scalar_type
, orig_code
))
6931 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6932 is not directy used in stmt. */
6933 if (!only_slp_reduc_chain
6934 && reduc_chain_length
!= 1)
6936 if (dump_enabled_p ())
6937 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6938 "in-order reduction chain without SLP.\n");
6941 STMT_VINFO_REDUC_TYPE (reduc_info
)
6942 = reduction_type
= FOLD_LEFT_REDUCTION
;
6944 else if (!commutative_tree_code (orig_code
)
6945 || !associative_tree_code (orig_code
))
6947 if (dump_enabled_p ())
6948 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6949 "reduction: not commutative/associative");
6954 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6959 "multiple types in double reduction or condition "
6960 "reduction or fold-left reduction.\n");
6964 internal_fn reduc_fn
= IFN_LAST
;
6965 if (reduction_type
== TREE_CODE_REDUCTION
6966 || reduction_type
== FOLD_LEFT_REDUCTION
6967 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
6968 || reduction_type
== CONST_COND_REDUCTION
)
6970 if (reduction_type
== FOLD_LEFT_REDUCTION
6971 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
6972 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
6974 if (reduc_fn
!= IFN_LAST
6975 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
6976 OPTIMIZE_FOR_SPEED
))
6978 if (dump_enabled_p ())
6979 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6980 "reduc op not supported by target.\n");
6982 reduc_fn
= IFN_LAST
;
6987 if (!nested_cycle
|| double_reduc
)
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6991 "no reduc code for scalar code.\n");
6997 else if (reduction_type
== COND_REDUCTION
)
6999 int scalar_precision
7000 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
7001 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
7002 cr_index_vector_type
= get_same_sized_vectype (cr_index_scalar_type
,
7005 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
7006 OPTIMIZE_FOR_SPEED
))
7007 reduc_fn
= IFN_REDUC_MAX
;
7009 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
7011 if (reduction_type
!= EXTRACT_LAST_REDUCTION
7012 && (!nested_cycle
|| double_reduc
)
7013 && reduc_fn
== IFN_LAST
7014 && !nunits_out
.is_constant ())
7016 if (dump_enabled_p ())
7017 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7018 "missing target support for reduction on"
7019 " variable-length vectors.\n");
7023 /* For SLP reductions, see if there is a neutral value we can use. */
7024 tree neutral_op
= NULL_TREE
;
7026 neutral_op
= neutral_op_for_slp_reduction
7027 (slp_node_instance
->reduc_phis
, vectype_out
, orig_code
,
7028 REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
);
7030 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
7032 /* We can't support in-order reductions of code such as this:
7034 for (int i = 0; i < n1; ++i)
7035 for (int j = 0; j < n2; ++j)
7038 since GCC effectively transforms the loop when vectorizing:
7040 for (int i = 0; i < n1 / VF; ++i)
7041 for (int j = 0; j < n2; ++j)
7042 for (int k = 0; k < VF; ++k)
7045 which is a reassociation of the original operation. */
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7048 "in-order double reduction not supported.\n");
7053 if (reduction_type
== FOLD_LEFT_REDUCTION
7055 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7057 /* We cannot use in-order reductions in this case because there is
7058 an implicit reassociation of the operations involved. */
7059 if (dump_enabled_p ())
7060 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7061 "in-order unchained SLP reductions not supported.\n");
7065 /* For double reductions, and for SLP reductions with a neutral value,
7066 we construct a variable-length initial vector by loading a vector
7067 full of the neutral value and then shift-and-inserting the start
7068 values into the low-numbered elements. */
7069 if ((double_reduc
|| neutral_op
)
7070 && !nunits_out
.is_constant ()
7071 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
7072 vectype_out
, OPTIMIZE_FOR_SPEED
))
7074 if (dump_enabled_p ())
7075 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7076 "reduction on variable-length vectors requires"
7077 " target support for a vector-shift-and-insert"
7082 /* Check extra constraints for variable-length unchained SLP reductions. */
7083 if (STMT_SLP_TYPE (stmt_info
)
7084 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7085 && !nunits_out
.is_constant ())
7087 /* We checked above that we could build the initial vector when
7088 there's a neutral element value. Check here for the case in
7089 which each SLP statement has its own initial value and in which
7090 that value needs to be repeated for every instance of the
7091 statement within the initial vector. */
7092 unsigned int group_size
= SLP_TREE_LANES (slp_node
);
7094 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
7095 TREE_TYPE (vectype_out
)))
7097 if (dump_enabled_p ())
7098 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7099 "unsupported form of SLP reduction for"
7100 " variable-length vectors: cannot build"
7101 " initial vector.\n");
7104 /* The epilogue code relies on the number of elements being a multiple
7105 of the group size. The duplicate-and-interleave approach to setting
7106 up the initial vector does too. */
7107 if (!multiple_p (nunits_out
, group_size
))
7109 if (dump_enabled_p ())
7110 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7111 "unsupported form of SLP reduction for"
7112 " variable-length vectors: the vector size"
7113 " is not a multiple of the number of results.\n");
7118 if (reduction_type
== COND_REDUCTION
)
7122 if (! max_loop_iterations (loop
, &ni
))
7124 if (dump_enabled_p ())
7125 dump_printf_loc (MSG_NOTE
, vect_location
,
7126 "loop count not known, cannot create cond "
7130 /* Convert backedges to iterations. */
7133 /* The additional index will be the same type as the condition. Check
7134 that the loop can fit into this less one (because we'll use up the
7135 zero slot for when there are no matches). */
7136 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
7137 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
7139 if (dump_enabled_p ())
7140 dump_printf_loc (MSG_NOTE
, vect_location
,
7141 "loop size is greater than data size.\n");
7146 /* In case the vectorization factor (VF) is bigger than the number
7147 of elements that we can fit in a vectype (nunits), we have to generate
7148 more than one vector stmt - i.e - we need to "unroll" the
7149 vector stmt by a factor VF/nunits. For more details see documentation
7150 in vectorizable_operation. */
7152 /* If the reduction is used in an outer loop we need to generate
7153 VF intermediate results, like so (e.g. for ncopies=2):
7158 (i.e. we generate VF results in 2 registers).
7159 In this case we have a separate def-use cycle for each copy, and therefore
7160 for each copy we get the vector def for the reduction variable from the
7161 respective phi node created for this copy.
7163 Otherwise (the reduction is unused in the loop nest), we can combine
7164 together intermediate results, like so (e.g. for ncopies=2):
7168 (i.e. we generate VF/2 results in a single register).
7169 In this case for each copy we get the vector def for the reduction variable
7170 from the vectorized reduction operation generated in the previous iteration.
7172 This only works when we see both the reduction PHI and its only consumer
7173 in vectorizable_reduction and there are no intermediate stmts
7176 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
7177 && reduc_chain_length
== 1)
7178 single_defuse_cycle
= true;
7180 if (single_defuse_cycle
|| lane_reduc_code_p
)
7182 gcc_assert (code
!= COND_EXPR
);
7184 /* 4. Supportable by target? */
7187 /* 4.1. check support for the operation in the loop */
7188 optab optab
= optab_for_tree_code (code
, vectype_in
, optab_vector
);
7191 if (dump_enabled_p ())
7192 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7197 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
7198 if (ok
&& optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
7200 if (dump_enabled_p ())
7201 dump_printf (MSG_NOTE
, "op not supported by target.\n");
7202 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
7203 || !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
7206 if (dump_enabled_p ())
7207 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
7210 /* Worthwhile without SIMD support? */
7212 && !VECTOR_MODE_P (TYPE_MODE (vectype_in
))
7213 && !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
7215 if (dump_enabled_p ())
7216 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7217 "not worthwhile without SIMD support.\n");
7221 /* lane-reducing operations have to go through vect_transform_reduction.
7222 For the other cases try without the single cycle optimization. */
7225 if (lane_reduc_code_p
)
7228 single_defuse_cycle
= false;
7231 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
7233 /* If the reduction stmt is one of the patterns that have lane
7234 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7235 if ((ncopies
> 1 && ! single_defuse_cycle
)
7236 && lane_reduc_code_p
)
7238 if (dump_enabled_p ())
7239 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7240 "multi def-use cycle not possible for lane-reducing "
7241 "reduction operation\n");
7246 && !(!single_defuse_cycle
7247 && code
!= DOT_PROD_EXPR
7248 && code
!= WIDEN_SUM_EXPR
7250 && reduction_type
!= FOLD_LEFT_REDUCTION
))
7251 for (i
= 0; i
< op_type
; i
++)
7252 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_in
))
7254 if (dump_enabled_p ())
7255 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7256 "incompatible vector types for invariants\n");
7261 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7265 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
7266 reduction_type
, ncopies
, cost_vec
);
7267 /* Cost the reduction op inside the loop if transformed via
7268 vect_transform_reduction. Otherwise this is costed by the
7269 separate vectorizable_* routines. */
7270 if (single_defuse_cycle
7271 || code
== DOT_PROD_EXPR
7272 || code
== WIDEN_SUM_EXPR
7273 || code
== SAD_EXPR
)
7274 record_stmt_cost (cost_vec
, ncopies
, vector_stmt
, stmt_info
, 0, vect_body
);
7276 if (dump_enabled_p ()
7277 && reduction_type
== FOLD_LEFT_REDUCTION
)
7278 dump_printf_loc (MSG_NOTE
, vect_location
,
7279 "using an in-order (fold-left) reduction.\n");
7280 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
7281 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7282 reductions go through their own vectorizable_* routines. */
7283 if (!single_defuse_cycle
7284 && code
!= DOT_PROD_EXPR
7285 && code
!= WIDEN_SUM_EXPR
7287 && reduction_type
!= FOLD_LEFT_REDUCTION
)
7290 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
7291 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
7293 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
7294 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
7296 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
7297 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
7299 else if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
7301 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7302 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7304 if (reduction_type
!= FOLD_LEFT_REDUCTION
7305 && !use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
)
7306 && (cond_fn
== IFN_LAST
7307 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7308 OPTIMIZE_FOR_SPEED
)))
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7312 "can't operate on partial vectors because"
7313 " no conditional operation is available.\n");
7314 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7316 else if (reduction_type
== FOLD_LEFT_REDUCTION
7317 && reduc_fn
== IFN_LAST
7318 && !expand_vec_cond_expr_p (vectype_in
,
7319 truth_type_for (vectype_in
),
7322 if (dump_enabled_p ())
7323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7324 "can't operate on partial vectors because"
7325 " no conditional operation is available.\n");
7326 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7329 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
/* NOTE(review): this chunk is an extraction-damaged copy of GCC's
   tree-vect-loop.c -- statements are split across physical lines, the
   upstream file's line numbers (e.g. "7339") are embedded as stray
   tokens, and some lines (braces, statement tails) are missing.
   Code is left byte-identical below; only comments were added.
   Verify any edit against the upstream tree-vect-loop.c.  */
/* Purpose (from the visible code): emit the vector statements for the
   reduction operation STMT_INFO, storing them in STMT_VINFO_VEC_STMTS /
   SLP_TREE_VEC_STMTS and returning the first one via *VEC_STMT.  */
7335 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7339 vect_transform_reduction (loop_vec_info loop_vinfo
,
7340 stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
7341 gimple
**vec_stmt
, slp_tree slp_node
)
7343 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7344 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7349 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7350 gcc_assert (reduc_info
->is_reduc_info
);
/* Nested cycles must have been classified as double reductions by
   analysis; assert that here.  */
7352 if (nested_in_vect_loop_p (loop
, stmt_info
))
7355 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
7358 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
7359 enum tree_code code
= gimple_assign_rhs_code (stmt
);
7360 int op_type
= TREE_CODE_LENGTH (code
);
/* Collect the scalar operands of the reduction statement; the ternary
   case falls through to also pick up rhs1/rhs2.  */
7364 switch (get_gimple_rhs_class (code
))
7366 case GIMPLE_TERNARY_RHS
:
7367 ops
[2] = gimple_assign_rhs3 (stmt
);
7369 case GIMPLE_BINARY_RHS
:
7370 ops
[0] = gimple_assign_rhs1 (stmt
);
7371 ops
[1] = gimple_assign_rhs2 (stmt
);
7377 /* All uses but the last are expected to be defined in the loop.
7378 The last use is the reduction variable. In case of nested cycle this
7379 assumption is not true: we use reduc_index to record the index of the
7380 reduction variable. */
7381 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
7382 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
7383 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
7384 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
/* SLP path computes vec_num from the SLP node; non-SLP computes
   ncopies from the input vector type (surrounding if/else lost in the
   extraction -- TODO confirm against upstream).  */
7389 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7393 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7397 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7398 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7399 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
7402 tree new_temp
= NULL_TREE
;
7403 auto_vec
<tree
> vec_oprnds0
;
7404 auto_vec
<tree
> vec_oprnds1
;
7405 auto_vec
<tree
> vec_oprnds2
;
7408 if (dump_enabled_p ())
7409 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7411 /* FORNOW: Multiple types are not supported for condition. */
7412 if (code
== COND_EXPR
)
7413 gcc_assert (ncopies
== 1);
7415 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
/* Fold-left (in-order) reductions are handled entirely by a dedicated
   routine; this function only handles the reassociating forms.  */
7417 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7418 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7420 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
7421 return vectorize_fold_left_reduction
7422 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
7423 reduc_fn
, ops
, vectype_in
, reduc_index
, masks
);
7426 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
7427 gcc_assert (single_defuse_cycle
7428 || code
== DOT_PROD_EXPR
7429 || code
== WIDEN_SUM_EXPR
7430 || code
== SAD_EXPR
);
7432 /* Create the destination vector */
7433 tree scalar_dest
= gimple_assign_lhs (stmt
);
7434 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
/* Get the vector defs for the non-reduction operands; the operand at
   reduc_index is skipped (NULL_TREE) in the single-def-use-cycle case
   because its defs come from the PHI instead (fetched just below).  */
7436 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
7437 single_defuse_cycle
&& reduc_index
== 0
7438 ? NULL_TREE
: ops
[0], &vec_oprnds0
,
7439 single_defuse_cycle
&& reduc_index
== 1
7440 ? NULL_TREE
: ops
[1], &vec_oprnds1
,
7441 op_type
== ternary_op
7442 && !(single_defuse_cycle
&& reduc_index
== 2)
7443 ? ops
[2] : NULL_TREE
, &vec_oprnds2
);
7444 if (single_defuse_cycle
)
7446 gcc_assert (!slp_node
);
7447 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7449 reduc_index
== 0 ? &vec_oprnds0
7450 : (reduc_index
== 1 ? &vec_oprnds1
/* Main emission loop: one vector statement per vector def.  */
7454 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7457 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
/* Fully-masked path without COND_EXPR masking: wrap the operation in
   a conditional internal function call (cond_fn) so inactive lanes
   pass the accumulator vop[0] through unchanged.  */
7458 if (masked_loop_p
&& !mask_by_cond_expr
)
7460 /* Make sure that the reduction accumulator is vop[0]. */
7461 if (reduc_index
== 1)
7463 gcc_assert (commutative_tree_code (code
));
7464 std::swap (vop
[0], vop
[1]);
7466 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7468 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7469 vop
[0], vop
[1], vop
[0]);
7470 new_temp
= make_ssa_name (vec_dest
, call
);
7471 gimple_call_set_lhs (call
, new_temp
);
7472 gimple_call_set_nothrow (call
, true);
7473 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, call
, gsi
);
7478 if (op_type
== ternary_op
)
7479 vop
[2] = vec_oprnds2
[i
];
/* Masked path via COND_EXPR: fold the loop mask into the operands
   before emitting a plain assignment.  */
7481 if (masked_loop_p
&& mask_by_cond_expr
)
7483 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7485 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7488 new_stmt
= gimple_build_assign (vec_dest
, code
,
7489 vop
[0], vop
[1], vop
[2]);
7490 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7491 gimple_assign_set_lhs (new_stmt
, new_temp
);
7492 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7496 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
/* Single def-use cycle: feed this copy's result back as the
   reduction operand of the next copy.  */
7497 else if (single_defuse_cycle
7500 if (reduc_index
== 0)
7501 vec_oprnds0
.safe_push (gimple_get_lhs (new_stmt
));
7502 else if (reduc_index
== 1)
7503 vec_oprnds1
.safe_push (gimple_get_lhs (new_stmt
));
7504 else if (reduc_index
== 2)
7505 vec_oprnds2
.safe_push (gimple_get_lhs (new_stmt
));
7508 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
7512 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
/* NOTE(review): extraction-damaged text (split lines, embedded upstream
   line numbers, some lines missing).  Code left byte-identical; comments
   only.  Verify edits against upstream tree-vect-loop.c.  */
/* Purpose (from the visible code): create the vectorized reduction PHI
   nodes for a cycle PHI, computing the loop-entry (initial) vector
   defs; the latch arguments are filled in later (see comment near the
   end).  */
7517 /* Transform phase of a cycle PHI. */
7520 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
7521 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
7522 slp_tree slp_node
, slp_instance slp_node_instance
)
7524 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7525 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7529 bool nested_cycle
= false;
7532 if (nested_in_vect_loop_p (loop
, stmt_info
))
7535 nested_cycle
= true;
7538 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
7539 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
7540 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7541 gcc_assert (reduc_info
->is_reduc_info
);
/* Extract-last and fold-left reductions keep the scalar PHI: nothing
   to transform here.  */
7543 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
7544 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
7545 /* Leave the scalar phi in place. */
7548 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7549 /* For a nested cycle we do not fill the above. */
7551 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
7552 gcc_assert (vectype_in
);
7556 /* The size vect_schedule_slp_instance computes is off for us. */
7557 vec_num
= vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7558 * SLP_TREE_LANES (slp_node
), vectype_in
);
7564 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7567 /* Check whether we should use a single PHI node and accumulate
7568 vectors to one before the backedge. */
7569 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
7572 /* Create the destination vector */
7573 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
7574 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
7577 /* Get the loop-entry arguments. */
7578 tree vec_initial_def
;
7579 auto_vec
<tree
> vec_initial_defs
;
/* SLP path: the initial defs come from the SLP child on the
   preheader edge, possibly seeded with a neutral value for chained
   group reductions.  */
7582 vec_initial_defs
.reserve (vec_num
);
7585 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
7586 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
7591 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
7592 stmt_vec_info first
= REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
);
7594 = neutral_op_for_slp_reduction (slp_node
, vectype_out
,
7595 STMT_VINFO_REDUC_CODE (reduc_info
),
7597 get_initial_defs_for_reduction (loop_vinfo
, slp_node_instance
->reduc_phis
,
7598 &vec_initial_defs
, vec_num
,
7599 first
!= NULL
, neutral_op
);
7604 /* Get at the scalar def before the loop, that defines the initial
7605 value of the reduction variable. */
7606 tree initial_def
= PHI_ARG_DEF_FROM_EDGE (phi
,
7607 loop_preheader_edge (loop
));
7608 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7609 and we can't use zero for induc_val, use initial_def. Similarly
7610 for REDUC_MIN and initial_def larger than the base. */
7611 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
7613 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
7614 if (TREE_CODE (initial_def
) == INTEGER_CST
7615 && !integer_zerop (induc_val
)
7616 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
7617 && tree_int_cst_lt (initial_def
, induc_val
))
7618 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
7619 && tree_int_cst_lt (induc_val
, initial_def
))))
7621 induc_val
= initial_def
;
7622 /* Communicate we used the initial_def to epilogue
7624 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
7626 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
7627 vec_initial_defs
.create (ncopies
);
7628 for (i
= 0; i
< ncopies
; ++i
)
7629 vec_initial_defs
.quick_push (vec_initial_def
);
7631 else if (nested_cycle
)
7633 /* Do not use an adjustment def as that case is not supported
7634 correctly if ncopies is not one. */
7635 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
7636 ncopies
, initial_def
,
/* Default (non-SLP, non-nested) case: build one initial def and an
   optional epilogue adjustment; double reductions must not use the
   adjustment (adjustment_defp forced to NULL below).  */
7641 tree adjustment_def
= NULL_TREE
;
7642 tree
*adjustment_defp
= &adjustment_def
;
7643 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7644 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
7645 adjustment_defp
= NULL
;
7647 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
, code
,
7648 initial_def
, adjustment_defp
);
7649 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
) = adjustment_def
;
7650 vec_initial_defs
.create (ncopies
);
7651 for (i
= 0; i
< ncopies
; ++i
)
7652 vec_initial_defs
.quick_push (vec_initial_def
);
7656 /* Generate the reduction PHIs upfront. */
7657 for (i
= 0; i
< vec_num
; i
++)
7659 tree vec_init_def
= vec_initial_defs
[i
];
7660 for (j
= 0; j
< ncopies
; j
++)
7662 /* Create the reduction-phi that defines the reduction
7664 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
7666 /* Set the loop-entry arg of the reduction-phi. */
7667 if (j
!= 0 && nested_cycle
)
7668 vec_init_def
= vec_initial_defs
[j
];
7669 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
7672 /* The loop-latch arg is set in epilogue processing. */
7675 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7679 *vec_stmt
= new_phi
;
7680 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
/* NOTE(review): extraction-damaged text (split lines, embedded upstream
   line numbers, some lines missing).  Code left byte-identical; comments
   only.  Verify edits against upstream tree-vect-loop.c.  */
/* Purpose (from the visible code): analyze (!vec_stmt) or transform a
   single-argument loop-closed PHI into vectorized PHI nodes in the same
   basic block, one per vector def of its sole argument.  */
7688 /* Vectorizes LC PHIs. */
7691 vectorizable_lc_phi (loop_vec_info loop_vinfo
,
7692 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
/* Only handles PHIs with exactly one argument (loop-closed form).  */
7696 || !is_a
<gphi
*> (stmt_info
->stmt
)
7697 || gimple_phi_num_args (stmt_info
->stmt
) != 1)
7700 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
7701 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
/* Analysis phase: check operand vector types, record the stmt kind.  */
7704 if (!vec_stmt
) /* transformation not required. */
7706 /* Deal with copies from externs or constants that disguise as
7707 loop-closed PHI nodes (PR97886). */
7709 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node
)[0],
7710 SLP_TREE_VECTYPE (slp_node
)))
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7714 "incompatible vector types for invariants\n");
7717 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
/* Transform phase: one vectorized PHI per vector def of the single
   argument, each fed through the block's single predecessor edge.  */
7721 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7722 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7723 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7724 edge e
= single_pred_edge (bb
);
7725 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7726 auto_vec
<tree
> vec_oprnds
;
7727 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
,
7728 !slp_node
? vect_get_num_copies (loop_vinfo
, vectype
) : 1,
7729 gimple_phi_arg_def (stmt_info
->stmt
, 0), &vec_oprnds
);
7730 for (unsigned i
= 0; i
< vec_oprnds
.length (); i
++)
7732 /* Create the vectorized LC PHI node. */
7733 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
7734 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
7736 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
7738 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
7741 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
/* NOTE(review): extraction-damaged text (split lines, embedded upstream
   line numbers, some lines missing).  Code left byte-identical; comments
   only.  Verify edits against upstream tree-vect-loop.c.  */
/* Purpose (from the visible code): analyze (!vec_stmt) or transform a
   general (SLP-only) PHI node, creating one vectorized PHI per vector
   def and adding the argument for each incoming edge; not-yet-vectorized
   backedge defs are skipped and filled in later.  */
7746 /* Vectorizes PHIs. */
7749 vectorizable_phi (vec_info
*,
7750 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
7751 slp_tree slp_node
, stmt_vector_for_cost
*cost_vec
)
/* SLP-only: bail out for non-PHIs or when no SLP node is given.  */
7753 if (!is_a
<gphi
*> (stmt_info
->stmt
) || !slp_node
)
7756 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
)
7759 tree vectype
= SLP_TREE_VECTYPE (slp_node
);
/* Analysis phase: check each child's vector type and record cost.  */
7761 if (!vec_stmt
) /* transformation not required. */
7765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), i
, child
)
7768 if (dump_enabled_p ())
7769 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7770 "PHI node with unvectorized backedge def\n");
7773 else if (!vect_maybe_update_slp_op_vectype (child
, vectype
))
7775 if (dump_enabled_p ())
7776 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7777 "incompatible vector types for invariants\n");
7780 record_stmt_cost (cost_vec
, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
7781 vector_stmt
, stmt_info
, vectype
, 0, vect_body
);
7782 STMT_VINFO_TYPE (stmt_info
) = phi_info_type
;
/* Transform phase.  */
7786 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7787 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7788 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7789 auto_vec
<gphi
*> new_phis
;
7790 for (unsigned i
= 0; i
< gimple_phi_num_args (stmt_info
->stmt
); ++i
)
7792 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
7794 /* Skip not yet vectorized defs. */
7795 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
7796 && SLP_TREE_VEC_STMTS (child
).is_empty ())
7799 auto_vec
<tree
> vec_oprnds
;
7800 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[i
], &vec_oprnds
);
/* The PHI nodes themselves are created lazily on the first child
   that already has vector defs.  */
7801 if (!new_phis
.exists ())
7803 new_phis
.create (vec_oprnds
.length ());
7804 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
7806 /* Create the vectorized LC PHI node. */
7807 new_phis
.quick_push (create_phi_node (vec_dest
, bb
));
7808 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phis
[j
]);
7811 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (stmt_info
->stmt
), i
);
7812 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
7813 add_phi_arg (new_phis
[j
], vec_oprnds
[j
], e
, UNKNOWN_LOCATION
);
7815 /* We should have at least one already vectorized child. */
7816 gcc_assert (new_phis
.exists ());
/* NOTE(review): the body of this function (upstream lines ~7829-7846)
   is missing from this extraction -- only the header comment and the
   signature survived.  Restore the body from upstream tree-vect-loop.c
   before editing; do not reconstruct it from memory.  */
7822 /* Function vect_min_worthwhile_factor.
7824 For a loop where we could vectorize the operation indicated by CODE,
7825 return the minimum vectorization factor that makes it worthwhile
7826 to use generic vectors. */
7828 vect_min_worthwhile_factor (enum tree_code code
)
/* NOTE(review): extraction-damaged text (split lines, embedded upstream
   line numbers; part of the return expression's opening is missing).
   Code left byte-identical; comments only.  */
7848 /* Return true if VINFO indicates we are doing loop vectorization and if
7849 it is worth decomposing CODE operations into scalar operations for
7850 that loop's vectorization factor. */
7853 vect_worthwhile_without_simd_p (vec_info
*vinfo
, tree_code code
)
7855 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7856 unsigned HOST_WIDE_INT value
;
/* Requires a loop_vec_info with a compile-time-constant vectorization
   factor at least vect_min_worthwhile_factor (CODE); variable-length
   (poly) VFs fail the is_constant test and yield false.  */
7858 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&value
)
7859 && value
>= vect_min_worthwhile_factor (code
));
7862 /* Function vectorizable_induction
7864 Check if STMT_INFO performs an induction computation that can be vectorized.
7865 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7866 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7867 Return true if STMT_INFO is vectorizable in this way. */
7870 vectorizable_induction (loop_vec_info loop_vinfo
,
7871 stmt_vec_info stmt_info
,
7872 gimple
**vec_stmt
, slp_tree slp_node
,
7873 stmt_vector_for_cost
*cost_vec
)
7875 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7877 bool nested_in_vect_loop
= false;
7878 class loop
*iv_loop
;
7880 edge pe
= loop_preheader_edge (loop
);
7882 tree new_vec
, vec_init
, vec_step
, t
;
7885 gphi
*induction_phi
;
7886 tree induc_def
, vec_dest
;
7887 tree init_expr
, step_expr
;
7888 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
7891 gimple_stmt_iterator si
;
7893 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
7897 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7900 /* Make sure it was recognized as induction computation. */
7901 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
7904 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7905 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7910 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7911 gcc_assert (ncopies
>= 1);
7913 /* FORNOW. These restrictions should be relaxed. */
7914 if (nested_in_vect_loop_p (loop
, stmt_info
))
7916 imm_use_iterator imm_iter
;
7917 use_operand_p use_p
;
7924 if (dump_enabled_p ())
7925 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7926 "multiple types in nested loop.\n");
7931 latch_e
= loop_latch_edge (loop
->inner
);
7932 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7933 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7935 gimple
*use_stmt
= USE_STMT (use_p
);
7936 if (is_gimple_debug (use_stmt
))
7939 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
7941 exit_phi
= use_stmt
;
7947 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7948 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
7949 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
7951 if (dump_enabled_p ())
7952 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7953 "inner-loop induction only used outside "
7954 "of the outer vectorized loop.\n");
7959 nested_in_vect_loop
= true;
7960 iv_loop
= loop
->inner
;
7964 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
7966 if (slp_node
&& !nunits
.is_constant ())
7968 /* The current SLP code creates the step value element-by-element. */
7969 if (dump_enabled_p ())
7970 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7971 "SLP induction not supported for variable-length"
7976 if (!vec_stmt
) /* transformation not required. */
7978 unsigned inside_cost
= 0, prologue_cost
= 0;
7981 /* We eventually need to set a vector type on invariant
7985 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
7986 if (!vect_maybe_update_slp_op_vectype
7987 (child
, SLP_TREE_VECTYPE (slp_node
)))
7989 if (dump_enabled_p ())
7990 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7991 "incompatible vector types for "
7995 /* loop cost for vec_loop. */
7997 = record_stmt_cost (cost_vec
,
7998 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
7999 vector_stmt
, stmt_info
, 0, vect_body
);
8000 /* prologue cost for vec_init (if not nested) and step. */
8001 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
8003 stmt_info
, 0, vect_prologue
);
8005 else /* if (!slp_node) */
8007 /* loop cost for vec_loop. */
8008 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
8009 stmt_info
, 0, vect_body
);
8010 /* prologue cost for vec_init and vec_step. */
8011 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
8012 stmt_info
, 0, vect_prologue
);
8014 if (dump_enabled_p ())
8015 dump_printf_loc (MSG_NOTE
, vect_location
,
8016 "vect_model_induction_cost: inside_cost = %d, "
8017 "prologue_cost = %d .\n", inside_cost
,
8020 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
8021 DUMP_VECT_SCOPE ("vectorizable_induction");
8027 /* Compute a vector variable, initialized with the first VF values of
8028 the induction variable. E.g., for an iv with IV_PHI='X' and
8029 evolution S, for a vector of 4 units, we want to compute:
8030 [X, X + S, X + 2*S, X + 3*S]. */
8032 if (dump_enabled_p ())
8033 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
8035 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
8036 gcc_assert (step_expr
!= NULL_TREE
);
8037 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
8039 pe
= loop_preheader_edge (iv_loop
);
8040 /* Find the first insertion point in the BB. */
8041 basic_block bb
= gimple_bb (phi
);
8042 si
= gsi_after_labels (bb
);
8044 /* For SLP induction we have to generate several IVs as for example
8045 with group size 3 we need
8046 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8047 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8050 /* Enforced above. */
8051 unsigned int const_nunits
= nunits
.to_constant ();
8053 /* The initial values are vectorized, but any lanes > group_size
8056 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
8058 /* Gather steps. Since we do not vectorize inductions as
8059 cycles we have to reconstruct the step from SCEV data. */
8060 unsigned group_size
= SLP_TREE_LANES (slp_node
);
8061 tree
*steps
= XALLOCAVEC (tree
, group_size
);
8062 tree
*inits
= XALLOCAVEC (tree
, group_size
);
8063 stmt_vec_info phi_info
;
8064 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
8066 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
8068 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
8072 /* Now generate the IVs. */
8073 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8074 gcc_assert ((const_nunits
* nvects
) % group_size
== 0);
8076 if (nested_in_vect_loop
)
8080 /* Compute the number of distinct IVs we need. First reduce
8081 group_size if it is a multiple of const_nunits so we get
8082 one IV for a group_size of 4 but const_nunits 2. */
8083 unsigned group_sizep
= group_size
;
8084 if (group_sizep
% const_nunits
== 0)
8085 group_sizep
= group_sizep
/ const_nunits
;
8086 nivs
= least_common_multiple (group_sizep
,
8087 const_nunits
) / const_nunits
;
8089 tree stept
= TREE_TYPE (step_vectype
);
8090 tree lupdate_mul
= NULL_TREE
;
8091 if (!nested_in_vect_loop
)
8093 /* The number of iterations covered in one vector iteration. */
8094 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
8096 = build_vector_from_val (step_vectype
,
8097 SCALAR_FLOAT_TYPE_P (stept
)
8098 ? build_real_from_wide (stept
, lup_mul
,
8100 : build_int_cstu (stept
, lup_mul
));
8102 tree peel_mul
= NULL_TREE
;
8103 gimple_seq init_stmts
= NULL
;
8104 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
8106 if (SCALAR_FLOAT_TYPE_P (stept
))
8107 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
8108 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8110 peel_mul
= gimple_convert (&init_stmts
, stept
,
8111 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
8112 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
8113 step_vectype
, peel_mul
);
8116 auto_vec
<tree
> vec_steps
;
8117 for (ivn
= 0; ivn
< nivs
; ++ivn
)
8119 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
8120 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
8121 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
8122 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
8124 /* The scalar steps of the IVs. */
8125 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
8126 elt
= gimple_convert (&init_stmts
, TREE_TYPE (step_vectype
), elt
);
8127 step_elts
.quick_push (elt
);
8130 /* The scalar inits of the IVs if not vectorized. */
8131 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
8132 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
8134 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
8135 TREE_TYPE (vectype
), elt
);
8136 init_elts
.quick_push (elt
);
8138 /* The number of steps to add to the initial values. */
8139 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
8140 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
8141 ? build_real_from_wide (stept
,
8143 : build_int_cstu (stept
, mul_elt
));
8145 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
8146 vec_steps
.safe_push (vec_step
);
8147 tree step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
8149 step_mul
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8150 step_mul
, peel_mul
);
8152 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
8154 /* Create the induction-phi that defines the induction-operand. */
8155 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
8157 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8158 induc_def
= PHI_RESULT (induction_phi
);
8160 /* Create the iv update inside the loop */
8163 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8164 vec_step
, lupdate_mul
);
8165 gimple_seq stmts
= NULL
;
8166 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8167 vec_def
= gimple_build (&stmts
,
8168 PLUS_EXPR
, step_vectype
, vec_def
, up
);
8169 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8170 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8171 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8175 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
8176 if (!nested_in_vect_loop
8177 && !integer_zerop (step_mul
))
8179 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
8180 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8181 vec_step
, step_mul
);
8182 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
8184 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
8187 /* Set the arguments of the phi node: */
8188 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8190 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
8192 if (!nested_in_vect_loop
)
8194 /* Fill up to the number of vectors we need for the whole group. */
8195 nivs
= least_common_multiple (group_size
,
8196 const_nunits
) / const_nunits
;
8197 for (; ivn
< nivs
; ++ivn
)
8199 SLP_TREE_VEC_STMTS (slp_node
)
8200 .quick_push (SLP_TREE_VEC_STMTS (slp_node
)[0]);
8201 vec_steps
.safe_push (vec_steps
[0]);
8205 /* Re-use IVs when we can. We are generating further vector
8206 stmts by adding VF' * stride to the IVs generated above. */
8210 = least_common_multiple (group_size
, const_nunits
) / group_size
;
8212 = build_vector_from_val (step_vectype
,
8213 SCALAR_FLOAT_TYPE_P (stept
)
8214 ? build_real_from_wide (stept
,
8216 : build_int_cstu (stept
, vfp
));
8217 for (; ivn
< nvects
; ++ivn
)
8219 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
8220 tree def
= gimple_get_lhs (iv
);
8222 vec_steps
[ivn
- nivs
]
8223 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
8224 vec_steps
[ivn
- nivs
], lupdate_mul
);
8225 gimple_seq stmts
= NULL
;
8226 def
= gimple_convert (&stmts
, step_vectype
, def
);
8227 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8228 def
, vec_steps
[ivn
% nivs
]);
8229 def
= gimple_convert (&stmts
, vectype
, def
);
8230 if (gimple_code (iv
) == GIMPLE_PHI
)
8231 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8234 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
8235 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
8237 SLP_TREE_VEC_STMTS (slp_node
)
8238 .quick_push (SSA_NAME_DEF_STMT (def
));
8242 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
8243 gcc_assert (!new_bb
);
8248 init_expr
= PHI_ARG_DEF_FROM_EDGE (phi
,
8249 loop_preheader_edge (iv_loop
));
8251 gimple_seq stmts
= NULL
;
8252 if (!nested_in_vect_loop
)
8254 /* Convert the initial value to the IV update type. */
8255 tree new_type
= TREE_TYPE (step_expr
);
8256 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
8258 /* If we are using the loop mask to "peel" for alignment then we need
8259 to adjust the start value here. */
8260 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
8261 if (skip_niters
!= NULL_TREE
)
8263 if (FLOAT_TYPE_P (vectype
))
8264 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
8267 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
8268 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
8269 skip_niters
, step_expr
);
8270 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
8271 init_expr
, skip_step
);
8277 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8278 gcc_assert (!new_bb
);
8281 /* Create the vector that holds the initial_value of the induction. */
8282 if (nested_in_vect_loop
)
8284 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8285 been created during vectorization of previous stmts. We obtain it
8286 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8287 auto_vec
<tree
> vec_inits
;
8288 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
8289 init_expr
, &vec_inits
);
8290 vec_init
= vec_inits
[0];
8291 /* If the initial value is not of proper type, convert it. */
8292 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
8295 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
8299 build1 (VIEW_CONVERT_EXPR
, vectype
,
8301 vec_init
= gimple_assign_lhs (new_stmt
);
8302 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
8304 gcc_assert (!new_bb
);
8309 /* iv_loop is the loop to be vectorized. Create:
8310 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8312 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
8314 unsigned HOST_WIDE_INT const_nunits
;
8315 if (nunits
.is_constant (&const_nunits
))
8317 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
8318 elts
.quick_push (new_name
);
8319 for (i
= 1; i
< const_nunits
; i
++)
8321 /* Create: new_name_i = new_name + step_expr */
8322 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
8323 new_name
, step_expr
);
8324 elts
.quick_push (new_name
);
8326 /* Create a vector from [new_name_0, new_name_1, ...,
8327 new_name_nunits-1] */
8328 vec_init
= gimple_build_vector (&stmts
, &elts
);
8330 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
8331 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8332 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
8333 new_name
, step_expr
);
8337 [base, base, base, ...]
8338 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8339 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
8340 gcc_assert (flag_associative_math
);
8341 tree index
= build_index_vector (step_vectype
, 0, 1);
8342 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8344 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
8346 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
8347 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
8348 vec_init
, step_vec
);
8349 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
8350 vec_init
, base_vec
);
8352 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
8356 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8357 gcc_assert (!new_bb
);
8362 /* Create the vector that holds the step of the induction. */
8363 if (nested_in_vect_loop
)
8364 /* iv_loop is nested in the loop to be vectorized. Generate:
8365 vec_step = [S, S, S, S] */
8366 new_name
= step_expr
;
8369 /* iv_loop is the loop to be vectorized. Generate:
8370 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8371 gimple_seq seq
= NULL
;
8372 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8374 expr
= build_int_cst (integer_type_node
, vf
);
8375 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8378 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
8379 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8383 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8384 gcc_assert (!new_bb
);
8388 t
= unshare_expr (new_name
);
8389 gcc_assert (CONSTANT_CLASS_P (new_name
)
8390 || TREE_CODE (new_name
) == SSA_NAME
);
8391 new_vec
= build_vector_from_val (step_vectype
, t
);
8392 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8393 new_vec
, step_vectype
, NULL
);
8396 /* Create the following def-use cycle:
8401 vec_iv = PHI <vec_init, vec_loop>
8405 vec_loop = vec_iv + vec_step; */
8407 /* Create the induction-phi that defines the induction-operand. */
8408 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
8409 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
8410 induc_def
= PHI_RESULT (induction_phi
);
8412 /* Create the iv update inside the loop */
8414 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
8415 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8416 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8417 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8418 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8420 /* Set the arguments of the phi node: */
8421 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8422 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
8425 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
8426 *vec_stmt
= induction_phi
;
8428 /* In case that vectorization factor (VF) is bigger than the number
8429 of elements that we can fit in a vectype (nunits), we have to generate
8430 more than one vector stmt - i.e - we need to "unroll" the
8431 vector stmt by a factor VF/nunits. For more details see documentation
8432 in vectorizable_operation. */
8436 gimple_seq seq
= NULL
;
8437 /* FORNOW. This restriction should be relaxed. */
8438 gcc_assert (!nested_in_vect_loop
);
8440 /* Create the vector that holds the step of the induction. */
8441 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
8443 expr
= build_int_cst (integer_type_node
, nunits
);
8444 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
8447 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
8448 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
8452 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
8453 gcc_assert (!new_bb
);
8456 t
= unshare_expr (new_name
);
8457 gcc_assert (CONSTANT_CLASS_P (new_name
)
8458 || TREE_CODE (new_name
) == SSA_NAME
);
8459 new_vec
= build_vector_from_val (step_vectype
, t
);
8460 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8461 new_vec
, step_vectype
, NULL
);
8463 vec_def
= induc_def
;
8464 for (i
= 1; i
< ncopies
; i
++)
8466 /* vec_i = vec_prev + vec_step */
8467 gimple_seq stmts
= NULL
;
8468 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
8469 vec_def
= gimple_build (&stmts
,
8470 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
8471 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
8473 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8474 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
8475 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
8479 if (dump_enabled_p ())
8480 dump_printf_loc (MSG_NOTE
, vect_location
,
8481 "transform induction: created def-use cycle: %G%G",
8482 induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
8487 /* Function vectorizable_live_operation.
8489 STMT_INFO computes a value that is used outside the loop. Check if
8490 it can be supported. */
8493 vectorizable_live_operation (vec_info
*vinfo
,
8494 stmt_vec_info stmt_info
,
8495 gimple_stmt_iterator
*gsi
,
8496 slp_tree slp_node
, slp_instance slp_node_instance
,
8497 int slp_index
, bool vec_stmt_p
,
8498 stmt_vector_for_cost
*cost_vec
)
8500 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
8501 imm_use_iterator imm_iter
;
8502 tree lhs
, lhs_type
, bitsize
;
8503 tree vectype
= (slp_node
8504 ? SLP_TREE_VECTYPE (slp_node
)
8505 : STMT_VINFO_VECTYPE (stmt_info
));
8506 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8509 auto_vec
<tree
> vec_oprnds
;
8511 poly_uint64 vec_index
= 0;
8513 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
8515 /* If a stmt of a reduction is live, vectorize it via
8516 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8517 validity so just trigger the transform here. */
8518 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
8524 /* For reduction chains the meta-info is attached to
8525 the group leader. */
8526 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
8527 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
8528 /* For SLP reductions we vectorize the epilogue for
8529 all involved stmts together. */
8530 else if (slp_index
!= 0)
8533 /* For SLP reductions the meta-info is attached to
8534 the representative. */
8535 stmt_info
= SLP_TREE_REPRESENTATIVE (slp_node
);
8537 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
8538 gcc_assert (reduc_info
->is_reduc_info
);
8539 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
8540 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
8542 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
8547 /* If STMT is not relevant and it is a simple assignment and its inputs are
8548 invariant then it can remain in place, unvectorized. The original last
8549 scalar value that it computes will be used. */
8550 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
8552 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
8553 if (dump_enabled_p ())
8554 dump_printf_loc (MSG_NOTE
, vect_location
,
8555 "statement is simple and uses invariant. Leaving in "
8563 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8567 gcc_assert (slp_index
>= 0);
8569 /* Get the last occurrence of the scalar index from the concatenation of
8570 all the slp vectors. Calculate which slp vector it is and the index
8572 int num_scalar
= SLP_TREE_LANES (slp_node
);
8573 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8574 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
8576 /* Calculate which vector contains the result, and which lane of
8577 that vector we need. */
8578 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
8580 if (dump_enabled_p ())
8581 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8582 "Cannot determine which vector holds the"
8583 " final result.\n");
8590 /* No transformation required. */
8591 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
8593 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
8594 OPTIMIZE_FOR_SPEED
))
8596 if (dump_enabled_p ())
8597 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8598 "can't operate on partial vectors "
8599 "because the target doesn't support extract "
8600 "last reduction.\n");
8601 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8605 if (dump_enabled_p ())
8606 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8607 "can't operate on partial vectors "
8608 "because an SLP statement is live after "
8610 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8612 else if (ncopies
> 1)
8614 if (dump_enabled_p ())
8615 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8616 "can't operate on partial vectors "
8617 "because ncopies is greater than 1.\n");
8618 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
8622 gcc_assert (ncopies
== 1 && !slp_node
);
8623 vect_record_loop_mask (loop_vinfo
,
8624 &LOOP_VINFO_MASKS (loop_vinfo
),
8628 /* ??? Enable for loop costing as well. */
8630 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
8635 /* Use the lhs of the original scalar statement. */
8636 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
8637 if (dump_enabled_p ())
8638 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
8641 lhs
= gimple_get_lhs (stmt
);
8642 lhs_type
= TREE_TYPE (lhs
);
8644 bitsize
= vector_element_bits_tree (vectype
);
8646 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8647 tree vec_lhs
, bitstart
;
8651 gcc_assert (!loop_vinfo
|| !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8653 /* Get the correct slp vectorized stmt. */
8654 vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
];
8655 vec_lhs
= gimple_get_lhs (vec_stmt
);
8657 /* Get entry to use. */
8658 bitstart
= bitsize_int (vec_index
);
8659 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
8663 /* For multiple copies, get the last copy. */
8664 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
8665 vec_lhs
= gimple_get_lhs (vec_stmt
);
8667 /* Get the last lane in the vector. */
8668 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
8673 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8674 requirement, insert one phi node for it. It looks like:
8681 # vec_lhs' = PHI <vec_lhs>
8682 new_tree = lane_extract <vec_lhs', ...>;
8685 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8686 basic_block exit_bb
= single_exit (loop
)->dest
;
8687 gcc_assert (single_pred_p (exit_bb
));
8689 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
8690 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
8691 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, vec_lhs
);
8693 gimple_seq stmts
= NULL
;
8695 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8699 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8701 where VEC_LHS is the vectorized live-out result and MASK is
8702 the loop mask for the final iteration. */
8703 gcc_assert (ncopies
== 1 && !slp_node
);
8704 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
8705 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
8707 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
8710 /* Convert the extracted vector element to the scalar type. */
8711 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
8715 tree bftype
= TREE_TYPE (vectype
);
8716 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8717 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8718 new_tree
= build3 (BIT_FIELD_REF
, bftype
,
8719 vec_lhs_phi
, bitsize
, bitstart
);
8720 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8721 &stmts
, true, NULL_TREE
);
8726 gimple_stmt_iterator exit_gsi
= gsi_after_labels (exit_bb
);
8727 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
8729 /* Remove existing phi from lhs and create one copy from new_tree. */
8730 tree lhs_phi
= NULL_TREE
;
8731 gimple_stmt_iterator gsi
;
8732 for (gsi
= gsi_start_phis (exit_bb
);
8733 !gsi_end_p (gsi
); gsi_next (&gsi
))
8735 gimple
*phi
= gsi_stmt (gsi
);
8736 if ((gimple_phi_arg_def (phi
, 0) == lhs
))
8738 remove_phi_node (&gsi
, false);
8739 lhs_phi
= gimple_phi_result (phi
);
8740 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
8741 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
8747 /* Replace use of lhs with newly computed result. If the use stmt is a
8748 single arg PHI, just replace all uses of PHI result. It's necessary
8749 because lcssa PHI defining lhs may be before newly inserted stmt. */
8750 use_operand_p use_p
;
8751 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8752 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
8753 && !is_gimple_debug (use_stmt
))
8755 if (gimple_code (use_stmt
) == GIMPLE_PHI
8756 && gimple_phi_num_args (use_stmt
) == 1)
8758 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
8762 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8763 SET_USE (use_p
, new_tree
);
8765 update_stmt (use_stmt
);
8770 /* For basic-block vectorization simply insert the lane-extraction. */
8771 tree bftype
= TREE_TYPE (vectype
);
8772 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8773 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8774 tree new_tree
= build3 (BIT_FIELD_REF
, bftype
,
8775 vec_lhs
, bitsize
, bitstart
);
8776 gimple_seq stmts
= NULL
;
8777 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8778 &stmts
, true, NULL_TREE
);
8779 if (TREE_CODE (new_tree
) == SSA_NAME
8780 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs
))
8781 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree
) = 1;
8782 if (is_a
<gphi
*> (vec_stmt
))
8784 gimple_stmt_iterator si
= gsi_after_labels (gimple_bb (vec_stmt
));
8785 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
8789 gimple_stmt_iterator si
= gsi_for_stmt (vec_stmt
);
8790 gsi_insert_seq_after (&si
, stmts
, GSI_SAME_STMT
);
8793 /* Replace use of lhs with newly computed result. If the use stmt is a
8794 single arg PHI, just replace all uses of PHI result. It's necessary
8795 because lcssa PHI defining lhs may be before newly inserted stmt. */
8796 use_operand_p use_p
;
8797 stmt_vec_info use_stmt_info
;
8798 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8799 if (!is_gimple_debug (use_stmt
)
8800 && (!(use_stmt_info
= vinfo
->lookup_stmt (use_stmt
))
8801 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
))))
8803 /* ??? This can happen when the live lane ends up being
8804 used in a vector construction code-generated by an
8805 external SLP node (and code-generation for that already
8806 happened). See gcc.dg/vect/bb-slp-47.c.
8807 Doing this is what would happen if that vector CTOR
8808 were not code-generated yet so it is not too bad.
8809 ??? In fact we'd likely want to avoid this situation
8810 in the first place. */
8811 if (TREE_CODE (new_tree
) == SSA_NAME
8812 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
8813 && gimple_code (use_stmt
) != GIMPLE_PHI
8814 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree
),
8817 enum tree_code code
= gimple_assign_rhs_code (use_stmt
);
8818 gcc_assert (code
== CONSTRUCTOR
8819 || code
== VIEW_CONVERT_EXPR
8820 || CONVERT_EXPR_CODE_P (code
));
8821 if (dump_enabled_p ())
8822 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8823 "Using original scalar computation for "
8824 "live lane because use preceeds vector "
8828 /* ??? It can also happen that we end up pulling a def into
8829 a loop where replacing out-of-loop uses would require
8830 a new LC SSA PHI node. Retain the original scalar in
8831 those cases as well. PR98064. */
8832 if (TREE_CODE (new_tree
) == SSA_NAME
8833 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
8834 && (gimple_bb (use_stmt
)->loop_father
8835 != gimple_bb (vec_stmt
)->loop_father
)
8836 && !flow_loop_nested_p (gimple_bb (vec_stmt
)->loop_father
,
8837 gimple_bb (use_stmt
)->loop_father
))
8839 if (dump_enabled_p ())
8840 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8841 "Using original scalar computation for "
8842 "live lane because there is an out-of-loop "
8843 "definition for it\n");
8846 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8847 SET_USE (use_p
, new_tree
);
8848 update_stmt (use_stmt
);
8855 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8858 vect_loop_kill_debug_uses (class loop
*loop
, stmt_vec_info stmt_info
)
8860 ssa_op_iter op_iter
;
8861 imm_use_iterator imm_iter
;
8862 def_operand_p def_p
;
8865 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
8867 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
8871 if (!is_gimple_debug (ustmt
))
8874 bb
= gimple_bb (ustmt
);
8876 if (!flow_bb_inside_loop_p (loop
, bb
))
8878 if (gimple_debug_bind_p (ustmt
))
8880 if (dump_enabled_p ())
8881 dump_printf_loc (MSG_NOTE
, vect_location
,
8882 "killing debug use\n");
8884 gimple_debug_bind_reset_value (ustmt
);
8885 update_stmt (ustmt
);
8894 /* Given loop represented by LOOP_VINFO, return true if computation of
8895 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8899 loop_niters_no_overflow (loop_vec_info loop_vinfo
)
8901 /* Constant case. */
8902 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8904 tree cst_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
8905 tree cst_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
8907 gcc_assert (TREE_CODE (cst_niters
) == INTEGER_CST
);
8908 gcc_assert (TREE_CODE (cst_nitersm1
) == INTEGER_CST
);
8909 if (wi::to_widest (cst_nitersm1
) < wi::to_widest (cst_niters
))
8914 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8915 /* Check the upper bound of loop niters. */
8916 if (get_max_loop_iterations (loop
, &max
))
8918 tree type
= TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
));
8919 signop sgn
= TYPE_SIGN (type
);
8920 widest_int type_max
= widest_int::from (wi::max_value (type
), sgn
);
8927 /* Return a mask type with half the number of elements as OLD_TYPE,
8928 given that it should have mode NEW_MODE. */
8931 vect_halve_mask_nunits (tree old_type
, machine_mode new_mode
)
8933 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (old_type
), 2);
8934 return build_truth_vector_type_for_mode (nunits
, new_mode
);
8937 /* Return a mask type with twice as many elements as OLD_TYPE,
8938 given that it should have mode NEW_MODE. */
8941 vect_double_mask_nunits (tree old_type
, machine_mode new_mode
)
8943 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (old_type
) * 2;
8944 return build_truth_vector_type_for_mode (nunits
, new_mode
);
8947 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8948 contain a sequence of NVECTORS masks that each control a vector of type
8949 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8950 these vector masks with the vector version of SCALAR_MASK. */
8953 vect_record_loop_mask (loop_vec_info loop_vinfo
, vec_loop_masks
*masks
,
8954 unsigned int nvectors
, tree vectype
, tree scalar_mask
)
8956 gcc_assert (nvectors
!= 0);
8957 if (masks
->length () < nvectors
)
8958 masks
->safe_grow_cleared (nvectors
, true);
8959 rgroup_controls
*rgm
= &(*masks
)[nvectors
- 1];
8960 /* The number of scalars per iteration and the number of vectors are
8961 both compile-time constants. */
8962 unsigned int nscalars_per_iter
8963 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
8964 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
8968 scalar_cond_masked_key
cond (scalar_mask
, nvectors
);
8969 loop_vinfo
->scalar_cond_masked_set
.add (cond
);
8972 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
8974 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
8975 rgm
->type
= truth_type_for (vectype
);
8980 /* Given a complete set of masks MASKS, extract mask number INDEX
8981 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8982 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8984 See the comment above vec_loop_masks for more details about the mask
8988 vect_get_loop_mask (gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
8989 unsigned int nvectors
, tree vectype
, unsigned int index
)
8991 rgroup_controls
*rgm
= &(*masks
)[nvectors
- 1];
8992 tree mask_type
= rgm
->type
;
8994 /* Populate the rgroup's mask array, if this is the first time we've
8996 if (rgm
->controls
.is_empty ())
8998 rgm
->controls
.safe_grow_cleared (nvectors
, true);
8999 for (unsigned int i
= 0; i
< nvectors
; ++i
)
9001 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
9002 /* Provide a dummy definition until the real one is available. */
9003 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
9004 rgm
->controls
[i
] = mask
;
9008 tree mask
= rgm
->controls
[index
];
9009 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
9010 TYPE_VECTOR_SUBPARTS (vectype
)))
9012 /* A loop mask for data type X can be reused for data type Y
9013 if X has N times more elements than Y and if Y's elements
9014 are N times bigger than X's. In this case each sequence
9015 of N elements in the loop mask will be all-zero or all-one.
9016 We can then view-convert the mask so that each sequence of
9017 N elements is replaced by a single element. */
9018 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
9019 TYPE_VECTOR_SUBPARTS (vectype
)));
9020 gimple_seq seq
= NULL
;
9021 mask_type
= truth_type_for (vectype
);
9022 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
9024 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
9029 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9030 lengths for controlling an operation on VECTYPE. The operation splits
9031 each element of VECTYPE into FACTOR separate subelements, measuring the
9032 length as a number of these subelements. */
9035 vect_record_loop_len (loop_vec_info loop_vinfo
, vec_loop_lens
*lens
,
9036 unsigned int nvectors
, tree vectype
, unsigned int factor
)
9038 gcc_assert (nvectors
!= 0);
9039 if (lens
->length () < nvectors
)
9040 lens
->safe_grow_cleared (nvectors
, true);
9041 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
9043 /* The number of scalars per iteration, scalar occupied bytes and
9044 the number of vectors are both compile-time constants. */
9045 unsigned int nscalars_per_iter
9046 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
9047 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
9049 if (rgl
->max_nscalars_per_iter
< nscalars_per_iter
)
9051 /* For now, we only support cases in which all loads and stores fall back
9052 to VnQI or none do. */
9053 gcc_assert (!rgl
->max_nscalars_per_iter
9054 || (rgl
->factor
== 1 && factor
== 1)
9055 || (rgl
->max_nscalars_per_iter
* rgl
->factor
9056 == nscalars_per_iter
* factor
));
9057 rgl
->max_nscalars_per_iter
= nscalars_per_iter
;
9058 rgl
->type
= vectype
;
9059 rgl
->factor
= factor
;
9063 /* Given a complete set of length LENS, extract length number INDEX for an
9064 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9067 vect_get_loop_len (loop_vec_info loop_vinfo
, vec_loop_lens
*lens
,
9068 unsigned int nvectors
, unsigned int index
)
9070 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
9072 /* Populate the rgroup's len array, if this is the first time we've
9074 if (rgl
->controls
.is_empty ())
9076 rgl
->controls
.safe_grow_cleared (nvectors
, true);
9077 for (unsigned int i
= 0; i
< nvectors
; ++i
)
9079 tree len_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
9080 gcc_assert (len_type
!= NULL_TREE
);
9081 tree len
= make_temp_ssa_name (len_type
, NULL
, "loop_len");
9083 /* Provide a dummy definition until the real one is available. */
9084 SSA_NAME_DEF_STMT (len
) = gimple_build_nop ();
9085 rgl
->controls
[i
] = len
;
9089 return rgl
->controls
[index
];
9092 /* Scale profiling counters by estimation for LOOP which is vectorized
9096 scale_profile_for_vect_loop (class loop
*loop
, unsigned vf
)
9098 edge preheader
= loop_preheader_edge (loop
);
9099 /* Reduce loop iterations by the vectorization factor. */
9100 gcov_type new_est_niter
= niter_for_unrolled_loop (loop
, vf
);
9101 profile_count freq_h
= loop
->header
->count
, freq_e
= preheader
->count ();
9103 if (freq_h
.nonzero_p ())
9105 profile_probability p
;
9107 /* Avoid dropping loop body profile counter to 0 because of zero count
9108 in loop's preheader. */
9109 if (!(freq_e
== profile_count::zero ()))
9110 freq_e
= freq_e
.force_nonzero ();
9111 p
= freq_e
.apply_scale (new_est_niter
+ 1, 1).probability_in (freq_h
);
9112 scale_loop_frequencies (loop
, p
);
9115 edge exit_e
= single_exit (loop
);
9116 exit_e
->probability
= profile_probability::always ()
9117 .apply_scale (1, new_est_niter
+ 1);
9119 edge exit_l
= single_pred_edge (loop
->latch
);
9120 profile_probability prob
= exit_l
->probability
;
9121 exit_l
->probability
= exit_e
->probability
.invert ();
9122 if (prob
.initialized_p () && exit_l
->probability
.initialized_p ())
9123 scale_bbs_frequencies (&loop
->latch
, 1, exit_l
->probability
/ prob
);
9126 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9127 latch edge values originally defined by it. */
9130 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo
,
9131 stmt_vec_info def_stmt_info
)
9133 tree def
= gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
9134 if (!def
|| TREE_CODE (def
) != SSA_NAME
)
9136 stmt_vec_info phi_info
;
9137 imm_use_iterator iter
;
9138 use_operand_p use_p
;
9139 FOR_EACH_IMM_USE_FAST (use_p
, iter
, def
)
9140 if (gphi
*phi
= dyn_cast
<gphi
*> (USE_STMT (use_p
)))
9141 if (gimple_bb (phi
)->loop_father
->header
== gimple_bb (phi
)
9142 && (phi_info
= loop_vinfo
->lookup_stmt (phi
))
9143 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info
))
9144 && STMT_VINFO_REDUC_TYPE (phi_info
) != FOLD_LEFT_REDUCTION
9145 && STMT_VINFO_REDUC_TYPE (phi_info
) != EXTRACT_LAST_REDUCTION
)
9147 loop_p loop
= gimple_bb (phi
)->loop_father
;
9148 edge e
= loop_latch_edge (loop
);
9149 if (PHI_ARG_DEF_FROM_EDGE (phi
, e
) == def
)
9151 vec
<gimple
*> &phi_defs
= STMT_VINFO_VEC_STMTS (phi_info
);
9152 vec
<gimple
*> &latch_defs
= STMT_VINFO_VEC_STMTS (def_stmt_info
);
9153 gcc_assert (phi_defs
.length () == latch_defs
.length ());
9154 for (unsigned i
= 0; i
< phi_defs
.length (); ++i
)
9155 add_phi_arg (as_a
<gphi
*> (phi_defs
[i
]),
9156 gimple_get_lhs (latch_defs
[i
]), e
,
9157 gimple_phi_arg_location (phi
, e
->dest_idx
));
9162 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9163 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9167 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
9168 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
9170 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9171 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9173 if (dump_enabled_p ())
9174 dump_printf_loc (MSG_NOTE
, vect_location
,
9175 "------>vectorizing statement: %G", stmt_info
->stmt
);
9177 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
9178 vect_loop_kill_debug_uses (loop
, stmt_info
);
9180 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
9181 && !STMT_VINFO_LIVE_P (stmt_info
))
9184 if (STMT_VINFO_VECTYPE (stmt_info
))
9187 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
9188 if (!STMT_SLP_TYPE (stmt_info
)
9189 && maybe_ne (nunits
, vf
)
9190 && dump_enabled_p ())
9191 /* For SLP VF is set according to unrolling factor, and not
9192 to vector size, hence for SLP this print is not valid. */
9193 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
9196 /* Pure SLP statements have already been vectorized. We still need
9197 to apply loop vectorization to hybrid SLP statements. */
9198 if (PURE_SLP_STMT (stmt_info
))
9201 if (dump_enabled_p ())
9202 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
9204 if (vect_transform_stmt (loop_vinfo
, stmt_info
, gsi
, NULL
, NULL
))
9205 *seen_store
= stmt_info
;
9210 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9211 in the hash_map with its corresponding values. */
9214 find_in_mapping (tree t
, void *context
)
9216 hash_map
<tree
,tree
>* mapping
= (hash_map
<tree
, tree
>*) context
;
9218 tree
*value
= mapping
->get (t
);
9219 return value
? *value
: t
;
9222 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9223 original loop that has now been vectorized.
9225 The inits of the data_references need to be advanced with the number of
9226 iterations of the main loop. This has been computed in vect_do_peeling and
9227 is stored in parameter ADVANCE. We first restore the data_references
9228 initial offset with the values recored in ORIG_DRS_INIT.
9230 Since the loop_vec_info of this EPILOGUE was constructed for the original
9231 loop, its stmt_vec_infos all point to the original statements. These need
9232 to be updated to point to their corresponding copies as well as the SSA_NAMES
9233 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9235 The data_reference's connections also need to be updated. Their
9236 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9237 stmt_vec_infos, their statements need to point to their corresponding copy,
9238 if they are gather loads or scatter stores then their reference needs to be
9239 updated to point to its corresponding copy and finally we set
9240 'base_misaligned' to false as we have already peeled for alignment in the
9241 prologue of the main loop. */
9244 update_epilogue_loop_vinfo (class loop
*epilogue
, tree advance
)
9246 loop_vec_info epilogue_vinfo
= loop_vec_info_for_loop (epilogue
);
9247 auto_vec
<gimple
*> stmt_worklist
;
9248 hash_map
<tree
,tree
> mapping
;
9249 gimple
*orig_stmt
, *new_stmt
;
9250 gimple_stmt_iterator epilogue_gsi
;
9251 gphi_iterator epilogue_phi_gsi
;
9252 stmt_vec_info stmt_vinfo
= NULL
, related_vinfo
;
9253 basic_block
*epilogue_bbs
= get_loop_body (epilogue
);
9256 free (LOOP_VINFO_BBS (epilogue_vinfo
));
9257 LOOP_VINFO_BBS (epilogue_vinfo
) = epilogue_bbs
;
9259 /* Advance data_reference's with the number of iterations of the previous
9260 loop and its prologue. */
9261 vect_update_inits_of_drs (epilogue_vinfo
, advance
, PLUS_EXPR
);
9264 /* The EPILOGUE loop is a copy of the original loop so they share the same
9265 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9266 point to the copied statements. We also create a mapping of all LHS' in
9267 the original loop and all the LHS' in the EPILOGUE and create worklists to
9268 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9269 for (unsigned i
= 0; i
< epilogue
->num_nodes
; ++i
)
9271 for (epilogue_phi_gsi
= gsi_start_phis (epilogue_bbs
[i
]);
9272 !gsi_end_p (epilogue_phi_gsi
); gsi_next (&epilogue_phi_gsi
))
9274 new_stmt
= epilogue_phi_gsi
.phi ();
9276 gcc_assert (gimple_uid (new_stmt
) > 0);
9278 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
9280 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
9281 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
9283 mapping
.put (gimple_phi_result (orig_stmt
),
9284 gimple_phi_result (new_stmt
));
9285 /* PHI nodes can not have patterns or related statements. */
9286 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
) == NULL
9287 && STMT_VINFO_RELATED_STMT (stmt_vinfo
) == NULL
);
9290 for (epilogue_gsi
= gsi_start_bb (epilogue_bbs
[i
]);
9291 !gsi_end_p (epilogue_gsi
); gsi_next (&epilogue_gsi
))
9293 new_stmt
= gsi_stmt (epilogue_gsi
);
9294 if (is_gimple_debug (new_stmt
))
9297 gcc_assert (gimple_uid (new_stmt
) > 0);
9299 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
9301 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
9302 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
9304 if (tree old_lhs
= gimple_get_lhs (orig_stmt
))
9305 mapping
.put (old_lhs
, gimple_get_lhs (new_stmt
));
9307 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
))
9309 gimple_seq seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
);
9310 for (gimple_stmt_iterator gsi
= gsi_start (seq
);
9311 !gsi_end_p (gsi
); gsi_next (&gsi
))
9312 stmt_worklist
.safe_push (gsi_stmt (gsi
));
9315 related_vinfo
= STMT_VINFO_RELATED_STMT (stmt_vinfo
);
9316 if (related_vinfo
!= NULL
&& related_vinfo
!= stmt_vinfo
)
9318 gimple
*stmt
= STMT_VINFO_STMT (related_vinfo
);
9319 stmt_worklist
.safe_push (stmt
);
9320 /* Set BB such that the assert in
9321 'get_initial_def_for_reduction' is able to determine that
9322 the BB of the related stmt is inside this loop. */
9323 gimple_set_bb (stmt
,
9324 gimple_bb (new_stmt
));
9325 related_vinfo
= STMT_VINFO_RELATED_STMT (related_vinfo
);
9326 gcc_assert (related_vinfo
== NULL
9327 || related_vinfo
== stmt_vinfo
);
9332 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9333 using the original main loop and thus need to be updated to refer to the
9334 cloned variables used in the epilogue. */
9335 for (unsigned i
= 0; i
< stmt_worklist
.length (); ++i
)
9337 gimple
*stmt
= stmt_worklist
[i
];
9340 for (unsigned j
= 1; j
< gimple_num_ops (stmt
); ++j
)
9342 tree op
= gimple_op (stmt
, j
);
9343 if ((new_op
= mapping
.get(op
)))
9344 gimple_set_op (stmt
, j
, *new_op
);
9347 /* PR92429: The last argument of simplify_replace_tree disables
9348 folding when replacing arguments. This is required as
9349 otherwise you might end up with different statements than the
9350 ones analyzed in vect_loop_analyze, leading to different
9352 op
= simplify_replace_tree (op
, NULL_TREE
, NULL_TREE
,
9353 &find_in_mapping
, &mapping
, false);
9354 gimple_set_op (stmt
, j
, op
);
9359 struct data_reference
*dr
;
9360 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (epilogue_vinfo
);
9361 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
9363 orig_stmt
= DR_STMT (dr
);
9364 gcc_assert (gimple_uid (orig_stmt
) > 0);
9365 stmt_vinfo
= epilogue_vinfo
->stmt_vec_infos
[gimple_uid (orig_stmt
) - 1];
9366 /* Data references for gather loads and scatter stores do not use the
9367 updated offset we set using ADVANCE. Instead we have to make sure the
9368 reference in the data references point to the corresponding copy of
9369 the original in the epilogue. */
9370 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo
))
9371 == VMAT_GATHER_SCATTER
)
9374 = simplify_replace_tree (DR_REF (dr
), NULL_TREE
, NULL_TREE
,
9375 &find_in_mapping
, &mapping
);
9376 DR_BASE_ADDRESS (dr
)
9377 = simplify_replace_tree (DR_BASE_ADDRESS (dr
), NULL_TREE
, NULL_TREE
,
9378 &find_in_mapping
, &mapping
);
9380 DR_STMT (dr
) = STMT_VINFO_STMT (stmt_vinfo
);
9381 stmt_vinfo
->dr_aux
.stmt
= stmt_vinfo
;
9382 /* The vector size of the epilogue is smaller than that of the main loop
9383 so the alignment is either the same or lower. This means the dr will
9384 thus by definition be aligned. */
9385 STMT_VINFO_DR_INFO (stmt_vinfo
)->base_misaligned
= false;
9388 epilogue_vinfo
->shared
->datarefs_copy
.release ();
9389 epilogue_vinfo
->shared
->save_datarefs ();
9392 /* Function vect_transform_loop.
9394 The analysis phase has determined that the loop is vectorizable.
9395 Vectorize the loop - created vectorized stmts to replace the scalar
9396 stmts in the loop, and update the loop exit condition.
9397 Returns scalar epilogue loop if any. */
9400 vect_transform_loop (loop_vec_info loop_vinfo
, gimple
*loop_vectorized_call
)
9402 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9403 class loop
*epilogue
= NULL
;
9404 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
9405 int nbbs
= loop
->num_nodes
;
9407 tree niters_vector
= NULL_TREE
;
9408 tree step_vector
= NULL_TREE
;
9409 tree niters_vector_mult_vf
= NULL_TREE
;
9410 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9411 unsigned int lowest_vf
= constant_lower_bound (vf
);
9413 bool check_profitability
= false;
9416 DUMP_VECT_SCOPE ("vec_transform_loop");
9418 loop_vinfo
->shared
->check_datarefs ();
9420 /* Use the more conservative vectorization threshold. If the number
9421 of iterations is constant assume the cost check has been performed
9422 by our caller. If the threshold makes all loops profitable that
9423 run at least the (estimated) vectorization factor number of times
9424 checking is pointless, too. */
9425 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
9426 if (vect_apply_runtime_profitability_check_p (loop_vinfo
))
9428 if (dump_enabled_p ())
9429 dump_printf_loc (MSG_NOTE
, vect_location
,
9430 "Profitability threshold is %d loop iterations.\n",
9432 check_profitability
= true;
9435 /* Make sure there exists a single-predecessor exit bb. Do this before
9437 edge e
= single_exit (loop
);
9438 if (! single_pred_p (e
->dest
))
9440 split_loop_exit_edge (e
, true);
9441 if (dump_enabled_p ())
9442 dump_printf (MSG_NOTE
, "split exit edge\n");
9445 /* Version the loop first, if required, so the profitability check
9448 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
9451 = vect_loop_versioning (loop_vinfo
, loop_vectorized_call
);
9452 sloop
->force_vectorize
= false;
9453 check_profitability
= false;
9456 /* Make sure there exists a single-predecessor exit bb also on the
9457 scalar loop copy. Do this after versioning but before peeling
9458 so CFG structure is fine for both scalar and if-converted loop
9459 to make slpeel_duplicate_current_defs_from_edges face matched
9460 loop closed PHI nodes on the exit. */
9461 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
9463 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
9464 if (! single_pred_p (e
->dest
))
9466 split_loop_exit_edge (e
, true);
9467 if (dump_enabled_p ())
9468 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
9472 tree niters
= vect_build_loop_niters (loop_vinfo
);
9473 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
9474 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
9475 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
9477 drs_init_vec orig_drs_init
;
9479 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
9480 &step_vector
, &niters_vector_mult_vf
, th
,
9481 check_profitability
, niters_no_overflow
,
9484 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
)
9485 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
).initialized_p ())
9486 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
),
9487 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
9489 if (niters_vector
== NULL_TREE
)
9491 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
9492 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
9493 && known_eq (lowest_vf
, vf
))
9496 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
9497 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
9498 step_vector
= build_one_cst (TREE_TYPE (niters
));
9500 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
9501 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
9502 &step_vector
, niters_no_overflow
);
9504 /* vect_do_peeling subtracted the number of peeled prologue
9505 iterations from LOOP_VINFO_NITERS. */
9506 vect_gen_vector_loop_niters (loop_vinfo
, LOOP_VINFO_NITERS (loop_vinfo
),
9507 &niters_vector
, &step_vector
,
9508 niters_no_overflow
);
9511 /* 1) Make sure the loop header has exactly two entries
9512 2) Make sure we have a preheader basic block. */
9514 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
9516 split_edge (loop_preheader_edge (loop
));
9518 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
9519 /* This will deal with any possible peeling. */
9520 vect_prepare_for_masked_peels (loop_vinfo
);
9522 /* Schedule the SLP instances first, then handle loop vectorization
9524 if (!loop_vinfo
->slp_instances
.is_empty ())
9526 DUMP_VECT_SCOPE ("scheduling SLP instances");
9527 vect_schedule_slp (loop_vinfo
, LOOP_VINFO_SLP_INSTANCES (loop_vinfo
));
9530 /* FORNOW: the vectorizer supports only loops which body consist
9531 of one basic block (header + empty latch). When the vectorizer will
9532 support more involved loop forms, the order by which the BBs are
9533 traversed need to be reconsidered. */
9535 for (i
= 0; i
< nbbs
; i
++)
9537 basic_block bb
= bbs
[i
];
9538 stmt_vec_info stmt_info
;
9540 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
9543 gphi
*phi
= si
.phi ();
9544 if (dump_enabled_p ())
9545 dump_printf_loc (MSG_NOTE
, vect_location
,
9546 "------>vectorizing phi: %G", phi
);
9547 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
9551 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
9552 vect_loop_kill_debug_uses (loop
, stmt_info
);
9554 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
9555 && !STMT_VINFO_LIVE_P (stmt_info
))
9558 if (STMT_VINFO_VECTYPE (stmt_info
)
9560 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
9561 && dump_enabled_p ())
9562 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
9564 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
9565 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
9566 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
9567 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
9568 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
9569 && ! PURE_SLP_STMT (stmt_info
))
9571 if (dump_enabled_p ())
9572 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
9573 vect_transform_stmt (loop_vinfo
, stmt_info
, NULL
, NULL
, NULL
);
9577 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
9580 gphi
*phi
= si
.phi ();
9581 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
9585 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
9586 && !STMT_VINFO_LIVE_P (stmt_info
))
9589 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
9590 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
9591 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
9592 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
9593 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
9594 && ! PURE_SLP_STMT (stmt_info
))
9595 maybe_set_vectorized_backedge_value (loop_vinfo
, stmt_info
);
9598 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
9601 stmt
= gsi_stmt (si
);
9602 /* During vectorization remove existing clobber stmts. */
9603 if (gimple_clobber_p (stmt
))
9605 unlink_stmt_vdef (stmt
);
9606 gsi_remove (&si
, true);
9607 release_defs (stmt
);
9611 /* Ignore vector stmts created in the outer loop. */
9612 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
9614 /* vector stmts created in the outer-loop during vectorization of
9615 stmts in an inner-loop may not have a stmt_info, and do not
9616 need to be vectorized. */
9617 stmt_vec_info seen_store
= NULL
;
9620 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
9622 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
9623 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
9624 !gsi_end_p (subsi
); gsi_next (&subsi
))
9626 stmt_vec_info pat_stmt_info
9627 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
9628 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
9631 stmt_vec_info pat_stmt_info
9632 = STMT_VINFO_RELATED_STMT (stmt_info
);
9633 if (vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
9635 maybe_set_vectorized_backedge_value (loop_vinfo
,
9640 if (vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
9642 maybe_set_vectorized_backedge_value (loop_vinfo
,
9649 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
9650 /* Interleaving. If IS_STORE is TRUE, the
9651 vectorization of the interleaving chain was
9652 completed - free all the stores in the chain. */
9653 vect_remove_stores (loop_vinfo
,
9654 DR_GROUP_FIRST_ELEMENT (seen_store
));
9656 /* Free the attached stmt_vec_info and remove the stmt. */
9657 loop_vinfo
->remove_stmt (stmt_info
);
9662 /* Stub out scalar statements that must not survive vectorization.
9663 Doing this here helps with grouped statements, or statements that
9664 are involved in patterns. */
9665 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
9666 !gsi_end_p (gsi
); gsi_next (&gsi
))
9668 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
9669 if (call
&& gimple_call_internal_p (call
, IFN_MASK_LOAD
))
9671 tree lhs
= gimple_get_lhs (call
);
9672 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
9674 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
9675 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
9676 gsi_replace (&gsi
, new_stmt
, true);
9682 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9683 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9684 if (integer_onep (step_vector
))
9685 niters_no_overflow
= true;
9686 vect_set_loop_condition (loop
, loop_vinfo
, niters_vector
, step_vector
,
9687 niters_vector_mult_vf
, !niters_no_overflow
);
9689 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
9690 scale_profile_for_vect_loop (loop
, assumed_vf
);
9692 /* True if the final iteration might not handle a full vector's
9693 worth of scalar iterations. */
9694 bool final_iter_may_be_partial
9695 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
);
9696 /* The minimum number of iterations performed by the epilogue. This
9697 is 1 when peeling for gaps because we always need a final scalar
9699 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
9700 /* +1 to convert latch counts to loop iteration counts,
9701 -min_epilogue_iters to remove iterations that cannot be performed
9702 by the vector code. */
9703 int bias_for_lowest
= 1 - min_epilogue_iters
;
9704 int bias_for_assumed
= bias_for_lowest
;
9705 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
9706 if (alignment_npeels
&& LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
9708 /* When the amount of peeling is known at compile time, the first
9709 iteration will have exactly alignment_npeels active elements.
9710 In the worst case it will have at least one. */
9711 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
9712 bias_for_lowest
+= lowest_vf
- min_first_active
;
9713 bias_for_assumed
+= assumed_vf
- min_first_active
;
9715 /* In these calculations the "- 1" converts loop iteration counts
9716 back to latch counts. */
9717 if (loop
->any_upper_bound
)
9718 loop
->nb_iterations_upper_bound
9719 = (final_iter_may_be_partial
9720 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
9722 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
9724 if (loop
->any_likely_upper_bound
)
9725 loop
->nb_iterations_likely_upper_bound
9726 = (final_iter_may_be_partial
9727 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
9728 + bias_for_lowest
, lowest_vf
) - 1
9729 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
9730 + bias_for_lowest
, lowest_vf
) - 1);
9731 if (loop
->any_estimate
)
9732 loop
->nb_iterations_estimate
9733 = (final_iter_may_be_partial
9734 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
9736 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
9739 if (dump_enabled_p ())
9741 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
9743 dump_printf_loc (MSG_NOTE
, vect_location
,
9744 "LOOP VECTORIZED\n");
9746 dump_printf_loc (MSG_NOTE
, vect_location
,
9747 "OUTER LOOP VECTORIZED\n");
9748 dump_printf (MSG_NOTE
, "\n");
9751 dump_printf_loc (MSG_NOTE
, vect_location
,
9752 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9753 GET_MODE_NAME (loop_vinfo
->vector_mode
));
9756 /* Loops vectorized with a variable factor won't benefit from
9757 unrolling/peeling. */
9758 if (!vf
.is_constant ())
9761 if (dump_enabled_p ())
9762 dump_printf_loc (MSG_NOTE
, vect_location
, "Disabling unrolling due to"
9763 " variable-length vectorization factor\n");
9765 /* Free SLP instances here because otherwise stmt reference counting
9767 slp_instance instance
;
9768 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
9769 vect_free_slp_instance (instance
);
9770 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
9771 /* Clear-up safelen field since its value is invalid after vectorization
9772 since vectorized loop can have loop-carried dependencies. */
9777 update_epilogue_loop_vinfo (epilogue
, advance
);
9779 epilogue
->simduid
= loop
->simduid
;
9780 epilogue
->force_vectorize
= loop
->force_vectorize
;
9781 epilogue
->dont_vectorize
= false;
9787 /* The code below is trying to perform simple optimization - revert
9788 if-conversion for masked stores, i.e. if the mask of a store is zero
9789 do not perform it and all stored value producers also if possible.
9797 this transformation will produce the following semi-hammock:
9799 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9801 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9802 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9803 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9804 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9805 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9806 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9811 optimize_mask_stores (class loop
*loop
)
9813 basic_block
*bbs
= get_loop_body (loop
);
9814 unsigned nbbs
= loop
->num_nodes
;
9817 class loop
*bb_loop
;
9818 gimple_stmt_iterator gsi
;
9820 auto_vec
<gimple
*> worklist
;
9821 auto_purge_vect_location sentinel
;
9823 vect_location
= find_loop_location (loop
);
9824 /* Pick up all masked stores in loop if any. */
9825 for (i
= 0; i
< nbbs
; i
++)
9828 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
9831 stmt
= gsi_stmt (gsi
);
9832 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
9833 worklist
.safe_push (stmt
);
9838 if (worklist
.is_empty ())
9841 /* Loop has masked stores. */
9842 while (!worklist
.is_empty ())
9844 gimple
*last
, *last_store
;
9847 basic_block store_bb
, join_bb
;
9848 gimple_stmt_iterator gsi_to
;
9849 tree vdef
, new_vdef
;
9854 last
= worklist
.pop ();
9855 mask
= gimple_call_arg (last
, 2);
9856 bb
= gimple_bb (last
);
9857 /* Create then_bb and if-then structure in CFG, then_bb belongs to
9858 the same loop as if_bb. It could be different to LOOP when two
9859 level loop-nest is vectorized and mask_store belongs to the inner
9861 e
= split_block (bb
, last
);
9862 bb_loop
= bb
->loop_father
;
9863 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
9865 store_bb
= create_empty_bb (bb
);
9866 add_bb_to_loop (store_bb
, bb_loop
);
9867 e
->flags
= EDGE_TRUE_VALUE
;
9868 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
9869 /* Put STORE_BB to likely part. */
9870 efalse
->probability
= profile_probability::unlikely ();
9871 store_bb
->count
= efalse
->count ();
9872 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
9873 if (dom_info_available_p (CDI_DOMINATORS
))
9874 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
9875 if (dump_enabled_p ())
9876 dump_printf_loc (MSG_NOTE
, vect_location
,
9877 "Create new block %d to sink mask stores.",
9879 /* Create vector comparison with boolean result. */
9880 vectype
= TREE_TYPE (mask
);
9881 zero
= build_zero_cst (vectype
);
9882 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
9883 gsi
= gsi_last_bb (bb
);
9884 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
9885 /* Create new PHI node for vdef of the last masked store:
9886 .MEM_2 = VDEF <.MEM_1>
9887 will be converted to
9888 .MEM.3 = VDEF <.MEM_1>
9889 and new PHI node will be created in join bb
9890 .MEM_2 = PHI <.MEM_1, .MEM_3>
9892 vdef
= gimple_vdef (last
);
9893 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
9894 gimple_set_vdef (last
, new_vdef
);
9895 phi
= create_phi_node (vdef
, join_bb
);
9896 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
9898 /* Put all masked stores with the same mask to STORE_BB if possible. */
9901 gimple_stmt_iterator gsi_from
;
9902 gimple
*stmt1
= NULL
;
9904 /* Move masked store to STORE_BB. */
9906 gsi
= gsi_for_stmt (last
);
9908 /* Shift GSI to the previous stmt for further traversal. */
9910 gsi_to
= gsi_start_bb (store_bb
);
9911 gsi_move_before (&gsi_from
, &gsi_to
);
9912 /* Setup GSI_TO to the non-empty block start. */
9913 gsi_to
= gsi_start_bb (store_bb
);
9914 if (dump_enabled_p ())
9915 dump_printf_loc (MSG_NOTE
, vect_location
,
9916 "Move stmt to created bb\n%G", last
);
9917 /* Move all stored value producers if possible. */
9918 while (!gsi_end_p (gsi
))
9921 imm_use_iterator imm_iter
;
9922 use_operand_p use_p
;
9925 /* Skip debug statements. */
9926 if (is_gimple_debug (gsi_stmt (gsi
)))
9931 stmt1
= gsi_stmt (gsi
);
9932 /* Do not consider statements writing to memory or having
9933 volatile operand. */
9934 if (gimple_vdef (stmt1
)
9935 || gimple_has_volatile_ops (stmt1
))
9939 lhs
= gimple_get_lhs (stmt1
);
9943 /* LHS of vectorized stmt must be SSA_NAME. */
9944 if (TREE_CODE (lhs
) != SSA_NAME
)
9947 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
9949 /* Remove dead scalar statement. */
9950 if (has_zero_uses (lhs
))
9952 gsi_remove (&gsi_from
, true);
9957 /* Check that LHS does not have uses outside of STORE_BB. */
9959 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
9962 use_stmt
= USE_STMT (use_p
);
9963 if (is_gimple_debug (use_stmt
))
9965 if (gimple_bb (use_stmt
) != store_bb
)
9974 if (gimple_vuse (stmt1
)
9975 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
9978 /* Can move STMT1 to STORE_BB. */
9979 if (dump_enabled_p ())
9980 dump_printf_loc (MSG_NOTE
, vect_location
,
9981 "Move stmt to created bb\n%G", stmt1
);
9982 gsi_move_before (&gsi_from
, &gsi_to
);
9983 /* Shift GSI_TO for further insertion. */
9986 /* Put other masked stores with the same mask to STORE_BB. */
9987 if (worklist
.is_empty ()
9988 || gimple_call_arg (worklist
.last (), 2) != mask
9989 || worklist
.last () != stmt1
)
9991 last
= worklist
.pop ();
9993 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);
9997 /* Decide whether it is possible to use a zero-based induction variable
9998 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9999 the value that the induction variable must be able to hold in order
10000 to ensure that the rgroups eventually have no active vector elements.
10001 Return -1 otherwise. */
10004 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo
)
10006 tree niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
10007 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10008 unsigned HOST_WIDE_INT max_vf
= vect_max_vf (loop_vinfo
);
10010 /* Calculate the value that the induction variable must be able
10011 to hit in order to ensure that we end the loop with an all-false mask.
10012 This involves adding the maximum number of inactive trailing scalar
10014 widest_int iv_limit
= -1;
10015 if (max_loop_iterations (loop
, &iv_limit
))
10019 /* Add the maximum number of skipped iterations to the
10020 maximum iteration count. */
10021 if (TREE_CODE (niters_skip
) == INTEGER_CST
)
10022 iv_limit
+= wi::to_widest (niters_skip
);
10024 iv_limit
+= max_vf
- 1;
10026 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
10027 /* Make a conservatively-correct assumption. */
10028 iv_limit
+= max_vf
- 1;
10030 /* IV_LIMIT is the maximum number of latch iterations, which is also
10031 the maximum in-range IV value. Round this value down to the previous
10032 vector alignment boundary and then add an extra full iteration. */
10033 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
10034 iv_limit
= (iv_limit
& -(int) known_alignment (vf
)) + max_vf
;
10039 /* For the given rgroup_controls RGC, check whether an induction variable
10040 would ever hit a value that produces a set of all-false masks or zero
10041 lengths before wrapping around. Return true if it's possible to wrap
10042 around before hitting the desirable value, otherwise return false. */
10045 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo
, rgroup_controls
*rgc
)
10047 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
10049 if (iv_limit
== -1)
10052 tree compare_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
10053 unsigned int compare_precision
= TYPE_PRECISION (compare_type
);
10054 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
10056 if (wi::min_precision (iv_limit
* nitems
, UNSIGNED
) > compare_precision
)