[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it had been manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
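
   For instance (a purely illustrative example), in

     for (i=0; i<N; i++)
       a[i] = *(p + i) + a[i];

   both 'a[i]' (an ARRAY_REF) and '*(p + i)' (a pointer access) are
   consecutive data-refs in the above sense, whereas an access such as
   'a[2*i]' does not have the simple access pattern described here.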
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
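
   For example, a hypothetical helper along those lines (an illustrative
   sketch only, not part of this file's API; CODE_FOR_nothing means the
   target has no instruction for the given optab/mode pair) could look like:

     static bool
     example_target_supports_vector_add_p (machine_mode vec_mode)
     {
       return optab_handler (add_optab, vec_mode) != CODE_FOR_nothing;
     }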
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
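
   As a concrete illustration (a sketch in plain GNU C, assuming 4-byte
   ints and a 16-byte vector size, hence VF == 4; the real transformation
   works on GIMPLE, and the second loop below is the scalar epilogue that
   handles the N % VF leftover iterations):

     typedef int v4si __attribute__ ((vector_size (16)));

     void
     example_strip_mined (int *a, int *b, int *c, int n)
     {
       int i;
       for (i = 0; i + 4 <= n; i += 4)
         {
           v4si vb = *(v4si *) &b[i];
           v4si vc = *(v4si *) &c[i];
           *(v4si *) &a[i] = vb + vc;
         }
       for (; i < n; i++)
         a[i] = b[i] + c[i];
     }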
276 */
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
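
/* For example (an illustrative case, not taken from the testsuite): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of P in the loop is the chrec {p_0, +, 4}_1, whose
   evolution part is the INTEGER_CST 4 and whose initial condition is p_0,
   so the evolution is "simple" in the sense checked below. */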
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
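
/* At the source level, such a double reduction typically comes from a
   summation over a loop nest, e.g. (an illustrative example only):

     sum = 0;
     for (i = 0; i < n; i++)         // outer1/outer2 blocks above
       for (j = 0; j < m; j++)       // inner block above
         sum += a[i][j];

   where x_1, x_2, x_3 and x_4 are the SSA versions of SUM around the
   two loops. */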
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner-loop, if it exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such an inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 {
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
672 {
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
683 {
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
689 }
690 }
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
694 else
695 {
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
699 {
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
705 }
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
711 }
712 }
713 }
714
715 /* Function vect_get_loop_niters.
716
717 Determine how many iterations the loop is executed and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
721
722 Return the loop exit condition. */
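
/* For instance, for a simple counted loop such as

     for (i = 0; i < 100; i++) ...

   the latch runs 99 times, so NUMBER_OF_ITERATIONSM1 is 99 and
   NUMBER_OF_ITERATIONS (the number of header executions) is 100. */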
723
724
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
728 {
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
733
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
738
739 if (!exit)
740 return cond;
741
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
746
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
750
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
753
754 if (may_be_zero)
755 {
756 if (COMPARISON_CLASS_P (may_be_zero))
757 {
758 /* Try to combine may_be_zero with assumptions, this can simplify
759 computation of niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
770
771 may_be_zero = NULL_TREE;
772 }
773 else if (integer_nonzerop (may_be_zero))
774 {
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
778 }
779 else
780 return cond;
781 }
782
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
785
786 /* We want the number of loop header executions, which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
794
795 return cond;
796 }
797
798 /* Function bb_in_loop_p
799
800 Used as predicate for dfs order traversal of the loop bbs. */
801
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
804 {
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
809 }
810
811
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
814
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
850 {
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
855
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
859
860 for (unsigned int i = 0; i < nbbs; i++)
861 {
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
864
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
866 {
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
870 }
871
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
873 {
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition: when it is 0,
881 the loop shouldn't be vectorized; when it is a non-zero constant, it
882 should be vectorized normally; otherwise the loop is versioned, with
883 the vectorized copy used when the condition is non-zero at runtime. */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
892 {
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
898 }
899 }
900 }
901
902 epilogue_vinfos.create (6);
903 }
904
905 /* Free all levels of rgroup CONTROLS. */
906
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
909 {
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
915 }
916
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
919
920 _loop_vec_info::~_loop_vec_info ()
921 {
922 free (bbs);
923
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
929
930 loop->aux = NULL;
931 }
932
933 /* Return an invariant or register for EXPR and emit necessary
934 computations in the LOOP_VINFO loop preheader. */
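
/* For example (illustrative, with hypothetical SSA names): given EXPR
   n_5 + 7, this emits _t = n_5 + 7 on the preheader edge and returns _t,
   and a later call with an equal expression returns the same cached SSA
   name instead of emitting the computation again. */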
935
936 tree
937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
938 {
939 if (is_gimple_reg (expr)
940 || is_gimple_min_invariant (expr))
941 return expr;
942
943 if (! loop_vinfo->ivexpr_map)
944 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
945 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
946 if (! cached)
947 {
948 gimple_seq stmts = NULL;
949 cached = force_gimple_operand (unshare_expr (expr),
950 &stmts, true, NULL_TREE);
951 if (stmts)
952 {
953 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
954 gsi_insert_seq_on_edge_immediate (e, stmts);
955 }
956 }
957 return cached;
958 }
959
960 /* Return true if we can use CMP_TYPE as the comparison type to produce
961 all masks required to mask LOOP_VINFO. */
962
963 static bool
964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
965 {
966 rgroup_controls *rgm;
967 unsigned int i;
968 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
969 if (rgm->type != NULL_TREE
970 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
971 cmp_type, rgm->type,
972 OPTIMIZE_FOR_SPEED))
973 return false;
974 return true;
975 }
976
977 /* Calculate the maximum number of scalars per iteration across all
978 rgroups in LOOP_VINFO. */
979
980 static unsigned int
981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
982 {
983 unsigned int res = 1;
984 unsigned int i;
985 rgroup_controls *rgm;
986 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
987 res = MAX (res, rgm->max_nscalars_per_iter);
988 return res;
989 }
990
991 /* Calculate the minimum precision necessary to represent:
992
993 MAX_NITERS * FACTOR
994
995 as an unsigned integer, where MAX_NITERS is the maximum number of
996 loop header iterations for the original scalar form of LOOP_VINFO. */
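
/* For example (purely illustrative numbers): if MAX_NITERS is known to be
   at most 1000 and FACTOR is 2, the product is 2000, which needs 11 bits
   as an unsigned value (2^11 == 2048 > 2000). */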
997
998 static unsigned
999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1000 {
1001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1002
1003 /* Get the maximum number of iterations that is representable
1004 in the counter type. */
1005 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1006 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1007
1008 /* Get a more refined estimate for the number of iterations. */
1009 widest_int max_back_edges;
1010 if (max_loop_iterations (loop, &max_back_edges))
1011 max_ni = wi::smin (max_ni, max_back_edges + 1);
1012
1013 /* Work out how many bits we need to represent the limit. */
1014 return wi::min_precision (max_ni * factor, UNSIGNED);
1015 }
1016
1017 /* True if the loop needs peeling or partial vectors when vectorized. */
1018
1019 static bool
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1021 {
1022 unsigned HOST_WIDE_INT const_vf;
1023 HOST_WIDE_INT max_niter
1024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1025
1026 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1027 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1028 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1029 (loop_vinfo));
1030
1031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1033 {
1034 /* Work out the (constant) number of iterations that need to be
1035 peeled for reasons other than niters. */
1036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1038 peel_niter += 1;
1039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1041 return true;
1042 }
1043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1044 /* ??? When peeling for gaps but not alignment, we could
1045 try to check whether the (variable) niters is known to be
1046 VF * N + 1. That's something of a niche case though. */
1047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1050 < (unsigned) exact_log2 (const_vf))
1051 /* In case of versioning, check if the maximum number of
1052 iterations is greater than th. If they are identical,
1053 the epilogue is unnecessary. */
1054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1055 || ((unsigned HOST_WIDE_INT) max_niter
1056 > (th / const_vf) * const_vf))))
1057 return true;
1058
1059 return false;
1060 }
1061
1062 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1063 whether we can actually generate the masks required. Return true if so,
1064 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
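
/* Conceptually, full masking turns

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   into something like the following sketch (WHILE_ULT produces a mask
   whose lane K is active iff i + K < n; MASK_LOAD/MASK_STORE only touch
   the active lanes, and the real internal-fn calls take more arguments):

     for (i = 0; i < n; i += VF)
       {
         mask = WHILE_ULT (i, n);
         va = MASK_LOAD (&b[i], mask) + MASK_LOAD (&c[i], mask);
         MASK_STORE (&a[i], mask, va);
       }

   so no scalar epilogue is needed; the comparison type chosen below is
   the scalar type used for I and N in the WHILE_ULT above. */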
1065
1066 static bool
1067 vect_verify_full_masking (loop_vec_info loop_vinfo)
1068 {
1069 unsigned int min_ni_width;
1070 unsigned int max_nscalars_per_iter
1071 = vect_get_max_nscalars_per_iter (loop_vinfo);
1072
1073 /* Use a normal loop if there are no statements that need masking.
1074 This only happens in rare degenerate cases: it means that the loop
1075 has no loads, no stores, and no live-out values. */
1076 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1077 return false;
1078
1079 /* Work out how many bits we need to represent the limit. */
1080 min_ni_width
1081 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1082
1083 /* Find a scalar mode for which WHILE_ULT is supported. */
1084 opt_scalar_int_mode cmp_mode_iter;
1085 tree cmp_type = NULL_TREE;
1086 tree iv_type = NULL_TREE;
1087 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1088 unsigned int iv_precision = UINT_MAX;
1089
1090 if (iv_limit != -1)
1091 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1092 UNSIGNED);
1093
1094 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1095 {
1096 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1097 if (cmp_bits >= min_ni_width
1098 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1099 {
1100 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1101 if (this_type
1102 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1103 {
1104 /* Although we could stop as soon as we find a valid mode,
1105 there are at least two reasons why that's not always the
1106 best choice:
1107
1108 - An IV that's Pmode or wider is more likely to be reusable
1109 in address calculations than an IV that's narrower than
1110 Pmode.
1111
1112 - Doing the comparison in IV_PRECISION or wider allows
1113 a natural 0-based IV, whereas using a narrower comparison
1114 type requires mitigations against wrap-around.
1115
1116 Conversely, if the IV limit is variable, doing the comparison
1117 in a wider type than the original type can introduce
1118 unnecessary extensions, so picking the widest valid mode
1119 is not always a good choice either.
1120
1121 Here we prefer the first IV type that's Pmode or wider,
1122 and the first comparison type that's IV_PRECISION or wider.
1123 (The comparison type must be no wider than the IV type,
1124 to avoid extensions in the vector loop.)
1125
1126 ??? We might want to try continuing beyond Pmode for ILP32
1127 targets if CMP_BITS < IV_PRECISION. */
1128 iv_type = this_type;
1129 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1130 cmp_type = this_type;
1131 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1132 break;
1133 }
1134 }
1135 }
1136
1137 if (!cmp_type)
1138 return false;
1139
1140 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1141 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1142 return true;
1143 }
1144
1145 /* Check whether we can use vector access with length based on precision
1146 comparison. So far, to keep it simple, we only allow the case that the
1147 precision of the target supported length is larger than the precision
1148 required by loop niters. */
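
/* Conceptually, length-based partial vectors handle a loop such as
   a[i] = b[i] + c[i] with a per-iteration length instead of a mask,
   roughly (a sketch only; the real IFN_LEN_* calls take additional
   arguments):

     for (i = 0; i < n; i += VF)
       {
         len = MIN (n - i, VF);
         LEN_STORE (&a[i], len, LEN_LOAD (&b[i], len) + LEN_LOAD (&c[i], len));
       }

   again avoiding a scalar epilogue. */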
1149
1150 static bool
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1152 {
1153 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1154 return false;
1155
1156 unsigned int max_nitems_per_iter = 1;
1157 unsigned int i;
1158 rgroup_controls *rgl;
1159 /* Find the maximum number of items per iteration for every rgroup. */
1160 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1161 {
1162 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1163 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1164 }
1165
1166 /* Work out how many bits we need to represent the length limit. */
1167 unsigned int min_ni_prec
1168 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1169
1170 /* Now use the maximum of the precisions below for one suitable IV type:
1171 - the IV's natural precision
1172 - the precision needed to hold: the maximum number of scalar
1173 iterations multiplied by the scale factor (min_ni_prec above)
1174 - the Pmode precision
1175
1176 If min_ni_prec is less than the precision of the current niters,
1177 we prefer to still use the niters type. Prefer to use Pmode and
1178 wider IV to avoid narrow conversions. */
1179
1180 unsigned int ni_prec
1181 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1182 min_ni_prec = MAX (min_ni_prec, ni_prec);
1183 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1184
1185 tree iv_type = NULL_TREE;
1186 opt_scalar_int_mode tmode_iter;
1187 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1188 {
1189 scalar_mode tmode = tmode_iter.require ();
1190 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1191
1192 /* ??? Do we really want to construct one IV whose precision exceeds
1193 BITS_PER_WORD? */
1194 if (tbits > BITS_PER_WORD)
1195 break;
1196
1197 /* Find the first available standard integral type. */
1198 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1199 {
1200 iv_type = build_nonstandard_integer_type (tbits, true);
1201 break;
1202 }
1203 }
1204
1205 if (!iv_type)
1206 {
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "can't vectorize with length-based partial vectors"
1210 " because there is no suitable iv type.\n");
1211 return false;
1212 }
1213
1214 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1215 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1216
1217 return true;
1218 }
1219
1220 /* Calculate the cost of one scalar iteration of the loop. */
1221 static void
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1223 {
1224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1225 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1226 int nbbs = loop->num_nodes, factor;
1227 int innerloop_iters, i;
1228
1229 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1230
1231 /* Gather costs for statements in the scalar loop. */
1232
1233 /* FORNOW. */
1234 innerloop_iters = 1;
1235 if (loop->inner)
1236 innerloop_iters = 50; /* FIXME */
1237
1238 for (i = 0; i < nbbs; i++)
1239 {
1240 gimple_stmt_iterator si;
1241 basic_block bb = bbs[i];
1242
1243 if (bb->loop_father == loop->inner)
1244 factor = innerloop_iters;
1245 else
1246 factor = 1;
1247
1248 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1249 {
1250 gimple *stmt = gsi_stmt (si);
1251 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1252
1253 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254 continue;
1255
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1258 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1259 && (!STMT_VINFO_LIVE_P (vstmt_info)
1260 || !VECTORIZABLE_CYCLE_DEF
1261 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1262 continue;
1263
1264 vect_cost_for_stmt kind;
1265 if (STMT_VINFO_DATA_REF (stmt_info))
1266 {
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1268 kind = scalar_load;
1269 else
1270 kind = scalar_store;
1271 }
1272 else if (vect_nop_conversion_p (stmt_info))
1273 continue;
1274 else
1275 kind = scalar_stmt;
1276
1277 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 factor, kind, stmt_info, 0, vect_prologue);
1279 }
1280 }
1281
1282 /* Now accumulate cost. */
1283 void *target_cost_data = init_cost (loop);
1284 stmt_info_for_cost *si;
1285 int j;
1286 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1287 j, si)
1288 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1289 si->kind, si->stmt_info, si->vectype,
1290 si->misalign, vect_body);
1291 unsigned dummy, body_cost = 0;
1292 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1293 destroy_cost_data (target_cost_data);
1294 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1295 }
1296
1297
1298 /* Function vect_analyze_loop_form_1.
1299
1300 Verify that certain CFG restrictions hold, including:
1301 - the loop has a pre-header
1302 - the loop has a single entry and exit
1303 - the loop exit condition is simple enough
1304 - the number of iterations can be analyzed, i.e., a countable loop. The
1305 niter could be analyzed under some assumptions. */
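
/* For example (illustrative only): after if-conversion, a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] ? b[i] : 1;

   satisfies these restrictions, whereas

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   does not, because the early break gives the loop two exits. */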
1306
1307 opt_result
1308 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1309 tree *assumptions, tree *number_of_iterationsm1,
1310 tree *number_of_iterations, gcond **inner_loop_cond)
1311 {
1312 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1313
1314 /* Different restrictions apply when we are considering an inner-most loop,
1315 vs. an outer (nested) loop.
1316 (FORNOW. May want to relax some of these restrictions in the future). */
1317
1318 if (!loop->inner)
1319 {
1320 /* Inner-most loop. We currently require that the number of BBs is
1321 exactly 2 (the header and latch). Vectorizable inner-most loops
1322 look like this:
1323
1324 (pre-header)
1325 |
1326 header <--------+
1327 | | |
1328 | +--> latch --+
1329 |
1330 (exit-bb) */
1331
1332 if (loop->num_nodes != 2)
1333 return opt_result::failure_at (vect_location,
1334 "not vectorized:"
1335 " control flow in loop.\n");
1336
1337 if (empty_block_p (loop->header))
1338 return opt_result::failure_at (vect_location,
1339 "not vectorized: empty loop.\n");
1340 }
1341 else
1342 {
1343 class loop *innerloop = loop->inner;
1344 edge entryedge;
1345
1346 /* Nested loop. We currently require that the loop is doubly-nested,
1347 contains a single inner loop, and the number of BBs is exactly 5.
1348 Vectorizable outer-loops look like this:
1349
1350 (pre-header)
1351 |
1352 header <---+
1353 | |
1354 inner-loop |
1355 | |
1356 tail ------+
1357 |
1358 (exit-bb)
1359
1360 The inner-loop has the properties expected of inner-most loops
1361 as described above. */
1362
1363 if ((loop->inner)->inner || (loop->inner)->next)
1364 return opt_result::failure_at (vect_location,
1365 "not vectorized:"
1366 " multiple nested loops.\n");
1367
1368 if (loop->num_nodes != 5)
1369 return opt_result::failure_at (vect_location,
1370 "not vectorized:"
1371 " control flow in loop.\n");
1372
1373 entryedge = loop_preheader_edge (innerloop);
1374 if (entryedge->src != loop->header
1375 || !single_exit (innerloop)
1376 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " unsupported outerloop form.\n");
1380
1381 /* Analyze the inner-loop. */
1382 tree inner_niterm1, inner_niter, inner_assumptions;
1383 opt_result res
1384 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1385 &inner_assumptions, &inner_niterm1,
1386 &inner_niter, NULL);
1387 if (!res)
1388 {
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391 "not vectorized: Bad inner loop.\n");
1392 return res;
1393 }
1394
1395 /* Don't support analyzing niter under assumptions for inner
1396 loop. */
1397 if (!integer_onep (inner_assumptions))
1398 return opt_result::failure_at (vect_location,
1399 "not vectorized: Bad inner loop.\n");
1400
1401 if (!expr_invariant_in_loop_p (loop, inner_niter))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: inner-loop count not"
1404 " invariant.\n");
1405
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Considering outer-loop vectorization.\n");
1409 }
1410
1411 if (!single_exit (loop))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: multiple exits.\n");
1414 if (EDGE_COUNT (loop->header->preds) != 2)
1415 return opt_result::failure_at (vect_location,
1416 "not vectorized:"
1417 " too many incoming edges.\n");
1418
1419 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1420 that the loop is represented as a do-while (with a proper if-guard
1421 before the loop if needed), where the loop header contains all the
1422 executable statements, and the latch is empty. */
1423 if (!empty_block_p (loop->latch)
1424 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized: latch block not empty.\n");
1427
1428 /* Make sure the exit is not abnormal. */
1429 edge e = single_exit (loop);
1430 if (e->flags & EDGE_ABNORMAL)
1431 return opt_result::failure_at (vect_location,
1432 "not vectorized:"
1433 " abnormal loop exit edge.\n");
1434
1435 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1436 number_of_iterationsm1);
1437 if (!*loop_cond)
1438 return opt_result::failure_at
1439 (vect_location,
1440 "not vectorized: complicated exit condition.\n");
1441
1442 if (integer_zerop (*assumptions)
1443 || !*number_of_iterations
1444 || chrec_contains_undetermined (*number_of_iterations))
1445 return opt_result::failure_at
1446 (*loop_cond,
1447 "not vectorized: number of iterations cannot be computed.\n");
1448
1449 if (integer_zerop (*number_of_iterations))
1450 return opt_result::failure_at
1451 (*loop_cond,
1452 "not vectorized: number of iterations = 0.\n");
1453
1454 return opt_result::success ();
1455 }
1456
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1458
1459 opt_loop_vec_info
1460 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1461 {
1462 tree assumptions, number_of_iterations, number_of_iterationsm1;
1463 gcond *loop_cond, *inner_loop_cond = NULL;
1464
1465 opt_result res
1466 = vect_analyze_loop_form_1 (loop, &loop_cond,
1467 &assumptions, &number_of_iterationsm1,
1468 &number_of_iterations, &inner_loop_cond);
1469 if (!res)
1470 return opt_loop_vec_info::propagate_failure (res);
1471
1472 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1473 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1474 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1475 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1476 if (!integer_onep (assumptions))
1477 {
1478 /* We consider vectorizing this loop by versioning it under
1479 some assumptions. In order to do this, we need to clear
1480 existing information computed by scev and niter analyzer. */
1481 scev_reset_htab ();
1482 free_numbers_of_iterations_estimates (loop);
1483 /* Also set flag for this loop so that following scev and niter
1484 analysis are done under the assumptions. */
1485 loop_constraint_set (loop, LOOP_C_FINITE);
1486 /* Also record the assumptions for versioning. */
1487 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1488 }
1489
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1491 {
1492 if (dump_enabled_p ())
1493 {
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497 dump_printf (MSG_NOTE, "\n");
1498 }
1499 }
1500
1501 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1502 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503 if (inner_loop_cond)
1504 {
1505 stmt_vec_info inner_loop_cond_info
1506 = loop_vinfo->lookup_stmt (inner_loop_cond);
1507 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508 }
1509
1510 gcc_assert (!loop->aux);
1511 loop->aux = loop_vinfo;
1512 return opt_loop_vec_info::success (loop_vinfo);
1513 }
1514
1515
1516
1517 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1518 statements, update the vectorization factor. */
1519
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1522 {
1523 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 poly_uint64 vectorization_factor;
1527 int i;
1528
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1530
1531 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532 gcc_assert (known_ne (vectorization_factor, 0U));
1533
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say that we
1537 perform pure SLP on the loop - cross iteration parallelism is not
1538 exploited. */
1539 bool only_slp_in_loop = true;
1540 for (i = 0; i < nbbs; i++)
1541 {
1542 basic_block bb = bbs[i];
1543 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544 gsi_next (&si))
1545 {
1546 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547 if (!stmt_info)
1548 continue;
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551 && !PURE_SLP_STMT (stmt_info))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop = false;
1554 }
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1557 {
1558 if (is_gimple_debug (gsi_stmt (si)))
1559 continue;
1560 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561 stmt_info = vect_stmt_to_vectorize (stmt_info);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564 && !PURE_SLP_STMT (stmt_info))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop = false;
1567 }
1568 }
1569
1570 if (only_slp_in_loop)
1571 {
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1576 }
1577 else
1578 {
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1588 }
1589
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591 if (dump_enabled_p ())
1592 {
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE, vectorization_factor);
1596 dump_printf (MSG_NOTE, ".\n");
1597 }
1598 }
1599
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1603
1604 outer1:
1605 x_1 = PHI <x_3(outer2), ...>;
1606 ...
1607
1608 inner:
1609 x_2 = ...;
1610 ...
1611
1612 outer2:
1613 x_3 = PHI <x_2(inner)>;
1614
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1616
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1619 {
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621 return false;
1622
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1624 }
1625
1626 /* Function vect_analyze_loop_operations.
1627
1628 Scan the loop stmts and make sure they are all vectorizable. */
1629
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1632 {
1633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1640
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1642
1643 auto_vec<stmt_info_for_cost> cost_vec;
1644
1645 for (i = 0; i < nbbs; i++)
1646 {
1647 basic_block bb = bbs[i];
1648
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1651 {
1652 gphi *phi = si.phi ();
1653 ok = true;
1654
1655 stmt_info = loop_vinfo->lookup_stmt (phi);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1660
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1664 {
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outerloop (unless it is double reduction,
1667 i.e., this phi is vect_reduction_def), because this case
1668 would require actually doing something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && !vect_active_double_reduction_p (stmt_info))
1671 return opt_result::failure_at (phi,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1674
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info))
1678 {
1679 tree phi_op;
1680
1681 if (gimple_phi_num_args (phi) != 1)
1682 return opt_result::failure_at (phi, "unsupported phi");
1683
1684 phi_op = PHI_ARG_DEF (phi, 0);
1685 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686 if (!op_def_info)
1687 return opt_result::failure_at (phi, "unsupported phi\n");
1688
1689 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info)
1691 != vect_used_in_outer_by_reduction))
1692 return opt_result::failure_at (phi, "unsupported phi\n");
1693
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info)
1696 == vect_double_reduction_def))
1697 && !vectorizable_lc_phi (loop_vinfo,
1698 stmt_info, NULL, NULL))
1699 return opt_result::failure_at (phi, "unsupported phi\n");
1700 }
1701
1702 continue;
1703 }
1704
1705 gcc_assert (stmt_info);
1706
1707 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info))
1709 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi,
1712 "not vectorized:"
1713 " scalar dependence cycle.\n");
1714
1715 if (STMT_VINFO_RELEVANT_P (stmt_info))
1716 {
1717 need_to_vectorize = true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info))
1720 ok = vectorizable_induction (loop_vinfo,
1721 stmt_info, NULL, NULL,
1722 &cost_vec);
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info)
1725 == vect_double_reduction_def)
1726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_reduction (loop_vinfo,
1729 stmt_info, NULL, NULL, &cost_vec);
1730 }
1731
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1733 if (ok
1734 && STMT_VINFO_LIVE_P (stmt_info)
1735 && !PURE_SLP_STMT (stmt_info))
1736 ok = vectorizable_live_operation (loop_vinfo,
1737 stmt_info, NULL, NULL, NULL,
1738 -1, false, &cost_vec);
1739
1740 if (!ok)
1741 return opt_result::failure_at (phi,
1742 "not vectorized: relevant phi not "
1743 "supported: %G",
1744 static_cast <gimple *> (phi));
1745 }
1746
1747 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748 gsi_next (&si))
1749 {
1750 gimple *stmt = gsi_stmt (si);
1751 if (!gimple_clobber_p (stmt)
1752 && !is_gimple_debug (stmt))
1753 {
1754 opt_result res
1755 = vect_analyze_stmt (loop_vinfo,
1756 loop_vinfo->lookup_stmt (stmt),
1757 &need_to_vectorize,
1758 NULL, NULL, &cost_vec);
1759 if (!res)
1760 return res;
1761 }
1762 }
1763 } /* bbs */
1764
1765 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1766
1767 /* All operations in the loop are either irrelevant (they deal with
1768 loop control, or are dead), or only used outside the loop and can
1769 be moved out of it (e.g. invariants, inductions). The loop can
1770 therefore be optimized away by scalar optimizations. We're better
1771 off not touching this loop. */
1772 if (!need_to_vectorize)
1773 {
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1778 (vect_location,
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1780 }
1781
1782 return opt_result::success ();
1783 }
1784
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1788
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1791 {
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1799
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1802
1803 return false;
1804 }
1805
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
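A return of -1 (rather than 0) tells the caller that re-analysis may
still succeed under different settings, e.g. after disabling SLP via
the "again" path in vect_analyze_loop_2, whereas 0 means vectorization
is known not to pay off.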
1809
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1812 {
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1819 {
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1821 {
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1827 }
1828 }
1829
1830 /* If using the "very cheap" model, reject cases in which we'd keep
1831 a copy of the scalar code (even if we might be able to vectorize it). */
1832 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1833 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1834 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1835 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1836 {
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "some scalar iterations would need to be peeled\n");
1840 return 0;
1841 }
1842
1843 int min_profitable_iters, min_profitable_estimate;
1844 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1845 &min_profitable_estimate);
1846
1847 if (min_profitable_iters < 0)
1848 {
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851 "not vectorized: vectorization not profitable.\n");
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "not vectorized: vector version will never be "
1855 "profitable.\n");
1856 return -1;
1857 }
1858
1859 int min_scalar_loop_bound = (param_min_vect_loop_bound
1860 * assumed_vf);
1861
1862 /* Use the cost model only if it is more conservative than the
1863 user-specified threshold. */
1864 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1865 min_profitable_iters);
1866
1867 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1868
1869 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1870 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1871 {
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "not vectorized: vectorization not profitable.\n");
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_NOTE, vect_location,
1877 "not vectorized: iteration count smaller than user "
1878 "specified loop bound parameter or minimum profitable "
1879 "iterations (whichever is more conservative).\n");
1880 return 0;
1881 }
1882
1883 /* The static profitability threshold min_profitable_estimate includes
1884 the cost of having to check at runtime whether the scalar loop
1885 should be used instead. If it turns out that we don't need or want
1886 such a check, the threshold we should use for the static estimate
1887 is simply the point at which the vector loop becomes more profitable
1888 than the scalar loop. */
1889 if (min_profitable_estimate > min_profitable_iters
1890 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1891 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1892 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1893 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1894 {
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1897 " choice between the scalar and vector loops\n");
1898 min_profitable_estimate = min_profitable_iters;
1899 }
1900
1901 /* If the vector loop needs multiple iterations to be beneficial then
1902 things are probably too close to call, and the conservative thing
1903 would be to stick with the scalar code. */
1904 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1905 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1906 {
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909 "one iteration of the vector loop would be"
1910 " more expensive than the equivalent number of"
1911 " iterations of the scalar loop\n");
1912 return 0;
1913 }
1914
1915 HOST_WIDE_INT estimated_niter;
1916
1917 /* If we are vectorizing an epilogue then we know the maximum number of
1918 scalar iterations it will cover is at least one lower than the
1919 vectorization factor of the main loop. */
1920 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1921 estimated_niter
1922 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1923 else
1924 {
1925 estimated_niter = estimated_stmt_executions_int (loop);
1926 if (estimated_niter == -1)
1927 estimated_niter = likely_max_stmt_executions_int (loop);
1928 }
1929 if (estimated_niter != -1
1930 && ((unsigned HOST_WIDE_INT) estimated_niter
1931 < MAX (th, (unsigned) min_profitable_estimate)))
1932 {
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1935 "not vectorized: estimated iteration count too "
1936 "small.\n");
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE, vect_location,
1939 "not vectorized: estimated iteration count smaller "
1940 "than specified loop bound parameter or minimum "
1941 "profitable iterations (whichever is more "
1942 "conservative).\n");
1943 return -1;
1944 }
1945
1946 return 1;
1947 }
1948
1949 static opt_result
1950 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1951 vec<data_reference_p> *datarefs,
1952 unsigned int *n_stmts)
1953 {
1954 *n_stmts = 0;
1955 for (unsigned i = 0; i < loop->num_nodes; i++)
1956 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1957 !gsi_end_p (gsi); gsi_next (&gsi))
1958 {
1959 gimple *stmt = gsi_stmt (gsi);
1960 if (is_gimple_debug (stmt))
1961 continue;
1962 ++(*n_stmts);
1963 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1964 NULL, 0);
1965 if (!res)
1966 {
1967 if (is_gimple_call (stmt) && loop->safelen)
1968 {
1969 tree fndecl = gimple_call_fndecl (stmt), op;
1970 if (fndecl != NULL_TREE)
1971 {
1972 cgraph_node *node = cgraph_node::get (fndecl);
1973 if (node != NULL && node->simd_clones != NULL)
1974 {
1975 unsigned int j, n = gimple_call_num_args (stmt);
1976 for (j = 0; j < n; j++)
1977 {
1978 op = gimple_call_arg (stmt, j);
1979 if (DECL_P (op)
1980 || (REFERENCE_CLASS_P (op)
1981 && get_base_address (op)))
1982 break;
1983 }
1984 op = gimple_call_lhs (stmt);
1985 /* Ignore #pragma omp declare simd functions
1986 if they don't have data references in the
1987 call stmt itself. */
1988 if (j == n
1989 && !(op
1990 && (DECL_P (op)
1991 || (REFERENCE_CLASS_P (op)
1992 && get_base_address (op)))))
1993 continue;
1994 }
1995 }
1996 }
1997 return res;
1998 }
1999 /* If dependence analysis would give up due to the limit on the
2000 number of datarefs, stop here and fail fatally. */
2001 if (datarefs->length ()
2002 > (unsigned)param_loop_max_datarefs_for_datadeps)
2003 return opt_result::failure_at (stmt, "exceeded param "
2004 "loop-max-datarefs-for-datadeps\n");
2005 }
2006 return opt_result::success ();
2007 }
2008
2009 /* Look for SLP-only access groups and turn each individual access into its own
2010 group. */
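/* For instance, a group of four interleaved loads that was only usable
under SLP becomes four independent groups of size 1; each keeps a gap
of group_size - 1 (here 3), or 0 for strided accesses, so the elements
the other former group members used to cover are skipped. */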
2011 static void
2012 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2013 {
2014 unsigned int i;
2015 struct data_reference *dr;
2016
2017 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2018
2019 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2020 FOR_EACH_VEC_ELT (datarefs, i, dr)
2021 {
2022 gcc_assert (DR_REF (dr));
2023 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2024
2025 /* Check whether the access is part of an interleaving chain. */
2026 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2027 {
2028 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2029 unsigned int group_size = DR_GROUP_SIZE (first_element);
2030
2031 /* Check whether this is an SLP-only group. */
2032 if (!STMT_SLP_TYPE (stmt_info)
2033 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2034 {
2035 /* Dissolve the group. */
2036 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2037
2038 stmt_vec_info vinfo = first_element;
2039 while (vinfo)
2040 {
2041 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2042 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2043 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2044 DR_GROUP_SIZE (vinfo) = 1;
2045 if (STMT_VINFO_STRIDED_P (first_element))
2046 DR_GROUP_GAP (vinfo) = 0;
2047 else
2048 DR_GROUP_GAP (vinfo) = group_size - 1;
2049 vinfo = next;
2050 }
2051 }
2052 }
2053 }
2054 }
2055
2056 /* Determine if operating on full vectors for LOOP_VINFO might leave
2057 some scalar iterations still to do. If so, decide how we should
2058 handle those scalar iterations. The possibilities are:
2059
2060 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2061 In this case:
2062
2063 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2064 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2065 LOOP_VINFO_PEELING_FOR_NITER == false
2066
2067 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2068 to handle the remaining scalar iterations. In this case:
2069
2070 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2071 LOOP_VINFO_PEELING_FOR_NITER == true
2072
2073 There are two choices:
2074
2075 (2a) Consider vectorizing the epilogue loop at the same VF as the
2076 main loop, but using partial vectors instead of full vectors.
2077 In this case:
2078
2079 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2080
2081 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2082 In this case:
2083
2084 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2085
2086 When FOR_EPILOGUE_P is true, make this determination based on the
2087 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2088 based on the assumption that LOOP_VINFO is the main loop. The caller
2089 has made sure that the number of iterations is set appropriately for
2090 this value of FOR_EPILOGUE_P. */
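/* As a concrete illustration: with VF == 4 and 10 scalar iterations,
(1) runs three partial-vector iterations (4 + 4 + 2 active lanes) and
needs no epilogue, while (2) runs two full-vector iterations and
leaves 2 scalar iterations for the epilogue loop, which can itself
use partial vectors (2a) or a lower VF (2b). */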
2091
2092 opt_result
2093 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2094 bool for_epilogue_p)
2095 {
2096 /* Determine whether there would be any scalar iterations left over. */
2097 bool need_peeling_or_partial_vectors_p
2098 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2099
2100 /* Decide whether to vectorize the loop with partial vectors. */
2101 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2102 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2103 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2104 && need_peeling_or_partial_vectors_p)
2105 {
2106 /* For partial-vector-usage=1, try to push the handling of partial
2107 vectors to the epilogue, with the main loop continuing to operate
2108 on full vectors.
2109
2110 ??? We could then end up failing to use partial vectors if we
2111 decide to peel iterations into a prologue, and if the main loop
2112 then ends up processing fewer than VF iterations. */
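/* (--param vect-partial-vector-usage is expected to take the value 0
for "never use partial vectors", 1 for "only for epilogue loops" as
handled here, and 2 for "wherever possible".) */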
2113 if (param_vect_partial_vector_usage == 1
2114 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2115 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2116 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2117 else
2118 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2119 }
2120
2121 if (dump_enabled_p ())
2122 {
2123 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2124 dump_printf_loc (MSG_NOTE, vect_location,
2125 "operating on partial vectors%s.\n",
2126 for_epilogue_p ? " for epilogue loop" : "");
2127 else
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "operating only on full vectors%s.\n",
2130 for_epilogue_p ? " for epilogue loop" : "");
2131 }
2132
2133 if (for_epilogue_p)
2134 {
2135 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2136 gcc_assert (orig_loop_vinfo);
2137 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2138 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2139 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2140 }
2141
2142 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2143 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2144 {
2145 /* Check that the loop processes at least one full vector. */
2146 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2147 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2148 if (known_lt (wi::to_widest (scalar_niters), vf))
2149 return opt_result::failure_at (vect_location,
2150 "loop does not have enough iterations"
2151 " to support vectorization.\n");
2152
2153 /* If we need to peel an extra epilogue iteration to handle data
2154 accesses with gaps, check that there are enough scalar iterations
2155 available.
2156
2157 The check above is redundant with this one when peeling for gaps,
2158 but the distinction is useful for diagnostics. */
2159 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2160 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2161 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2162 return opt_result::failure_at (vect_location,
2163 "loop does not have enough iterations"
2164 " to support peeling for gaps.\n");
2165 }
2166
2167 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2168 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2169 && need_peeling_or_partial_vectors_p);
2170
2171 return opt_result::success ();
2172 }
2173
2174 /* Function vect_analyze_loop_2.
2175
2176 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2177 for it. The different analyses will record information in the
2178 loop_vec_info struct. */
2179 static opt_result
2180 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2181 {
2182 opt_result ok = opt_result::success ();
2183 int res;
2184 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2185 poly_uint64 min_vf = 2;
2186 loop_vec_info orig_loop_vinfo = NULL;
2187
2188 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2189 loop_vec_info of the first vectorized loop. */
2190 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2191 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2192 else
2193 orig_loop_vinfo = loop_vinfo;
2194 gcc_assert (orig_loop_vinfo);
2195
2196 /* The first group of checks is independent of the vector size. */
2197 fatal = true;
2198
2199 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2200 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2201 return opt_result::failure_at (vect_location,
2202 "not vectorized: simd if(0)\n");
2203
2204 /* Find all data references in the loop (which correspond to vdefs/vuses)
2205 and analyze their evolution in the loop. */
2206
2207 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2208
2209 /* Gather the data references and count stmts in the loop. */
2210 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2211 {
2212 opt_result res
2213 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2214 &LOOP_VINFO_DATAREFS (loop_vinfo),
2215 n_stmts);
2216 if (!res)
2217 {
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "not vectorized: loop contains function "
2221 "calls or data references that cannot "
2222 "be analyzed\n");
2223 return res;
2224 }
2225 loop_vinfo->shared->save_datarefs ();
2226 }
2227 else
2228 loop_vinfo->shared->check_datarefs ();
2229
2230 /* Analyze the data references and also adjust the minimal
2231 vectorization factor according to the loads and stores. */
2232
2233 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2234 if (!ok)
2235 {
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2238 "bad data references.\n");
2239 return ok;
2240 }
2241
2242 /* Classify all cross-iteration scalar data-flow cycles.
2243 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2244 vect_analyze_scalar_cycles (loop_vinfo);
2245
2246 vect_pattern_recog (loop_vinfo);
2247
2248 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2249
2250 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2251 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2252
2253 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2254 if (!ok)
2255 {
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 "bad data access.\n");
2259 return ok;
2260 }
2261
2262 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2263
2264 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2265 if (!ok)
2266 {
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 "unexpected pattern.\n");
2270 return ok;
2271 }
2272
2273 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2274 fatal = false;
2275
2276 /* Analyze data dependences between the data-refs in the loop
2277 and adjust the maximum vectorization factor according to
2278 the dependences.
2279 FORNOW: fail at the first data dependence that we encounter. */
2280
2281 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2282 if (!ok)
2283 {
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2286 "bad data dependence.\n");
2287 return ok;
2288 }
2289 if (max_vf != MAX_VECTORIZATION_FACTOR
2290 && maybe_lt (max_vf, min_vf))
2291 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2292 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2293
2294 ok = vect_determine_vectorization_factor (loop_vinfo);
2295 if (!ok)
2296 {
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "can't determine vectorization factor.\n");
2300 return ok;
2301 }
2302 if (max_vf != MAX_VECTORIZATION_FACTOR
2303 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2304 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2305
2306 /* Compute the scalar iteration cost. */
2307 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2308
2309 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2310
2311 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2312 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2313 if (!ok)
2314 return ok;
2315
2316 /* If there are any SLP instances mark them as pure_slp. */
2317 bool slp = vect_make_slp_decision (loop_vinfo);
2318 if (slp)
2319 {
2320 /* Find stmts that need to be both vectorized and SLPed. */
2321 vect_detect_hybrid_slp (loop_vinfo);
2322
2323 /* Update the vectorization factor based on the SLP decision. */
2324 vect_update_vf_for_slp (loop_vinfo);
2325
2326 /* Optimize the SLP graph with the vectorization factor fixed. */
2327 vect_optimize_slp (loop_vinfo);
2328
2329 /* Gather the loads reachable from the SLP graph entries. */
2330 vect_gather_slp_loads (loop_vinfo);
2331 }
2332
2333 bool saved_can_use_partial_vectors_p
2334 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2335
2336 /* We don't expect to have to roll back to anything other than an empty
2337 set of rgroups. */
2338 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2339
2340 /* This is the point where we can re-start analysis with SLP forced off. */
2341 start_over:
2342
2343 /* Now the vectorization factor is final. */
2344 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2345 gcc_assert (known_ne (vectorization_factor, 0U));
2346
2347 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2348 {
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "vectorization_factor = ");
2351 dump_dec (MSG_NOTE, vectorization_factor);
2352 dump_printf (MSG_NOTE, ", niters = %wd\n",
2353 LOOP_VINFO_INT_NITERS (loop_vinfo));
2354 }
2355
2356 /* Analyze the alignment of the data-refs in the loop.
2357 Fail if a data reference is found that cannot be vectorized. */
2358
2359 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2360 if (!ok)
2361 {
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364 "bad data alignment.\n");
2365 return ok;
2366 }
2367
2368 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2369 It is important to call pruning after vect_analyze_data_ref_accesses,
2370 since we use grouping information gathered by interleaving analysis. */
2371 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2372 if (!ok)
2373 return ok;
2374
2375 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2376 vectorization, since we do not want to add extra peeling or
2377 add versioning for alignment. */
2378 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2379 /* This pass will decide on using loop versioning and/or loop peeling in
2380 order to enhance the alignment of data references in the loop. */
2381 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2382 if (!ok)
2383 return ok;
2384
2385 if (slp)
2386 {
2387 /* Analyze operations in the SLP instances. Note this may
2388 remove unsupported SLP instances which makes the above
2389 SLP kind detection invalid. */
2390 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2391 vect_slp_analyze_operations (loop_vinfo);
2392 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2393 {
2394 ok = opt_result::failure_at (vect_location,
2395 "unsupported SLP instances\n");
2396 goto again;
2397 }
2398
2399 /* Check whether any load in ALL SLP instances is possibly permuted. */
2400 slp_tree load_node, slp_root;
2401 unsigned i, x;
2402 slp_instance instance;
2403 bool can_use_lanes = true;
2404 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2405 {
2406 slp_root = SLP_INSTANCE_TREE (instance);
2407 int group_size = SLP_TREE_LANES (slp_root);
2408 tree vectype = SLP_TREE_VECTYPE (slp_root);
2409 bool loads_permuted = false;
2410 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2411 {
2412 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2413 continue;
2414 unsigned j;
2415 stmt_vec_info load_info;
2416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2417 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2418 {
2419 loads_permuted = true;
2420 break;
2421 }
2422 }
2423
2424 /* If the loads and stores can be handled with load/store-lane
2425 instructions record it and move on to the next instance. */
2426 if (loads_permuted
2427 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2428 && vect_store_lanes_supported (vectype, group_size, false))
2429 {
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2431 {
2432 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2433 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2434 /* Use SLP for strided accesses (or if we can't use
2435 load-lanes). */
2436 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2437 || ! vect_load_lanes_supported
2438 (STMT_VINFO_VECTYPE (stmt_vinfo),
2439 DR_GROUP_SIZE (stmt_vinfo), false))
2440 break;
2441 }
2442
2443 can_use_lanes
2444 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2445
2446 if (can_use_lanes && dump_enabled_p ())
2447 dump_printf_loc (MSG_NOTE, vect_location,
2448 "SLP instance %p can use load/store-lanes\n",
2449 instance);
2450 }
2451 else
2452 {
2453 can_use_lanes = false;
2454 break;
2455 }
2456 }
2457
2458 /* If all SLP instances can use load/store-lanes, abort SLP and try
2459 again with SLP disabled. */
2460 if (can_use_lanes)
2461 {
2462 ok = opt_result::failure_at (vect_location,
2463 "Built SLP cancelled: can use "
2464 "load/store-lanes\n");
2465 if (dump_enabled_p ())
2466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2467 "Built SLP cancelled: all SLP instances support "
2468 "load/store-lanes\n");
2469 goto again;
2470 }
2471 }
2472
2473 /* Dissolve SLP-only groups. */
2474 vect_dissolve_slp_only_groups (loop_vinfo);
2475
2476 /* Scan all the remaining operations in the loop that are not subject
2477 to SLP and make sure they are vectorizable. */
2478 ok = vect_analyze_loop_operations (loop_vinfo);
2479 if (!ok)
2480 {
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2483 "bad operation or unsupported loop bound.\n");
2484 return ok;
2485 }
2486
2487 /* For now, we don't expect to mix both the masking and the length
2488 approaches for one loop; disable partial vectors if both are recorded. */
2489 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2490 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2491 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2492 {
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2495 "can't vectorize a loop with partial vectors"
2496 " because we don't expect to mix different"
2497 " approaches with partial vectors for the"
2498 " same loop.\n");
2499 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2500 }
2501
2502 /* If we still have the option of using partial vectors,
2503 check whether we can generate the necessary loop controls. */
2504 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2505 && !vect_verify_full_masking (loop_vinfo)
2506 && !vect_verify_loop_lens (loop_vinfo))
2507 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2508
2509 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2510 to be able to handle fewer than VF scalars, or needs to have a lower VF
2511 than the main loop. */
2512 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2513 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2514 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2515 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2516 return opt_result::failure_at (vect_location,
2517 "Vectorization factor too high for"
2518 " epilogue loop.\n");
2519
2520 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2521 assuming that the loop will be used as a main loop. We will redo
2522 this analysis later if we instead decide to use the loop as an
2523 epilogue loop. */
2524 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2525 if (!ok)
2526 return ok;
2527
2528 /* Check the costings of the loop make vectorizing worthwhile. */
2529 res = vect_analyze_loop_costing (loop_vinfo);
2530 if (res < 0)
2531 {
2532 ok = opt_result::failure_at (vect_location,
2533 "Loop costings may not be worthwhile.\n");
2534 goto again;
2535 }
2536 if (!res)
2537 return opt_result::failure_at (vect_location,
2538 "Loop costings not worthwhile.\n");
2539
2540 /* If an epilogue loop is required make sure we can create one. */
2541 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2542 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2543 {
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2546 if (!vect_can_advance_ivs_p (loop_vinfo)
2547 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2548 single_exit (LOOP_VINFO_LOOP
2549 (loop_vinfo))))
2550 {
2551 ok = opt_result::failure_at (vect_location,
2552 "not vectorized: can't create required "
2553 "epilog loop\n");
2554 goto again;
2555 }
2556 }
2557
2558 /* During peeling, we need to check whether the number of loop iterations
2559 is enough for both the peeled prolog loop and the vector loop. This
2560 check can be merged with the threshold check of loop versioning, so
2561 increase the threshold for this case if necessary.
2562
2563 If we are analyzing an epilogue we still want to check what its
2564 versioning threshold would be. If we decide to vectorize the epilogues we
2565 will want to use the lowest versioning threshold of all epilogues and main
2566 loop. This will enable us to enter a vectorized epilogue even when
2567 versioning the loop. We can't simply check whether the epilogue requires
2568 versioning though since we may have skipped some versioning checks when
2569 analyzing the epilogue. For instance, checks for alias versioning will be
2570 skipped when dealing with epilogues as we assume we already checked them
2571 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
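/* For example, if the prologue may need up to nunits - 1 iterations to
align an unaligned data reference, the vector loop needs VF iterations
for one full vector iteration, and peeling for gaps needs one more,
the threshold below becomes (nunits - 1) + VF + 1, possibly raised
further to the cost-model threshold TH. */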
2572 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2573 {
2574 poly_uint64 niters_th = 0;
2575 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2576
2577 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2578 {
2579 /* Niters for peeled prolog loop. */
2580 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2581 {
2582 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2583 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2584 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2585 }
2586 else
2587 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2588 }
2589
2590 /* Niters for at least one iteration of vectorized loop. */
2591 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2592 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2593 /* One additional iteration because of peeling for gap. */
2594 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2595 niters_th += 1;
2596
2597 /* Use the same condition as vect_transform_loop to decide when to use
2598 the cost to determine a versioning threshold. */
2599 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2600 && ordered_p (th, niters_th))
2601 niters_th = ordered_max (poly_uint64 (th), niters_th);
2602
2603 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2604 }
2605
2606 gcc_assert (known_eq (vectorization_factor,
2607 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2608
2609 /* Ok to vectorize! */
2610 return opt_result::success ();
2611
2612 again:
2613 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2614 gcc_assert (!ok);
2615
2616 /* Try again with SLP forced off, but if we didn't do any SLP there is
2617 no point in re-trying. */
2618 if (!slp)
2619 return ok;
2620
2621 /* If there are reduction chains re-trying will fail anyway. */
2622 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2623 return ok;
2624
2625 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2626 via interleaving or lane instructions. */
2627 slp_instance instance;
2628 slp_tree node;
2629 unsigned i, j;
2630 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2631 {
2632 stmt_vec_info vinfo;
2633 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2634 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2635 continue;
2636 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2637 unsigned int size = DR_GROUP_SIZE (vinfo);
2638 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2639 if (! vect_store_lanes_supported (vectype, size, false)
2640 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2641 && ! vect_grouped_store_supported (vectype, size))
2642 return opt_result::failure_at (vinfo->stmt,
2643 "unsupported grouped store\n");
2644 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2645 {
2646 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2647 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2648 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2649 size = DR_GROUP_SIZE (vinfo);
2650 vectype = STMT_VINFO_VECTYPE (vinfo);
2651 if (! vect_load_lanes_supported (vectype, size, false)
2652 && ! vect_grouped_load_supported (vectype, single_element_p,
2653 size))
2654 return opt_result::failure_at (vinfo->stmt,
2655 "unsupported grouped load\n");
2656 }
2657 }
2658
2659 if (dump_enabled_p ())
2660 dump_printf_loc (MSG_NOTE, vect_location,
2661 "re-trying with SLP disabled\n");
2662
2663 /* Roll back state appropriately. No SLP this time. */
2664 slp = false;
2665 /* Restore the vectorization factor to what it was without SLP. */
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2667 /* Free the SLP instances. */
2668 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2669 vect_free_slp_instance (instance);
2670 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2671 /* Reset SLP type to loop_vect on all stmts. */
2672 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2673 {
2674 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2675 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2676 !gsi_end_p (si); gsi_next (&si))
2677 {
2678 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2679 STMT_SLP_TYPE (stmt_info) = loop_vect;
2680 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2681 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2682 {
2683 /* vectorizable_reduction adjusts reduction stmt def-types,
2684 restore them to that of the PHI. */
2685 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2686 = STMT_VINFO_DEF_TYPE (stmt_info);
2687 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2688 (STMT_VINFO_REDUC_DEF (stmt_info)))
2689 = STMT_VINFO_DEF_TYPE (stmt_info);
2690 }
2691 }
2692 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2693 !gsi_end_p (si); gsi_next (&si))
2694 {
2695 if (is_gimple_debug (gsi_stmt (si)))
2696 continue;
2697 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2698 STMT_SLP_TYPE (stmt_info) = loop_vect;
2699 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2700 {
2701 stmt_vec_info pattern_stmt_info
2702 = STMT_VINFO_RELATED_STMT (stmt_info);
2703 if (STMT_VINFO_SLP_VECT_ONLY (pattern_stmt_info))
2704 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2705
2706 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2707 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2708 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2709 !gsi_end_p (pi); gsi_next (&pi))
2710 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2711 = loop_vect;
2712 }
2713 }
2714 }
2715 /* Free optimized alias test DDRS. */
2716 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2717 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2718 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2719 /* Reset target cost data. */
2720 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2721 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2722 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2723 /* Reset accumulated rgroup information. */
2724 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2725 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2726 /* Reset assorted flags. */
2727 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2728 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2729 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2730 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2731 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2732 = saved_can_use_partial_vectors_p;
2733
2734 goto start_over;
2735 }
2736
2737 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2738 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2739 OLD_LOOP_VINFO is better unless something specifically indicates
2740 otherwise.
2741
2742 Note that this deliberately isn't a partial order. */
2743
2744 static bool
2745 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2746 loop_vec_info old_loop_vinfo)
2747 {
2748 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2749 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2750
2751 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2752 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2753
2754 /* Always prefer a VF of loop->simdlen over any other VF. */
2755 if (loop->simdlen)
2756 {
2757 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2758 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2759 if (new_simdlen_p != old_simdlen_p)
2760 return new_simdlen_p;
2761 }
2762
2763 /* Limit the VFs to what is likely to be the maximum number of iterations,
2764 to handle cases in which at least one loop_vinfo is fully-masked. */
2765 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2766 if (estimated_max_niter != -1)
2767 {
2768 if (known_le (estimated_max_niter, new_vf))
2769 new_vf = estimated_max_niter;
2770 if (known_le (estimated_max_niter, old_vf))
2771 old_vf = estimated_max_niter;
2772 }
2773
2774 /* Check whether the (fractional) cost per scalar iteration is lower
2775 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
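/* The division is avoided by cross-multiplying: e.g. a new body cost of
10 at VF 8 against an old body cost of 6 at VF 4 gives rel_new = 10 * 4
= 40 and rel_old = 6 * 8 = 48, so the new loop_vinfo is cheaper per
scalar iteration (10/8 < 6/4). */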
2776 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2777 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2778
2779 HOST_WIDE_INT est_rel_new_min
2780 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2781 HOST_WIDE_INT est_rel_new_max
2782 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2783
2784 HOST_WIDE_INT est_rel_old_min
2785 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2786 HOST_WIDE_INT est_rel_old_max
2787 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2788
2789 /* Check first if we can make out an unambiguous total order from the minimum
2790 and maximum estimates. */
2791 if (est_rel_new_min < est_rel_old_min
2792 && est_rel_new_max < est_rel_old_max)
2793 return true;
2794 else if (est_rel_old_min < est_rel_new_min
2795 && est_rel_old_max < est_rel_new_max)
2796 return false;
2797 /* When old_loop_vinfo uses a variable vectorization factor,
2798 we know that it has a lower cost for at least one runtime VF.
2799 However, we don't know how likely that VF is.
2800
2801 One option would be to compare the costs for the estimated VFs.
2802 The problem is that that can put too much pressure on the cost
2803 model. E.g. if the estimated VF is also the lowest possible VF,
2804 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2805 for the estimated VF, we'd then choose new_loop_vinfo even
2806 though (a) new_loop_vinfo might not actually be better than
2807 old_loop_vinfo for that VF and (b) it would be significantly
2808 worse at larger VFs.
2809
2810 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2811 no more expensive than old_loop_vinfo even after doubling the
2812 estimated old_loop_vinfo VF. For all but trivial loops, this
2813 ensures that we only pick new_loop_vinfo if it is significantly
2814 better than old_loop_vinfo at the estimated VF. */
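/* E.g. likely relative costs of est_rel_new_likely = 30 vs.
est_rel_old_likely = 50 keep the old loop_vinfo, since 30 * 2 > 50;
only a value of 25 or less would switch to the new one. */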
2815
2816 if (est_rel_old_min != est_rel_new_min
2817 || est_rel_old_max != est_rel_new_max)
2818 {
2819 HOST_WIDE_INT est_rel_new_likely
2820 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2821 HOST_WIDE_INT est_rel_old_likely
2822 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2823
2824 return est_rel_new_likely * 2 <= est_rel_old_likely;
2825 }
2826
2827 /* If there's nothing to choose between the loop bodies, see whether
2828 there's a difference in the prologue and epilogue costs. */
2829 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2830 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2831
2832 return false;
2833 }
2834
2835 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2836 true if we should. */
2837
2838 static bool
2839 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2840 loop_vec_info old_loop_vinfo)
2841 {
2842 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2843 return false;
2844
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_NOTE, vect_location,
2847 "***** Preferring vector mode %s to vector mode %s\n",
2848 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2849 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2850 return true;
2851 }
2852
2853 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2854 try to reanalyze it as a main loop. Return the loop_vinfo on success
2855 and null on failure. */
2856
2857 static loop_vec_info
2858 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2859 {
2860 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2861 return loop_vinfo;
2862
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE, vect_location,
2865 "***** Reanalyzing as a main loop with vector mode %s\n",
2866 GET_MODE_NAME (loop_vinfo->vector_mode));
2867
2868 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2869 vec_info_shared *shared = loop_vinfo->shared;
2870 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2871 gcc_assert (main_loop_vinfo);
2872
2873 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2874
2875 bool fatal = false;
2876 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2877 loop->aux = NULL;
2878 if (!res)
2879 {
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_NOTE, vect_location,
2882 "***** Failed to analyze main loop with vector"
2883 " mode %s\n",
2884 GET_MODE_NAME (loop_vinfo->vector_mode));
2885 delete main_loop_vinfo;
2886 return NULL;
2887 }
2888 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2889 return main_loop_vinfo;
2890 }
2891
2892 /* Function vect_analyze_loop.
2893
2894 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2895 for it. The different analyses will record information in the
2896 loop_vec_info struct. */
2897 opt_loop_vec_info
2898 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2899 {
2900 auto_vector_modes vector_modes;
2901
2902 /* Autodetect the first vector mode we try. */
2903 unsigned int autovec_flags
2904 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2905 loop->simdlen != 0);
2906 unsigned int mode_i = 0;
2907
2908 DUMP_VECT_SCOPE ("analyze_loop_nest");
2909
2910 if (loop_outer (loop)
2911 && loop_vec_info_for_loop (loop_outer (loop))
2912 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2913 return opt_loop_vec_info::failure_at (vect_location,
2914 "outer-loop already vectorized.\n");
2915
2916 if (!find_loop_nest (loop, &shared->loop_nest))
2917 return opt_loop_vec_info::failure_at
2918 (vect_location,
2919 "not vectorized: loop nest containing two or more consecutive inner"
2920 " loops cannot be vectorized\n");
2921
2922 unsigned n_stmts = 0;
2923 machine_mode autodetected_vector_mode = VOIDmode;
2924 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2925 machine_mode next_vector_mode = VOIDmode;
2926 poly_uint64 lowest_th = 0;
2927 unsigned vectorized_loops = 0;
2928 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2929 && !unlimited_cost_model (loop));
2930
2931 bool vect_epilogues = false;
2932 opt_result res = opt_result::success ();
2933 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2934 while (1)
2935 {
2936 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2937 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2938 if (!loop_vinfo)
2939 {
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2942 "bad loop form.\n");
2943 gcc_checking_assert (first_loop_vinfo == NULL);
2944 return loop_vinfo;
2945 }
2946 loop_vinfo->vector_mode = next_vector_mode;
2947
2948 bool fatal = false;
2949
2950 /* When pick_lowest_cost_p is true, we should in principle iterate
2951 over all the loop_vec_infos that LOOP_VINFO could replace and
2952 try to vectorize LOOP_VINFO under the same conditions.
2953 E.g. when trying to replace an epilogue loop, we should vectorize
2954 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2955 to replace the main loop, we should vectorize LOOP_VINFO as a main
2956 loop too.
2957
2958 However, autovectorize_vector_modes is usually sorted as follows:
2959
2960 - Modes that naturally produce lower VFs usually follow modes that
2961 naturally produce higher VFs.
2962
2963 - When modes naturally produce the same VF, maskable modes
2964 usually follow unmaskable ones, so that the maskable mode
2965 can be used to vectorize the epilogue of the unmaskable mode.
2966
2967 This order is preferred because it leads to the maximum
2968 epilogue vectorization opportunities. Targets should only use
2969 a different order if they want to make wide modes available while
2970 disparaging them relative to earlier, smaller modes. The assumption
2971 in that case is that the wider modes are more expensive in some
2972 way that isn't reflected directly in the costs.
2973
2974 There should therefore be few interesting cases in which
2975 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2976 treated as a standalone loop, and ends up being genuinely cheaper
2977 than FIRST_LOOP_VINFO. */
2978 if (vect_epilogues)
2979 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2980
2981 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2982 if (mode_i == 0)
2983 autodetected_vector_mode = loop_vinfo->vector_mode;
2984 if (dump_enabled_p ())
2985 {
2986 if (res)
2987 dump_printf_loc (MSG_NOTE, vect_location,
2988 "***** Analysis succeeded with vector mode %s\n",
2989 GET_MODE_NAME (loop_vinfo->vector_mode));
2990 else
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "***** Analysis failed with vector mode %s\n",
2993 GET_MODE_NAME (loop_vinfo->vector_mode));
2994 }
2995
2996 loop->aux = NULL;
2997
2998 if (!fatal)
2999 while (mode_i < vector_modes.length ()
3000 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3001 {
3002 if (dump_enabled_p ())
3003 dump_printf_loc (MSG_NOTE, vect_location,
3004 "***** The result for vector mode %s would"
3005 " be the same\n",
3006 GET_MODE_NAME (vector_modes[mode_i]));
3007 mode_i += 1;
3008 }
3009
3010 if (res)
3011 {
3012 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3013 vectorized_loops++;
3014
3015 /* Once we hit the desired simdlen for the first time,
3016 discard any previous attempts. */
3017 if (simdlen
3018 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3019 {
3020 delete first_loop_vinfo;
3021 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3022 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3023 simdlen = 0;
3024 }
3025 else if (pick_lowest_cost_p && first_loop_vinfo)
3026 {
3027 /* Keep trying to roll back vectorization attempts while the
3028 loop_vec_infos they produced were worse than this one. */
3029 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3030 while (!vinfos.is_empty ()
3031 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3032 {
3033 gcc_assert (vect_epilogues);
3034 delete vinfos.pop ();
3035 }
3036 if (vinfos.is_empty ()
3037 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3038 {
3039 loop_vec_info main_loop_vinfo
3040 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3041 if (main_loop_vinfo == loop_vinfo)
3042 {
3043 delete first_loop_vinfo;
3044 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3045 }
3046 else if (main_loop_vinfo
3047 && vect_joust_loop_vinfos (main_loop_vinfo,
3048 first_loop_vinfo))
3049 {
3050 delete first_loop_vinfo;
3051 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3052 delete loop_vinfo;
3053 loop_vinfo
3054 = opt_loop_vec_info::success (main_loop_vinfo);
3055 }
3056 else
3057 delete main_loop_vinfo;
3058 }
3059 }
3060
3061 if (first_loop_vinfo == NULL)
3062 {
3063 first_loop_vinfo = loop_vinfo;
3064 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3065 }
3066 else if (vect_epilogues
3067 /* For now only allow one epilogue loop. */
3068 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3069 {
3070 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3071 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3072 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3073 || maybe_ne (lowest_th, 0U));
3074 /* Keep track of the known smallest versioning
3075 threshold. */
3076 if (ordered_p (lowest_th, th))
3077 lowest_th = ordered_min (lowest_th, th);
3078 }
3079 else
3080 {
3081 delete loop_vinfo;
3082 loop_vinfo = opt_loop_vec_info::success (NULL);
3083 }
3084
3085 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3086 enabled, SIMDUID is not set, it is the innermost loop and we have
3087 either already found the loop's SIMDLEN or there was no SIMDLEN to
3088 begin with.
3089 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3090 vect_epilogues = (!simdlen
3091 && loop->inner == NULL
3092 && param_vect_epilogues_nomask
3093 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3094 && !loop->simduid
3095 /* For now only allow one epilogue loop, but allow
3096 pick_lowest_cost_p to replace it. */
3097 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3098 || pick_lowest_cost_p));
3099
3100 /* Commit to first_loop_vinfo if we have no reason to try
3101 alternatives. */
3102 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3103 break;
3104 }
3105 else
3106 {
3107 delete loop_vinfo;
3108 loop_vinfo = opt_loop_vec_info::success (NULL);
3109 if (fatal)
3110 {
3111 gcc_checking_assert (first_loop_vinfo == NULL);
3112 break;
3113 }
3114 }
3115
3116 /* Handle the case in which the original loop can use partial
3117 vectorization, but we only want to adopt it for the epilogue.
3118 The retry should use the same vector mode as the original. */
3119 if (vect_epilogues
3120 && loop_vinfo
3121 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3122 {
3123 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3124 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "***** Re-trying analysis with same vector mode"
3128 " %s for epilogue with partial vectors.\n",
3129 GET_MODE_NAME (loop_vinfo->vector_mode));
3130 continue;
3131 }
3132
3133 if (mode_i < vector_modes.length ()
3134 && VECTOR_MODE_P (autodetected_vector_mode)
3135 && (related_vector_mode (vector_modes[mode_i],
3136 GET_MODE_INNER (autodetected_vector_mode))
3137 == autodetected_vector_mode)
3138 && (related_vector_mode (autodetected_vector_mode,
3139 GET_MODE_INNER (vector_modes[mode_i]))
3140 == vector_modes[mode_i]))
3141 {
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_NOTE, vect_location,
3144 "***** Skipping vector mode %s, which would"
3145 " repeat the analysis for %s\n",
3146 GET_MODE_NAME (vector_modes[mode_i]),
3147 GET_MODE_NAME (autodetected_vector_mode));
3148 mode_i += 1;
3149 }
3150
3151 if (mode_i == vector_modes.length ()
3152 || autodetected_vector_mode == VOIDmode)
3153 break;
3154
3155 /* Try the next vector mode in the list. */
3156 next_vector_mode = vector_modes[mode_i++];
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_NOTE, vect_location,
3159 "***** Re-trying analysis with vector mode %s\n",
3160 GET_MODE_NAME (next_vector_mode));
3161 }
3162
3163 if (first_loop_vinfo)
3164 {
3165 loop->aux = (loop_vec_info) first_loop_vinfo;
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE, vect_location,
3168 "***** Choosing vector mode %s\n",
3169 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3170 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3171 return first_loop_vinfo;
3172 }
3173
3174 return opt_loop_vec_info::propagate_failure (res);
3175 }
3176
3177 /* Return true if there is an in-order reduction function for CODE, storing
3178 it in *REDUC_FN if so. */
3179
3180 static bool
3181 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3182 {
3183 switch (code)
3184 {
3185 case PLUS_EXPR:
3186 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3187 return true;
3188
3189 default:
3190 return false;
3191 }
3192 }
3193
3194 /* Function reduction_fn_for_scalar_code
3195
3196 Input:
3197 CODE - the tree_code of a reduction operation.
3198
3199 Output:
3200 REDUC_FN - the corresponding internal function to be used to reduce the
3201 vector of partial results into a single scalar result, or IFN_LAST
3202 if the operation is a supported reduction operation, but does not have
3203 such an internal function.
3204
3205 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3206
3207 static bool
3208 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3209 {
3210 switch (code)
3211 {
3212 case MAX_EXPR:
3213 *reduc_fn = IFN_REDUC_MAX;
3214 return true;
3215
3216 case MIN_EXPR:
3217 *reduc_fn = IFN_REDUC_MIN;
3218 return true;
3219
3220 case PLUS_EXPR:
3221 *reduc_fn = IFN_REDUC_PLUS;
3222 return true;
3223
3224 case BIT_AND_EXPR:
3225 *reduc_fn = IFN_REDUC_AND;
3226 return true;
3227
3228 case BIT_IOR_EXPR:
3229 *reduc_fn = IFN_REDUC_IOR;
3230 return true;
3231
3232 case BIT_XOR_EXPR:
3233 *reduc_fn = IFN_REDUC_XOR;
3234 return true;
3235
3236 case MULT_EXPR:
3237 case MINUS_EXPR:
3238 *reduc_fn = IFN_LAST;
3239 return true;
3240
3241 default:
3242 return false;
3243 }
3244 }
3245
3246 /* If there is a neutral value X such that SLP reduction NODE would not
3247 be affected by the introduction of additional X elements, return that X,
3248 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3249 is the vector type that would hold element X. REDUC_CHAIN is true if
3250 the SLP statements perform a single reduction, false if each statement
3251 performs an independent reduction. */
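/* For instance, 0 is neutral for addition and 1 for multiplication:
padding the vector with extra neutral elements leaves the reduction
result unchanged. */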
3252
3253 static tree
3254 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3255 tree_code code, bool reduc_chain)
3256 {
3257 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3258 stmt_vec_info stmt_vinfo = stmts[0];
3259 tree scalar_type = TREE_TYPE (vector_type);
3260 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3261 gcc_assert (loop);
3262
3263 switch (code)
3264 {
3265 case WIDEN_SUM_EXPR:
3266 case DOT_PROD_EXPR:
3267 case SAD_EXPR:
3268 case PLUS_EXPR:
3269 case MINUS_EXPR:
3270 case BIT_IOR_EXPR:
3271 case BIT_XOR_EXPR:
3272 return build_zero_cst (scalar_type);
3273
3274 case MULT_EXPR:
3275 return build_one_cst (scalar_type);
3276
3277 case BIT_AND_EXPR:
3278 return build_all_ones_cst (scalar_type);
3279
3280 case MAX_EXPR:
3281 case MIN_EXPR:
3282 /* For MIN/MAX the initial values are neutral. A reduction chain
3283 has only a single initial value, so that value is neutral for
3284 all statements. */
3285 if (reduc_chain)
3286 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3287 loop_preheader_edge (loop));
3288 return NULL_TREE;
3289
3290 default:
3291 return NULL_TREE;
3292 }
3293 }
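
/* Illustrative sketch only: with four-element vectors and an SLP group of
   two independent PLUS_EXPR reductions s1 and s2, a partial group can be
   padded with the neutral value 0, e.g. { s1_init, s2_init, 0, 0 }, without
   changing either sum.  Likewise 1 is neutral for MULT_EXPR and all-ones
   for BIT_AND_EXPR, while for MIN/MAX only a reduction chain's single
   initial value is known to be safe padding, as returned above.  */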
3294
3295 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3296 STMT is printed with a message MSG. */
3297
3298 static void
3299 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3300 {
3301 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3302 }
3303
3304 /* Return true if we need an in-order (fold-left) reduction for operation
3305 CODE on type TYPE, i.e. if the reduction must be computed in the
3306 original scalar order. */
3307
3308 bool
3309 needs_fold_left_reduction_p (tree type, tree_code code)
3310 {
3311 /* CHECKME: check for !flag_finite_math_only too? */
3312 if (SCALAR_FLOAT_TYPE_P (type))
3313 switch (code)
3314 {
3315 case MIN_EXPR:
3316 case MAX_EXPR:
3317 return false;
3318
3319 default:
3320 return !flag_associative_math;
3321 }
3322
3323 if (INTEGRAL_TYPE_P (type))
3324 {
3325 if (!operation_no_trapping_overflow (type, code))
3326 return true;
3327 return false;
3328 }
3329
3330 if (SAT_FIXED_POINT_TYPE_P (type))
3331 return true;
3332
3333 return false;
3334 }
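
/* Illustrative sketch only: for a floating-point sum such as

     double s = 0.0;
     for (i = 0; i < n; i++)
       s = s + a[i];

   compiled without -fassociative-math the additions may not be
   reassociated, so this function returns true and the reduction has to
   be vectorized in-order (e.g. via IFN_FOLD_LEFT_PLUS, see
   fold_left_reduction_fn above) instead of using independent partial
   accumulators.  */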
3335
3336 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3337 has a handled computation expression. Store the main reduction
3338 operation in *CODE. */
3339
3340 static bool
3341 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3342 tree loop_arg, enum tree_code *code,
3343 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3344 {
3345 auto_bitmap visited;
3346 tree lookfor = PHI_RESULT (phi);
3347 ssa_op_iter curri;
3348 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3349 while (USE_FROM_PTR (curr) != loop_arg)
3350 curr = op_iter_next_use (&curri);
3351 curri.i = curri.numops;
3352 do
3353 {
3354 path.safe_push (std::make_pair (curri, curr));
3355 tree use = USE_FROM_PTR (curr);
3356 if (use == lookfor)
3357 break;
3358 gimple *def = SSA_NAME_DEF_STMT (use);
3359 if (gimple_nop_p (def)
3360 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3361 {
3362 pop:
3363 do
3364 {
3365 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3366 curri = x.first;
3367 curr = x.second;
3368 do
3369 curr = op_iter_next_use (&curri);
3370 /* Skip already visited or non-SSA operands (from iterating
3371 over PHI args). */
3372 while (curr != NULL_USE_OPERAND_P
3373 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3374 || ! bitmap_set_bit (visited,
3375 SSA_NAME_VERSION
3376 (USE_FROM_PTR (curr)))));
3377 }
3378 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3379 if (curr == NULL_USE_OPERAND_P)
3380 break;
3381 }
3382 else
3383 {
3384 if (gimple_code (def) == GIMPLE_PHI)
3385 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3386 else
3387 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3388 while (curr != NULL_USE_OPERAND_P
3389 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3390 || ! bitmap_set_bit (visited,
3391 SSA_NAME_VERSION
3392 (USE_FROM_PTR (curr)))))
3393 curr = op_iter_next_use (&curri);
3394 if (curr == NULL_USE_OPERAND_P)
3395 goto pop;
3396 }
3397 }
3398 while (1);
3399 if (dump_file && (dump_flags & TDF_DETAILS))
3400 {
3401 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3402 unsigned i;
3403 std::pair<ssa_op_iter, use_operand_p> *x;
3404 FOR_EACH_VEC_ELT (path, i, x)
3405 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3406 dump_printf (MSG_NOTE, "\n");
3407 }
3408
3409 /* Check whether the reduction path detected is valid. */
3410 bool fail = path.length () == 0;
3411 bool neg = false;
3412 int sign = -1;
3413 *code = ERROR_MARK;
3414 for (unsigned i = 1; i < path.length (); ++i)
3415 {
3416 gimple *use_stmt = USE_STMT (path[i].second);
3417 tree op = USE_FROM_PTR (path[i].second);
3418 if (! is_gimple_assign (use_stmt)
3419 /* The following makes sure we can compute the operand index
3420 easily, plus it mostly disallows chaining via COND_EXPR condition
3421 operands. */
3422 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3423 && (gimple_num_ops (use_stmt) <= 2
3424 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3425 && (gimple_num_ops (use_stmt) <= 3
3426 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3427 {
3428 fail = true;
3429 break;
3430 }
3431 /* Check there's only a single stmt the op is used on. For the
3432 non-value-changing tail and the last stmt, allow out-of-loop uses.
3433 ??? We could relax this and handle arbitrary live stmts by
3434 forcing a scalar epilogue for example. */
3435 imm_use_iterator imm_iter;
3436 gimple *op_use_stmt;
3437 unsigned cnt = 0;
3438 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3439 if (!is_gimple_debug (op_use_stmt)
3440 && (*code != ERROR_MARK
3441 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3442 {
3443 /* We want to allow x + x but not x < 1 ? x : 2. */
3444 if (is_gimple_assign (op_use_stmt)
3445 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3446 {
3447 use_operand_p use_p;
3448 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3449 cnt++;
3450 }
3451 else
3452 cnt++;
3453 }
3454 if (cnt != 1)
3455 {
3456 fail = true;
3457 break;
3458 }
3459 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3460 if (use_code == MINUS_EXPR)
3461 {
3462 use_code = PLUS_EXPR;
3463 /* Track whether we negate the reduction value each iteration. */
3464 if (gimple_assign_rhs2 (use_stmt) == op)
3465 neg = ! neg;
3466 }
3467 if (CONVERT_EXPR_CODE_P (use_code)
3468 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3469 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3470 ;
3471 else if (*code == ERROR_MARK)
3472 {
3473 *code = use_code;
3474 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3475 }
3476 else if (use_code != *code)
3477 {
3478 fail = true;
3479 break;
3480 }
3481 else if ((use_code == MIN_EXPR
3482 || use_code == MAX_EXPR)
3483 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3484 {
3485 fail = true;
3486 break;
3487 }
3488 }
3489 return ! fail && ! neg && *code != ERROR_MARK;
3490 }
3491
3492 bool
3493 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3494 tree loop_arg, enum tree_code code)
3495 {
3496 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3497 enum tree_code code_;
3498 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3499 && code_ == code);
3500 }
3501
3502
3503
3504 /* Function vect_is_simple_reduction
3505
3506 (1) Detect a cross-iteration def-use cycle that represents a simple
3507 reduction computation. We look for the following pattern:
3508
3509 loop_header:
3510 a1 = phi < a0, a2 >
3511 a3 = ...
3512 a2 = operation (a3, a1)
3513
3514 or
3515
3516 a3 = ...
3517 loop_header:
3518 a1 = phi < a0, a2 >
3519 a2 = operation (a3, a1)
3520
3521 such that:
3522 1. operation is commutative and associative and it is safe to
3523 change the order of the computation
3524 2. no uses for a2 in the loop (a2 is used out of the loop)
3525 3. no uses of a1 in the loop besides the reduction operation
3526 4. no uses of a1 outside the loop.
3527
3528 Conditions 1,4 are tested here.
3529 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3530
3531 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3532 nested cycles.
3533
3534 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3535 reductions:
3536
3537 a1 = phi < a0, a2 >
3538 inner loop (def of a3)
3539 a2 = phi < a3 >
3540
3541 (4) Detect condition expressions, i.e.:
3542 for (int i = 0; i < N; i++)
3543 if (a[i] < val)
3544 ret_val = a[i];
3545
3546 */
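
/* Illustrative sketch only: pattern (1) above corresponds to source code
   such as

     s = s0;
     for (i = 0; i < n; i++)
       s = s + a[i];

   where a1 is the loop PHI of s, a3 computes a[i] and a2 = a3 + a1 is the
   reduction statement whose value is carried to the next iteration and
   used after the loop.  */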
3547
3548 static stmt_vec_info
3549 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3550 bool *double_reduc, bool *reduc_chain_p)
3551 {
3552 gphi *phi = as_a <gphi *> (phi_info->stmt);
3553 gimple *phi_use_stmt = NULL;
3554 imm_use_iterator imm_iter;
3555 use_operand_p use_p;
3556
3557 *double_reduc = false;
3558 *reduc_chain_p = false;
3559 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3560
3561 tree phi_name = PHI_RESULT (phi);
3562 /* ??? If there are no uses of the PHI result the inner loop reduction
3563 won't be detected as possibly double-reduction by vectorizable_reduction
3564 because that tries to walk the PHI arg from the preheader edge which
3565 can be constant. See PR60382. */
3566 if (has_zero_uses (phi_name))
3567 return NULL;
3568 class loop *loop = (gimple_bb (phi))->loop_father;
3569 unsigned nphi_def_loop_uses = 0;
3570 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3571 {
3572 gimple *use_stmt = USE_STMT (use_p);
3573 if (is_gimple_debug (use_stmt))
3574 continue;
3575
3576 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3577 {
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3580 "intermediate value used outside loop.\n");
3581
3582 return NULL;
3583 }
3584
3585 nphi_def_loop_uses++;
3586 phi_use_stmt = use_stmt;
3587 }
3588
3589 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3590 if (TREE_CODE (latch_def) != SSA_NAME)
3591 {
3592 if (dump_enabled_p ())
3593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3594 "reduction: not ssa_name: %T\n", latch_def);
3595 return NULL;
3596 }
3597
3598 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3599 if (!def_stmt_info
3600 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3601 return NULL;
3602
3603 bool nested_in_vect_loop
3604 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3605 unsigned nlatch_def_loop_uses = 0;
3606 auto_vec<gphi *, 3> lcphis;
3607 bool inner_loop_of_double_reduc = false;
3608 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3609 {
3610 gimple *use_stmt = USE_STMT (use_p);
3611 if (is_gimple_debug (use_stmt))
3612 continue;
3613 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3614 nlatch_def_loop_uses++;
3615 else
3616 {
3617 /* We can have more than one loop-closed PHI. */
3618 lcphis.safe_push (as_a <gphi *> (use_stmt));
3619 if (nested_in_vect_loop
3620 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3621 == vect_double_reduction_def))
3622 inner_loop_of_double_reduc = true;
3623 }
3624 }
3625
3626 /* If we are vectorizing an inner reduction, we execute it in the
3627 original order only when we are not dealing with a
3628 double reduction. */
3629 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3630 {
3631 if (dump_enabled_p ())
3632 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3633 "detected nested cycle: ");
3634 return def_stmt_info;
3635 }
3636
3637 /* If this isn't a nested cycle or if the nested cycle reduction value
3638 is used outside of the inner loop, we cannot handle uses of the reduction
3639 value. */
3640 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3641 {
3642 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644 "reduction used in loop.\n");
3645 return NULL;
3646 }
3647
3648 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3649 defined in the inner loop. */
3650 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3651 {
3652 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3653 if (gimple_phi_num_args (def_stmt) != 1
3654 || TREE_CODE (op1) != SSA_NAME)
3655 {
3656 if (dump_enabled_p ())
3657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3658 "unsupported phi node definition.\n");
3659
3660 return NULL;
3661 }
3662
3663 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3664 if (gimple_bb (def1)
3665 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3666 && loop->inner
3667 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3668 && is_gimple_assign (def1)
3669 && is_a <gphi *> (phi_use_stmt)
3670 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3671 {
3672 if (dump_enabled_p ())
3673 report_vect_op (MSG_NOTE, def_stmt,
3674 "detected double reduction: ");
3675
3676 *double_reduc = true;
3677 return def_stmt_info;
3678 }
3679
3680 return NULL;
3681 }
3682
3683 /* Look for the expression computing latch_def from the loop PHI result. */
3684 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3685 enum tree_code code;
3686 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3687 path))
3688 {
3689 STMT_VINFO_REDUC_CODE (phi_info) = code;
3690 if (code == COND_EXPR && !nested_in_vect_loop)
3691 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3692
3693 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3694 reduction chain for which the additional restriction is that
3695 all operations in the chain are the same. */
3696 auto_vec<stmt_vec_info, 8> reduc_chain;
3697 unsigned i;
3698 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3699 for (i = path.length () - 1; i >= 1; --i)
3700 {
3701 gimple *stmt = USE_STMT (path[i].second);
3702 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3703 STMT_VINFO_REDUC_IDX (stmt_info)
3704 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3705 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3706 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3707 && (i == 1 || i == path.length () - 1));
3708 if ((stmt_code != code && !leading_conversion)
3709 /* We can only handle the final value in epilogue
3710 generation for reduction chains. */
3711 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3712 is_slp_reduc = false;
3713 /* For reduction chains we support trailing/leading
3714 conversions. We do not store those in the actual chain. */
3715 if (leading_conversion)
3716 continue;
3717 reduc_chain.safe_push (stmt_info);
3718 }
3719 if (is_slp_reduc && reduc_chain.length () > 1)
3720 {
3721 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3722 {
3723 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3724 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3725 }
3726 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3727 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3728
3729 /* Save the chain for further analysis in SLP detection. */
3730 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3731 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3732
3733 *reduc_chain_p = true;
3734 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_NOTE, vect_location,
3736 "reduction: detected reduction chain\n");
3737 }
3738 else if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "reduction: detected reduction\n");
3741
3742 return def_stmt_info;
3743 }
3744
3745 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_NOTE, vect_location,
3747 "reduction: unknown pattern\n");
3748
3749 return NULL;
3750 }
3751
3752 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3753 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3754 or -1 if not known. */
3755
3756 static int
3757 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3758 {
3759 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3760 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3761 {
3762 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_NOTE, vect_location,
3764 "cost model: epilogue peel iters set to vf/2 "
3765 "because loop iterations are unknown.\n");
3766 return assumed_vf / 2;
3767 }
3768 else
3769 {
3770 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3771 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3772 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3773 /* If we need to peel for gaps but the epilogue would otherwise need
3774 no iterations, we have to peel VF iterations. */
3775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3776 peel_iters_epilogue = assumed_vf;
3777 return peel_iters_epilogue;
3778 }
3779 }
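
/* Illustrative sketch only, with made-up numbers: if the scalar loop is
   known to run 23 iterations, the assumed VF is 8 and 3 prologue
   iterations are peeled, the epilogue gets (23 - 3) % 8 = 4 iterations;
   if peeling for gaps were required and that remainder were 0, a full
   VF (8) iterations would be peeled instead.  */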
3780
3781 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3782 int
3783 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3784 int *peel_iters_epilogue,
3785 stmt_vector_for_cost *scalar_cost_vec,
3786 stmt_vector_for_cost *prologue_cost_vec,
3787 stmt_vector_for_cost *epilogue_cost_vec)
3788 {
3789 int retval = 0;
3790
3791 *peel_iters_epilogue
3792 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3793
3794 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3795 {
3796 /* If peeled iterations are known but the number of scalar loop
3797 iterations is unknown, count a taken branch per peeled loop. */
3798 if (peel_iters_prologue > 0)
3799 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3800 NULL, NULL_TREE, 0, vect_prologue);
3801 if (*peel_iters_epilogue > 0)
3802 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3803 NULL, NULL_TREE, 0, vect_epilogue);
3804 }
3805
3806 stmt_info_for_cost *si;
3807 int j;
3808 if (peel_iters_prologue)
3809 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3810 retval += record_stmt_cost (prologue_cost_vec,
3811 si->count * peel_iters_prologue,
3812 si->kind, si->stmt_info, si->misalign,
3813 vect_prologue);
3814 if (*peel_iters_epilogue)
3815 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3816 retval += record_stmt_cost (epilogue_cost_vec,
3817 si->count * *peel_iters_epilogue,
3818 si->kind, si->stmt_info, si->misalign,
3819 vect_epilogue);
3820
3821 return retval;
3822 }
3823
3824 /* Function vect_estimate_min_profitable_iters
3825
3826 Return the number of iterations required for the vector version of the
3827 loop to be profitable relative to the cost of the scalar version of the
3828 loop.
3829
3830 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3831 of iterations for vectorization. A value of -1 means loop vectorization
3832 is not profitable. This returned value may be used for a dynamic
3833 profitability check.
3834
3835 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3836 for static check against estimated number of iterations. */
3837
3838 static void
3839 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3840 int *ret_min_profitable_niters,
3841 int *ret_min_profitable_estimate)
3842 {
3843 int min_profitable_iters;
3844 int min_profitable_estimate;
3845 int peel_iters_prologue;
3846 int peel_iters_epilogue;
3847 unsigned vec_inside_cost = 0;
3848 int vec_outside_cost = 0;
3849 unsigned vec_prologue_cost = 0;
3850 unsigned vec_epilogue_cost = 0;
3851 int scalar_single_iter_cost = 0;
3852 int scalar_outside_cost = 0;
3853 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3854 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3855 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3856
3857 /* Cost model disabled. */
3858 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3859 {
3860 if (dump_enabled_p ())
3861 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3862 *ret_min_profitable_niters = 0;
3863 *ret_min_profitable_estimate = 0;
3864 return;
3865 }
3866
3867 /* Requires loop versioning tests to handle misalignment. */
3868 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3869 {
3870 /* FIXME: Make cost depend on complexity of individual check. */
3871 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3872 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3873 NULL, NULL_TREE, 0, vect_prologue);
3874 if (dump_enabled_p ())
3875 dump_printf (MSG_NOTE,
3876 "cost model: Adding cost of checks for loop "
3877 "versioning to treat misalignment.\n");
3878 }
3879
3880 /* Requires loop versioning with alias checks. */
3881 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3882 {
3883 /* FIXME: Make cost depend on complexity of individual check. */
3884 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3885 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3886 NULL, NULL_TREE, 0, vect_prologue);
3887 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3888 if (len)
3889 /* Count LEN - 1 ANDs and LEN comparisons. */
3890 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3891 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3892 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3893 if (len)
3894 {
3895 /* Count LEN - 1 ANDs and LEN comparisons. */
3896 unsigned int nstmts = len * 2 - 1;
3897 /* +1 for each bias that needs adding. */
3898 for (unsigned int i = 0; i < len; ++i)
3899 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3900 nstmts += 1;
3901 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3902 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3903 }
3904 if (dump_enabled_p ())
3905 dump_printf (MSG_NOTE,
3906 "cost model: Adding cost of checks for loop "
3907 "versioning aliasing.\n");
3908 }
3909
3910 /* Requires loop versioning with niter checks. */
3911 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3912 {
3913 /* FIXME: Make cost depend on complexity of individual check. */
3914 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3915 NULL, NULL_TREE, 0, vect_prologue);
3916 if (dump_enabled_p ())
3917 dump_printf (MSG_NOTE,
3918 "cost model: Adding cost of checks for loop "
3919 "versioning niters.\n");
3920 }
3921
3922 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3923 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3924 NULL, NULL_TREE, 0, vect_prologue);
3925
3926 /* Count statements in scalar loop. Using this as scalar cost for a single
3927 iteration for now.
3928
3929 TODO: Add outer loop support.
3930
3931 TODO: Consider assigning different costs to different scalar
3932 statements. */
3933
3934 scalar_single_iter_cost
3935 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3936
3937 /* Add additional cost for the peeled instructions in prologue and epilogue
3938 loop. (For fully-masked loops there will be no peeling.)
3939
3940 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3941 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3942
3943 TODO: Build an expression that represents peel_iters for prologue and
3944 epilogue to be used in a run-time test. */
3945
3946 bool prologue_need_br_taken_cost = false;
3947 bool prologue_need_br_not_taken_cost = false;
3948
3949 /* Calculate peel_iters_prologue. */
3950 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3951 peel_iters_prologue = 0;
3952 else if (npeel < 0)
3953 {
3954 peel_iters_prologue = assumed_vf / 2;
3955 if (dump_enabled_p ())
3956 dump_printf (MSG_NOTE, "cost model: "
3957 "prologue peel iters set to vf/2.\n");
3958
3959 /* If peeled iterations are unknown, count a taken branch and a not taken
3960 branch per peeled loop. Even if scalar loop iterations are known,
3961 vector iterations are not known since peeled prologue iterations are
3962 not known. Hence guards remain the same. */
3963 prologue_need_br_taken_cost = true;
3964 prologue_need_br_not_taken_cost = true;
3965 }
3966 else
3967 {
3968 peel_iters_prologue = npeel;
3969 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3970 /* If peeled iterations are known but the number of scalar loop
3971 iterations is unknown, count a taken branch per peeled loop. */
3972 prologue_need_br_taken_cost = true;
3973 }
3974
3975 bool epilogue_need_br_taken_cost = false;
3976 bool epilogue_need_br_not_taken_cost = false;
3977
3978 /* Calculate peel_iters_epilogue. */
3979 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3980 /* We need to peel exactly one iteration for gaps. */
3981 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3982 else if (npeel < 0)
3983 {
3984 /* If peeling for alignment is unknown, the loop bound of the main
3985 loop becomes unknown. */
3986 peel_iters_epilogue = assumed_vf / 2;
3987 if (dump_enabled_p ())
3988 dump_printf (MSG_NOTE, "cost model: "
3989 "epilogue peel iters set to vf/2 because "
3990 "peeling for alignment is unknown.\n");
3991
3992 /* See the same reason above in peel_iters_prologue calculation. */
3993 epilogue_need_br_taken_cost = true;
3994 epilogue_need_br_not_taken_cost = true;
3995 }
3996 else
3997 {
3998 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3999 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4000 /* If peeled iterations are known but the number of scalar loop
4001 iterations is unknown, count a taken branch per peeled loop. */
4002 epilogue_need_br_taken_cost = true;
4003 }
4004
4005 stmt_info_for_cost *si;
4006 int j;
4007 /* Add costs associated with peel_iters_prologue. */
4008 if (peel_iters_prologue)
4009 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4010 {
4011 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4012 si->count * peel_iters_prologue, si->kind,
4013 si->stmt_info, si->vectype, si->misalign,
4014 vect_prologue);
4015 }
4016
4017 /* Add costs associated with peel_iters_epilogue. */
4018 if (peel_iters_epilogue)
4019 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4020 {
4021 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4022 si->count * peel_iters_epilogue, si->kind,
4023 si->stmt_info, si->vectype, si->misalign,
4024 vect_epilogue);
4025 }
4026
4027 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4028
4029 if (prologue_need_br_taken_cost)
4030 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4031 NULL, NULL_TREE, 0, vect_prologue);
4032
4033 if (prologue_need_br_not_taken_cost)
4034 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4035 cond_branch_not_taken, NULL, NULL_TREE, 0,
4036 vect_prologue);
4037
4038 if (epilogue_need_br_taken_cost)
4039 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4040 NULL, NULL_TREE, 0, vect_epilogue);
4041
4042 if (epilogue_need_br_not_taken_cost)
4043 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4044 cond_branch_not_taken, NULL, NULL_TREE, 0,
4045 vect_epilogue);
4046
4047 /* Take care of special costs for rgroup controls of partial vectors. */
4048 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4049 {
4050 /* Calculate how many masks we need to generate. */
4051 unsigned int num_masks = 0;
4052 rgroup_controls *rgm;
4053 unsigned int num_vectors_m1;
4054 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4055 if (rgm->type)
4056 num_masks += num_vectors_m1 + 1;
4057 gcc_assert (num_masks > 0);
4058
4059 /* In the worst case, we need to generate each mask in the prologue
4060 and in the loop body. One of the loop body mask instructions
4061 replaces the comparison in the scalar loop, and since we don't
4062 count the scalar comparison against the scalar body, we shouldn't
4063 count that vector instruction against the vector body either.
4064
4065 Sometimes we can use unpacks instead of generating prologue
4066 masks and sometimes the prologue mask will fold to a constant,
4067 so the actual prologue cost might be smaller. However, it's
4068 simpler and safer to use the worst-case cost; if this ends up
4069 being the tie-breaker between vectorizing or not, then it's
4070 probably better not to vectorize. */
4071 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4072 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4073 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4074 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4075 }
4076 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4077 {
4078 /* Referring to the functions vect_set_loop_condition_partial_vectors
4079 and vect_set_loop_controls_directly, we need to generate each
4080 length in the prologue and in the loop body if required. Although
4081 there are some possible optimizations, we consider the worst case
4082 here. */
4083
4084 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4085 bool need_iterate_p
4086 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4087 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4088
4089 /* Calculate how many statements to be added. */
4090 unsigned int prologue_stmts = 0;
4091 unsigned int body_stmts = 0;
4092
4093 rgroup_controls *rgc;
4094 unsigned int num_vectors_m1;
4095 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4096 if (rgc->type)
4097 {
4098 /* May need one SHIFT for nitems_total computation. */
4099 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4100 if (nitems != 1 && !niters_known_p)
4101 prologue_stmts += 1;
4102
4103 /* May need one MAX and one MINUS for wrap around. */
4104 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4105 prologue_stmts += 2;
4106
4107 /* Need one MAX and one MINUS for each batch limit except for
4108 the first one. */
4109 prologue_stmts += num_vectors_m1 * 2;
4110
4111 unsigned int num_vectors = num_vectors_m1 + 1;
4112
4113 /* Need to set up lengths in prologue, only one MIN required
4114 for each since start index is zero. */
4115 prologue_stmts += num_vectors;
4116
4117 /* Each may need two MINs and one MINUS to update lengths in body
4118 for next iteration. */
4119 if (need_iterate_p)
4120 body_stmts += 3 * num_vectors;
4121 }
4122
4123 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4124 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4125 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4126 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4127 }
4128
4129 /* FORNOW: The scalar outside cost is incremented in one of the
4130 following ways:
4131
4132 1. The vectorizer checks for alignment and aliasing and generates
4133 a condition that allows dynamic vectorization. A cost model
4134 check is ANDED with the versioning condition. Hence scalar code
4135 path now has the added cost of the versioning check.
4136
4137 if (cost > th & versioning_check)
4138 jmp to vector code
4139
4140 Hence run-time scalar is incremented by not-taken branch cost.
4141
4142 2. The vectorizer then checks if a prologue is required. If the
4143 cost model check was not done before during versioning, it has to
4144 be done before the prologue check.
4145
4146 if (cost <= th)
4147 prologue = scalar_iters
4148 if (prologue == 0)
4149 jmp to vector code
4150 else
4151 execute prologue
4152 if (prologue == num_iters)
4153 go to exit
4154
4155 Hence the run-time scalar cost is incremented by a taken branch,
4156 plus a not-taken branch, plus a taken branch cost.
4157
4158 3. The vectorizer then checks if an epilogue is required. If the
4159 cost model check was not done before during prologue check, it
4160 has to be done with the epilogue check.
4161
4162 if (prologue == 0)
4163 jmp to vector code
4164 else
4165 execute prologue
4166 if (prologue == num_iters)
4167 go to exit
4168 vector code:
4169 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4170 jmp to epilogue
4171
4172 Hence the run-time scalar cost should be incremented by 2 taken
4173 branches.
4174
4175 TODO: The back end may reorder the BBS's differently and reverse
4176 conditions/branch directions. Change the estimates below to
4177 something more reasonable. */
4178
4179 /* If the number of iterations is known and we do not do versioning, we can
4180 decide whether to vectorize at compile time. Hence the scalar version
4181 does not carry cost model guard costs. */
4182 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4183 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4184 {
4185 /* Cost model check occurs at versioning. */
4186 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4187 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4188 else
4189 {
4190 /* Cost model check occurs at prologue generation. */
4191 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4192 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4193 + vect_get_stmt_cost (cond_branch_not_taken);
4194 /* Cost model check occurs at epilogue generation. */
4195 else
4196 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4197 }
4198 }
4199
4200 /* Complete the target-specific cost calculations. */
4201 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4202 &vec_inside_cost, &vec_epilogue_cost);
4203
4204 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4205
4206 /* Stash the costs so that we can compare two loop_vec_infos. */
4207 loop_vinfo->vec_inside_cost = vec_inside_cost;
4208 loop_vinfo->vec_outside_cost = vec_outside_cost;
4209
4210 if (dump_enabled_p ())
4211 {
4212 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4213 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4214 vec_inside_cost);
4215 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4216 vec_prologue_cost);
4217 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4218 vec_epilogue_cost);
4219 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4220 scalar_single_iter_cost);
4221 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4222 scalar_outside_cost);
4223 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4224 vec_outside_cost);
4225 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4226 peel_iters_prologue);
4227 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4228 peel_iters_epilogue);
4229 }
4230
4231 /* Calculate number of iterations required to make the vector version
4232 profitable, relative to the loop bodies only. The following condition
4233 must hold true:
4234 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4235 where
4236 SIC = scalar iteration cost, VIC = vector iteration cost,
4237 VOC = vector outside cost, VF = vectorization factor,
4238 NPEEL = prologue iterations + epilogue iterations,
4239 SOC = scalar outside cost for run time cost model check. */
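
/* Illustrative sketch only, with made-up costs: for SIC = 4, VIC = 6,
   VF = 4, NPEEL = 2, VOC = 20 and SOC = 0 the condition reads
   4 * niters > 6 * ((niters - 2) / 4) + 20, which first holds at
   niters = 7; the code below also makes sure the returned threshold
   lets the vectorized loop execute at least once.  */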
4240
4241 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4242 - vec_inside_cost);
4243 if (saving_per_viter <= 0)
4244 {
4245 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4246 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4247 "vectorization did not happen for a simd loop");
4248
4249 if (dump_enabled_p ())
4250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4251 "cost model: the vector iteration cost = %d "
4252 "divided by the scalar iteration cost = %d "
4253 "is greater or equal to the vectorization factor = %d"
4254 ".\n",
4255 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4256 *ret_min_profitable_niters = -1;
4257 *ret_min_profitable_estimate = -1;
4258 return;
4259 }
4260
4261 /* ??? The "if" arm is written to handle all cases; see below for what
4262 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4263 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4264 {
4265 /* Rewriting the condition above in terms of the number of
4266 vector iterations (vniters) rather than the number of
4267 scalar iterations (niters) gives:
4268
4269 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4270
4271 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4272
4273 For integer N, X and Y when X > 0:
4274
4275 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4276 int outside_overhead = (vec_outside_cost
4277 - scalar_single_iter_cost * peel_iters_prologue
4278 - scalar_single_iter_cost * peel_iters_epilogue
4279 - scalar_outside_cost);
4280 /* We're only interested in cases that require at least one
4281 vector iteration. */
4282 int min_vec_niters = 1;
4283 if (outside_overhead > 0)
4284 min_vec_niters = outside_overhead / saving_per_viter + 1;
4285
4286 if (dump_enabled_p ())
4287 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4288 min_vec_niters);
4289
4290 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4291 {
4292 /* Now that we know the minimum number of vector iterations,
4293 find the minimum niters for which the scalar cost is larger:
4294
4295 SIC * niters > VIC * vniters + VOC - SOC
4296
4297 We know that the minimum niters is no more than
4298 vniters * VF + NPEEL, but it might be (and often is) less
4299 than that if a partial vector iteration is cheaper than the
4300 equivalent scalar code. */
4301 int threshold = (vec_inside_cost * min_vec_niters
4302 + vec_outside_cost
4303 - scalar_outside_cost);
4304 if (threshold <= 0)
4305 min_profitable_iters = 1;
4306 else
4307 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4308 }
4309 else
4310 /* Convert the number of vector iterations into a number of
4311 scalar iterations. */
4312 min_profitable_iters = (min_vec_niters * assumed_vf
4313 + peel_iters_prologue
4314 + peel_iters_epilogue);
4315 }
4316 else
4317 {
4318 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4319 * assumed_vf
4320 - vec_inside_cost * peel_iters_prologue
4321 - vec_inside_cost * peel_iters_epilogue);
4322 if (min_profitable_iters <= 0)
4323 min_profitable_iters = 0;
4324 else
4325 {
4326 min_profitable_iters /= saving_per_viter;
4327
4328 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4329 <= (((int) vec_inside_cost * min_profitable_iters)
4330 + (((int) vec_outside_cost - scalar_outside_cost)
4331 * assumed_vf)))
4332 min_profitable_iters++;
4333 }
4334 }
4335
4336 if (dump_enabled_p ())
4337 dump_printf (MSG_NOTE,
4338 " Calculated minimum iters for profitability: %d\n",
4339 min_profitable_iters);
4340
4341 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4342 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4343 /* We want the vectorized loop to execute at least once. */
4344 min_profitable_iters = assumed_vf + peel_iters_prologue;
4345 else if (min_profitable_iters < peel_iters_prologue)
4346 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4347 vectorized loop executes at least once. */
4348 min_profitable_iters = peel_iters_prologue;
4349
4350 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE, vect_location,
4352 " Runtime profitability threshold = %d\n",
4353 min_profitable_iters);
4354
4355 *ret_min_profitable_niters = min_profitable_iters;
4356
4357 /* Calculate number of iterations required to make the vector version
4358 profitable, relative to the loop bodies only.
4359
4360 The non-vectorized variant costs SIC * niters and it must win over the
4361 vector variant on the expected loop trip count. The following condition must hold true:
4362 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4363
4364 if (vec_outside_cost <= 0)
4365 min_profitable_estimate = 0;
4366 /* ??? This "else if" arm is written to handle all cases; see below for
4367 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4368 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4369 {
4370 /* This is a repeat of the code above, but with + SOC rather
4371 than - SOC. */
4372 int outside_overhead = (vec_outside_cost
4373 - scalar_single_iter_cost * peel_iters_prologue
4374 - scalar_single_iter_cost * peel_iters_epilogue
4375 + scalar_outside_cost);
4376 int min_vec_niters = 1;
4377 if (outside_overhead > 0)
4378 min_vec_niters = outside_overhead / saving_per_viter + 1;
4379
4380 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4381 {
4382 int threshold = (vec_inside_cost * min_vec_niters
4383 + vec_outside_cost
4384 + scalar_outside_cost);
4385 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4386 }
4387 else
4388 min_profitable_estimate = (min_vec_niters * assumed_vf
4389 + peel_iters_prologue
4390 + peel_iters_epilogue);
4391 }
4392 else
4393 {
4394 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4395 * assumed_vf
4396 - vec_inside_cost * peel_iters_prologue
4397 - vec_inside_cost * peel_iters_epilogue)
4398 / ((scalar_single_iter_cost * assumed_vf)
4399 - vec_inside_cost);
4400 }
4401 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4402 if (dump_enabled_p ())
4403 dump_printf_loc (MSG_NOTE, vect_location,
4404 " Static estimate profitability threshold = %d\n",
4405 min_profitable_estimate);
4406
4407 *ret_min_profitable_estimate = min_profitable_estimate;
4408 }
4409
4410 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4411 vector elements (not bits) for a vector with NELT elements. */
4412 static void
4413 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4414 vec_perm_builder *sel)
4415 {
4416 /* The encoding is a single stepped pattern. Any wrap-around is handled
4417 by vec_perm_indices. */
4418 sel->new_vector (nelt, 1, 3);
4419 for (unsigned int i = 0; i < 3; i++)
4420 sel->quick_push (i + offset);
4421 }
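
/* Illustrative sketch only: for OFFSET = 2 the three encoded elements are
   { 2, 3, 4 }; vec_perm_indices extends this stepped pattern, so with
   NELT = 8 the full selector is { 2, 3, 4, 5, 6, 7, 8, 9 }, i.e. the
   vector shifted down by two elements with the trailing lanes taken from
   the second permute input (zeros when modelling an actual vec_shr).  */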
4422
4423 /* Checks whether the target supports whole-vector shifts for vectors of mode
4424 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4425 it supports vec_perm_const with masks for all necessary shift amounts. */
4426 static bool
4427 have_whole_vector_shift (machine_mode mode)
4428 {
4429 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4430 return true;
4431
4432 /* Variable-length vectors should be handled via the optab. */
4433 unsigned int nelt;
4434 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4435 return false;
4436
4437 vec_perm_builder sel;
4438 vec_perm_indices indices;
4439 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4440 {
4441 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4442 indices.new_vector (sel, 2, nelt);
4443 if (!can_vec_perm_const_p (mode, indices, false))
4444 return false;
4445 }
4446 return true;
4447 }
4448
4449 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4450 functions. Design better to avoid maintenance issues. */
4451
4452 /* Function vect_model_reduction_cost.
4453
4454 Models cost for a reduction operation, including the vector ops
4455 generated within the strip-mine loop in some cases, the initial
4456 definition before the loop, and the epilogue code that must be generated. */
4457
4458 static void
4459 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4460 stmt_vec_info stmt_info, internal_fn reduc_fn,
4461 vect_reduction_type reduction_type,
4462 int ncopies, stmt_vector_for_cost *cost_vec)
4463 {
4464 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4465 enum tree_code code;
4466 optab optab;
4467 tree vectype;
4468 machine_mode mode;
4469 class loop *loop = NULL;
4470
4471 if (loop_vinfo)
4472 loop = LOOP_VINFO_LOOP (loop_vinfo);
4473
4474 /* Condition reductions generate two reductions in the loop. */
4475 if (reduction_type == COND_REDUCTION)
4476 ncopies *= 2;
4477
4478 vectype = STMT_VINFO_VECTYPE (stmt_info);
4479 mode = TYPE_MODE (vectype);
4480 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4481
4482 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4483
4484 if (reduction_type == EXTRACT_LAST_REDUCTION)
4485 /* No extra instructions are needed in the prologue. The loop body
4486 operations are costed in vectorizable_condition. */
4487 inside_cost = 0;
4488 else if (reduction_type == FOLD_LEFT_REDUCTION)
4489 {
4490 /* No extra instructions needed in the prologue. */
4491 prologue_cost = 0;
4492
4493 if (reduc_fn != IFN_LAST)
4494 /* Count one reduction-like operation per vector. */
4495 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4496 stmt_info, 0, vect_body);
4497 else
4498 {
4499 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4500 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4501 inside_cost = record_stmt_cost (cost_vec, nelements,
4502 vec_to_scalar, stmt_info, 0,
4503 vect_body);
4504 inside_cost += record_stmt_cost (cost_vec, nelements,
4505 scalar_stmt, stmt_info, 0,
4506 vect_body);
4507 }
4508 }
4509 else
4510 {
4511 /* Add in cost for initial definition.
4512 For cond reduction we have four vectors: initial index, step,
4513 initial result of the data reduction, initial value of the index
4514 reduction. */
4515 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4516 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4517 scalar_to_vec, stmt_info, 0,
4518 vect_prologue);
4519 }
4520
4521 /* Determine cost of epilogue code.
4522
4523 We have a reduction operator that will reduce the vector in one statement.
4524 Also requires scalar extract. */
4525
4526 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4527 {
4528 if (reduc_fn != IFN_LAST)
4529 {
4530 if (reduction_type == COND_REDUCTION)
4531 {
4532 /* An EQ stmt and a COND_EXPR stmt. */
4533 epilogue_cost += record_stmt_cost (cost_vec, 2,
4534 vector_stmt, stmt_info, 0,
4535 vect_epilogue);
4536 /* Reduction of the max index and a reduction of the found
4537 values. */
4538 epilogue_cost += record_stmt_cost (cost_vec, 2,
4539 vec_to_scalar, stmt_info, 0,
4540 vect_epilogue);
4541 /* A broadcast of the max value. */
4542 epilogue_cost += record_stmt_cost (cost_vec, 1,
4543 scalar_to_vec, stmt_info, 0,
4544 vect_epilogue);
4545 }
4546 else
4547 {
4548 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4549 stmt_info, 0, vect_epilogue);
4550 epilogue_cost += record_stmt_cost (cost_vec, 1,
4551 vec_to_scalar, stmt_info, 0,
4552 vect_epilogue);
4553 }
4554 }
4555 else if (reduction_type == COND_REDUCTION)
4556 {
4557 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4558 /* Extraction of scalar elements. */
4559 epilogue_cost += record_stmt_cost (cost_vec,
4560 2 * estimated_nunits,
4561 vec_to_scalar, stmt_info, 0,
4562 vect_epilogue);
4563 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4564 epilogue_cost += record_stmt_cost (cost_vec,
4565 2 * estimated_nunits - 3,
4566 scalar_stmt, stmt_info, 0,
4567 vect_epilogue);
4568 }
4569 else if (reduction_type == EXTRACT_LAST_REDUCTION
4570 || reduction_type == FOLD_LEFT_REDUCTION)
4571 /* No extra instructions are needed in the epilogue. */
4572 ;
4573 else
4574 {
4575 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4576 tree bitsize =
4577 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4578 int element_bitsize = tree_to_uhwi (bitsize);
4579 int nelements = vec_size_in_bits / element_bitsize;
4580
4581 if (code == COND_EXPR)
4582 code = MAX_EXPR;
4583
4584 optab = optab_for_tree_code (code, vectype, optab_default);
4585
4586 /* We have a whole vector shift available. */
4587 if (optab != unknown_optab
4588 && VECTOR_MODE_P (mode)
4589 && optab_handler (optab, mode) != CODE_FOR_nothing
4590 && have_whole_vector_shift (mode))
4591 {
4592 /* Final reduction via vector shifts and the reduction operator.
4593 Also requires scalar extract. */
4594 epilogue_cost += record_stmt_cost (cost_vec,
4595 exact_log2 (nelements) * 2,
4596 vector_stmt, stmt_info, 0,
4597 vect_epilogue);
4598 epilogue_cost += record_stmt_cost (cost_vec, 1,
4599 vec_to_scalar, stmt_info, 0,
4600 vect_epilogue);
4601 }
4602 else
4603 /* Use extracts and reduction op for final reduction. For N
4604 elements, we have N extracts and N-1 reduction ops. */
4605 epilogue_cost += record_stmt_cost (cost_vec,
4606 nelements + nelements - 1,
4607 vector_stmt, stmt_info, 0,
4608 vect_epilogue);
4609 }
4610 }
4611
4612 if (dump_enabled_p ())
4613 dump_printf (MSG_NOTE,
4614 "vect_model_reduction_cost: inside_cost = %d, "
4615 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4616 prologue_cost, epilogue_cost);
4617 }
4618
4619
4620
4621 /* Function get_initial_def_for_reduction
4622
4623 Input:
4624 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4625 INIT_VAL - the initial value of the reduction variable
4626
4627 Output:
4628 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4629 of the reduction (used for adjusting the epilog - see below).
4630 Return a vector variable, initialized according to the operation that
4631 STMT_VINFO performs. This vector will be used as the initial value
4632 of the vector of partial results.
4633
4634 Option1 (adjust in epilog): Initialize the vector as follows:
4635 add/bit or/xor: [0,0,...,0,0]
4636 mult/bit and: [1,1,...,1,1]
4637 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4638 and when necessary (e.g. add/mult case) let the caller know
4639 that it needs to adjust the result by init_val.
4640
4641 Option2: Initialize the vector as follows:
4642 add/bit or/xor: [init_val,0,0,...,0]
4643 mult/bit and: [init_val,1,1,...,1]
4644 min/max/cond_expr: [init_val,init_val,...,init_val]
4645 and no adjustments are needed.
4646
4647 For example, for the following code:
4648
4649 s = init_val;
4650 for (i=0;i<n;i++)
4651 s = s + a[i];
4652
4653 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4654 For a vector of 4 units, we want to return either [0,0,0,init_val],
4655 or [0,0,0,0] and let the caller know that it needs to adjust
4656 the result at the end by 'init_val'.
4657
4658 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4659 is not NULL, because that way the initialization vector is simpler
4660 (same element in all entries); otherwise we use Option2.
4661
4662 A cost model should help decide between these two schemes. */
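
/* Illustrative sketch only: for a product reduction p = init_val;
   p *= a[i]; with four-element vectors, Option1 starts from [1,1,1,1]
   and returns init_val in ADJUSTMENT_DEF so the caller folds it back in
   with the reduction operation after the epilogue reduction, whereas
   Option2 starts from [init_val,1,1,1] and needs no adjustment.  */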
4663
4664 static tree
4665 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4666 stmt_vec_info stmt_vinfo,
4667 enum tree_code code, tree init_val,
4668 tree *adjustment_def)
4669 {
4670 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4671 tree scalar_type = TREE_TYPE (init_val);
4672 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4673 tree def_for_init;
4674 tree init_def;
4675 REAL_VALUE_TYPE real_init_val = dconst0;
4676 int int_init_val = 0;
4677 gimple_seq stmts = NULL;
4678
4679 gcc_assert (vectype);
4680
4681 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4682 || SCALAR_FLOAT_TYPE_P (scalar_type));
4683
4684 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4685 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4686
4687 /* ADJUSTMENT_DEF is NULL when called from
4688 vect_create_epilog_for_reduction to vectorize double reduction. */
4689 if (adjustment_def)
4690 *adjustment_def = NULL;
4691
4692 switch (code)
4693 {
4694 case WIDEN_SUM_EXPR:
4695 case DOT_PROD_EXPR:
4696 case SAD_EXPR:
4697 case PLUS_EXPR:
4698 case MINUS_EXPR:
4699 case BIT_IOR_EXPR:
4700 case BIT_XOR_EXPR:
4701 case MULT_EXPR:
4702 case BIT_AND_EXPR:
4703 {
4704 if (code == MULT_EXPR)
4705 {
4706 real_init_val = dconst1;
4707 int_init_val = 1;
4708 }
4709
4710 if (code == BIT_AND_EXPR)
4711 int_init_val = -1;
4712
4713 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4714 def_for_init = build_real (scalar_type, real_init_val);
4715 else
4716 def_for_init = build_int_cst (scalar_type, int_init_val);
4717
4718 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4719 {
4720 /* Option1: the first element is '0' or '1' as well. */
4721 if (!operand_equal_p (def_for_init, init_val, 0))
4722 *adjustment_def = init_val;
4723 init_def = gimple_build_vector_from_val (&stmts, vectype,
4724 def_for_init);
4725 }
4726 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4727 {
4728 /* Option2 (variable length): the first element is INIT_VAL. */
4729 init_def = gimple_build_vector_from_val (&stmts, vectype,
4730 def_for_init);
4731 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4732 vectype, init_def, init_val);
4733 }
4734 else
4735 {
4736 /* Option2: the first element is INIT_VAL. */
4737 tree_vector_builder elts (vectype, 1, 2);
4738 elts.quick_push (init_val);
4739 elts.quick_push (def_for_init);
4740 init_def = gimple_build_vector (&stmts, &elts);
4741 }
4742 }
4743 break;
4744
4745 case MIN_EXPR:
4746 case MAX_EXPR:
4747 case COND_EXPR:
4748 {
4749 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4750 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4751 }
4752 break;
4753
4754 default:
4755 gcc_unreachable ();
4756 }
4757
4758 if (stmts)
4759 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4760 return init_def;
4761 }
4762
4763 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4764 NUMBER_OF_VECTORS is the number of vector defs to create.
4765 If NEUTRAL_OP is nonnull, introducing extra elements of that
4766 value will not change the result. */
4767
4768 static void
4769 get_initial_defs_for_reduction (vec_info *vinfo,
4770 slp_tree slp_node,
4771 vec<tree> *vec_oprnds,
4772 unsigned int number_of_vectors,
4773 bool reduc_chain, tree neutral_op)
4774 {
4775 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4776 stmt_vec_info stmt_vinfo = stmts[0];
4777 unsigned HOST_WIDE_INT nunits;
4778 unsigned j, number_of_places_left_in_vector;
4779 tree vector_type;
4780 unsigned int group_size = stmts.length ();
4781 unsigned int i;
4782 class loop *loop;
4783
4784 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4785
4786 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4787
4788 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4789 gcc_assert (loop);
4790 edge pe = loop_preheader_edge (loop);
4791
4792 gcc_assert (!reduc_chain || neutral_op);
4793
4794 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4795 created vectors. It is greater than 1 if unrolling is performed.
4796
4797 For example, we have two scalar operands, s1 and s2 (e.g., group of
4798 strided accesses of size two), while NUNITS is four (i.e., four scalars
4799 of this type can be packed in a vector). The output vector will contain
4800 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4801 will be 2).
4802
4803 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4804 vectors containing the operands.
4805
4806 For example, NUNITS is four as before, and the group size is 8
4807 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4808 {s5, s6, s7, s8}. */
4809
4810 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4811 nunits = group_size;
4812
4813 number_of_places_left_in_vector = nunits;
4814 bool constant_p = true;
4815 tree_vector_builder elts (vector_type, nunits, 1);
4816 elts.quick_grow (nunits);
4817 gimple_seq ctor_seq = NULL;
4818 for (j = 0; j < nunits * number_of_vectors; ++j)
4819 {
4820 tree op;
4821 i = j % group_size;
4822 stmt_vinfo = stmts[i];
4823
4824 /* Get the def before the loop. In a reduction chain we have only
4825 one initial value; otherwise as many as there are PHIs in the group. */
4826 if (reduc_chain)
4827 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4828 else if (((vec_oprnds->length () + 1) * nunits
4829 - number_of_places_left_in_vector >= group_size)
4830 && neutral_op)
4831 op = neutral_op;
4832 else
4833 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4834
4835 /* Create 'vect_ = {op0,op1,...,opn}'. */
4836 number_of_places_left_in_vector--;
4837 elts[nunits - number_of_places_left_in_vector - 1] = op;
4838 if (!CONSTANT_CLASS_P (op))
4839 constant_p = false;
4840
4841 if (number_of_places_left_in_vector == 0)
4842 {
4843 tree init;
4844 if (constant_p && !neutral_op
4845 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4846 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4847 /* Build the vector directly from ELTS. */
4848 init = gimple_build_vector (&ctor_seq, &elts);
4849 else if (neutral_op)
4850 {
4851 /* Build a vector of the neutral value and shift the
4852 other elements into place. */
4853 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4854 neutral_op);
4855 int k = nunits;
4856 while (k > 0 && elts[k - 1] == neutral_op)
4857 k -= 1;
4858 while (k > 0)
4859 {
4860 k -= 1;
4861 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4862 vector_type, init, elts[k]);
4863 }
4864 }
4865 else
4866 {
4867 /* First time round, duplicate ELTS to fill the
4868 required number of vectors. */
4869 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4870 number_of_vectors, *vec_oprnds);
4871 break;
4872 }
4873 vec_oprnds->quick_push (init);
4874
4875 number_of_places_left_in_vector = nunits;
4876 elts.new_vector (vector_type, nunits, 1);
4877 elts.quick_grow (nunits);
4878 constant_p = true;
4879 }
4880 }
4881 if (ctor_seq != NULL)
4882 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4883 }
4884
4885 /* For a statement STMT_INFO taking part in a reduction operation, return
4886 the stmt_vec_info that the meta information is stored on. */
4887
4888 stmt_vec_info
4889 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4890 {
4891 stmt_info = vect_orig_stmt (stmt_info);
4892 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4893 if (!is_a <gphi *> (stmt_info->stmt)
4894 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4895 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4896 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4897 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4898 {
4899 if (gimple_phi_num_args (phi) == 1)
4900 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4901 }
4902 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4903 {
4904 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4905 stmt_vec_info info
4906 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4907 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4908 stmt_info = info;
4909 }
4910 return stmt_info;
4911 }
4912
4913 /* Function vect_create_epilog_for_reduction
4914
4915 Create code at the loop-epilog to finalize the result of a reduction
4916 computation.
4917
4918 STMT_INFO is the scalar reduction stmt that is being vectorized.
4919 SLP_NODE is an SLP node containing a group of reduction statements. The
4920 first one in this group is STMT_INFO.
4921 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4922 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4923 (counting from 0)
4924
4925 This function:
4926 1. Completes the reduction def-use cycles.
4927 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4928 by calling the function specified by REDUC_FN if available, or by
4929 other means (whole-vector shifts or a scalar loop).
4930 The function also creates a new phi node at the loop exit to preserve
4931 loop-closed form, as illustrated below.
4932
4933 The flow at the entry to this function:
4934
4935 loop:
4936 vec_def = phi <vec_init, null> # REDUCTION_PHI
4937 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4938 s_loop = scalar_stmt # (scalar) STMT_INFO
4939 loop_exit:
4940 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4941 use <s_out0>
4942 use <s_out0>
4943
4944 The above is transformed by this function into:
4945
4946 loop:
4947 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4948 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4949 s_loop = scalar_stmt # (scalar) STMT_INFO
4950 loop_exit:
4951 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4952 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4953 v_out2 = reduce <v_out1>
4954 s_out3 = extract_field <v_out2, 0>
4955 s_out4 = adjust_result <s_out3>
4956 use <s_out4>
4957 use <s_out4>
4958 */
4959
4960 static void
4961 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4962 stmt_vec_info stmt_info,
4963 slp_tree slp_node,
4964 slp_instance slp_node_instance)
4965 {
4966 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4967 gcc_assert (reduc_info->is_reduc_info);
4968 /* For double reductions we need to get at the inner loop reduction
4969 stmt which has the meta info attached. Our stmt_info is that of the
4970 loop-closed PHI of the inner loop which we remember as
4971 def for the reduction PHI generation. */
4972 bool double_reduc = false;
4973 stmt_vec_info rdef_info = stmt_info;
4974 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4975 {
4976 gcc_assert (!slp_node);
4977 double_reduc = true;
4978 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4979 (stmt_info->stmt, 0));
4980 stmt_info = vect_stmt_to_vectorize (stmt_info);
4981 }
4982 gphi *reduc_def_stmt
4983 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4984 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4985 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4986 tree vectype;
4987 machine_mode mode;
4988 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4989 basic_block exit_bb;
4990 tree scalar_dest;
4991 tree scalar_type;
4992 gimple *new_phi = NULL, *phi;
4993 gimple_stmt_iterator exit_gsi;
4994 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4995 gimple *epilog_stmt = NULL;
4996 gimple *exit_phi;
4997 tree bitsize;
4998 tree def;
4999 tree orig_name, scalar_result;
5000 imm_use_iterator imm_iter, phi_imm_iter;
5001 use_operand_p use_p, phi_use_p;
5002 gimple *use_stmt;
5003 bool nested_in_vect_loop = false;
5004 auto_vec<gimple *> new_phis;
5005 int j, i;
5006 auto_vec<tree> scalar_results;
5007 unsigned int group_size = 1, k;
5008 auto_vec<gimple *> phis;
5009 bool slp_reduc = false;
5010 bool direct_slp_reduc;
5011 tree new_phi_result;
5012 tree induction_index = NULL_TREE;
5013
5014 if (slp_node)
5015 group_size = SLP_TREE_LANES (slp_node);
5016
5017 if (nested_in_vect_loop_p (loop, stmt_info))
5018 {
5019 outer_loop = loop;
5020 loop = loop->inner;
5021 nested_in_vect_loop = true;
5022 gcc_assert (!slp_node);
5023 }
5024 gcc_assert (!nested_in_vect_loop || double_reduc);
5025
5026 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5027 gcc_assert (vectype);
5028 mode = TYPE_MODE (vectype);
5029
5030 tree initial_def = NULL;
5031 tree induc_val = NULL_TREE;
5032 tree adjustment_def = NULL;
5033 if (slp_node)
5034 ;
5035 else
5036 {
5037 /* Get at the scalar def before the loop that defines the initial value
5038 of the reduction variable. */
5039 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5040 loop_preheader_edge (loop));
5041 /* Optimize: for induction condition reduction, if we can't use zero
5042 for induc_val, use initial_def. */
5043 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5044 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5045 else if (double_reduc)
5046 ;
5047 else if (nested_in_vect_loop)
5048 ;
5049 else
5050 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5051 }
5052
5053 unsigned vec_num;
5054 int ncopies;
5055 if (slp_node)
5056 {
5057 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5058 ncopies = 1;
5059 }
5060 else
5061 {
5062 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5063 vec_num = 1;
5064 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5065 }
5066
5067 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5068 which is updated with the current index of the loop for every match of
5069 the original loop's cond_expr (VEC_STMT). This results in a vector
5070 containing, for each vector lane, the index of the last time the condition passed.
5071 The first match will be a 1 to allow 0 to be used for non-matching
5072 indexes. If there are no matches at all then the vector will be all
5073 zeroes.
5074
5075 PR92772: This algorithm is broken for architectures that support
5076 masked vectors, but do not provide fold_extract_last. */
5077 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5078 {
5079 auto_vec<std::pair<tree, bool>, 2> ccompares;
5080 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5081 cond_info = vect_stmt_to_vectorize (cond_info);
5082 while (cond_info != reduc_info)
5083 {
5084 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5085 {
5086 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5087 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5088 ccompares.safe_push
5089 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5090 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5091 }
5092 cond_info
5093 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5094 1 + STMT_VINFO_REDUC_IDX
5095 (cond_info)));
5096 cond_info = vect_stmt_to_vectorize (cond_info);
5097 }
5098 gcc_assert (ccompares.length () != 0);
5099
5100 tree indx_before_incr, indx_after_incr;
5101 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5102 int scalar_precision
5103 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5104 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5105 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5106 (TYPE_MODE (vectype), cr_index_scalar_type,
5107 TYPE_VECTOR_SUBPARTS (vectype));
5108
5109 /* First we create a simple vector induction variable which starts
5110 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5111 vector size (STEP). */
5112
5113 /* Create a {1,2,3,...} vector. */
5114 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5115
5116 /* Create a vector of the step value. */
5117 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5118 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5119
5120 /* Create an induction variable. */
5121 gimple_stmt_iterator incr_gsi;
5122 bool insert_after;
5123 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5124 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5125 insert_after, &indx_before_incr, &indx_after_incr);
5126
5127 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5128 filled with zeros (VEC_ZERO). */
5129
5130 /* Create a vector of 0s. */
5131 tree zero = build_zero_cst (cr_index_scalar_type);
5132 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5133
5134 /* Create a vector phi node. */
5135 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5136 new_phi = create_phi_node (new_phi_tree, loop->header);
5137 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5138 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5139
5140 /* Now take the condition from the loop's original cond_exprs
5141 and produce a new cond_expr (INDEX_COND_EXPR) which for
5142 every match uses values from the induction variable
5143 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
5144 (NEW_PHI_TREE).
5145 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5146 the new cond_expr (INDEX_COND_EXPR). */
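/* For example (illustrative), with a vectorization factor of 4 the
   generated code looks roughly like:

     loop:
       new_phi = PHI <{0,0,0,0} (preheader), index_cond (latch)>
       ivtmp   = PHI <{1,2,3,4} (preheader), ivtmp + {4,4,4,4} (latch)>
       ...
       index_cond = VEC_COND_EXPR <cond, ivtmp, new_phi>

   so after the loop each lane of INDEX_COND holds the 1-based iteration
   index of the last time that lane's condition was true, or 0 if it
   never was.  */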
5147 gimple_seq stmts = NULL;
5148 for (int i = ccompares.length () - 1; i != -1; --i)
5149 {
5150 tree ccompare = ccompares[i].first;
5151 if (ccompares[i].second)
5152 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5153 cr_index_vector_type,
5154 ccompare,
5155 indx_before_incr, new_phi_tree);
5156 else
5157 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5158 cr_index_vector_type,
5159 ccompare,
5160 new_phi_tree, indx_before_incr);
5161 }
5162 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5163
5164 /* Update the phi with the vec cond. */
5165 induction_index = new_phi_tree;
5166 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5167 loop_latch_edge (loop), UNKNOWN_LOCATION);
5168 }
5169
5170 /* 2. Create epilog code.
5171 The reduction epilog code operates across the elements of the vector
5172 of partial results computed by the vectorized loop.
5173 The reduction epilog code consists of:
5174
5175 step 1: compute the scalar result in a vector (v_out2)
5176 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5177 step 3: adjust the scalar result (s_out3) if needed.
5178
5179 Step 1 can be accomplished using one of the following three schemes:
5180 (scheme 1) using reduc_fn, if available.
5181 (scheme 2) using whole-vector shifts, if available.
5182 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5183 combined.
5184
5185 The overall epilog code looks like this:
5186
5187 s_out0 = phi <s_loop> # original EXIT_PHI
5188 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5189 v_out2 = reduce <v_out1> # step 1
5190 s_out3 = extract_field <v_out2, 0> # step 2
5191 s_out4 = adjust_result <s_out3> # step 3
5192
5193 (step 3 is optional, and steps 1 and 2 may be combined).
5194 Lastly, the uses of s_out0 are replaced by s_out4. */
5195
5196
5197 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5198 v_out1 = phi <VECT_DEF>
5199 Store them in NEW_PHIS. */
5200 if (double_reduc)
5201 loop = outer_loop;
5202 exit_bb = single_exit (loop)->dest;
5203 new_phis.create (slp_node ? vec_num : ncopies);
5204 for (unsigned i = 0; i < vec_num; i++)
5205 {
5206 if (slp_node)
5207 def = vect_get_slp_vect_def (slp_node, i);
5208 else
5209 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5210 for (j = 0; j < ncopies; j++)
5211 {
5212 tree new_def = copy_ssa_name (def);
5213 phi = create_phi_node (new_def, exit_bb);
5214 if (j == 0)
5215 new_phis.quick_push (phi);
5216 else
5217 {
5218 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5219 new_phis.quick_push (phi);
5220 }
5221
5222 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5223 }
5224 }
5225
5226 exit_gsi = gsi_after_labels (exit_bb);
5227
5228 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5229 (i.e. when reduc_fn is not available) and in the final adjustment
5230 code (if needed). Also get the original scalar reduction variable as
5231 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5232 represents a reduction pattern), the tree-code and scalar-def are
5233 taken from the original stmt that the pattern-stmt (STMT) replaces.
5234 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5235 are taken from STMT. */
5236
5237 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5238 if (orig_stmt_info != stmt_info)
5239 {
5240 /* Reduction pattern */
5241 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5242 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5243 }
5244
5245 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5246 scalar_type = TREE_TYPE (scalar_dest);
5247 scalar_results.create (group_size);
5248 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5249 bitsize = TYPE_SIZE (scalar_type);
5250
5251 /* SLP reduction without reduction chain, e.g.,
5252 # a1 = phi <a2, a0>
5253 # b1 = phi <b2, b0>
5254 a2 = operation (a1)
5255 b2 = operation (b1) */
5256 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5257
5258 /* True if we should implement SLP_REDUC using native reduction operations
5259 instead of scalar operations. */
5260 direct_slp_reduc = (reduc_fn != IFN_LAST
5261 && slp_reduc
5262 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5263
5264 /* In case of reduction chain, e.g.,
5265 # a1 = phi <a3, a0>
5266 a2 = operation (a1)
5267 a3 = operation (a2),
5268
5269 we may end up with more than one vector result. Here we reduce them to
5270 one vector. */
5271 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5272 {
5273 gimple_seq stmts = NULL;
5274 tree first_vect = PHI_RESULT (new_phis[0]);
5275 first_vect = gimple_convert (&stmts, vectype, first_vect);
5276 for (k = 1; k < new_phis.length (); k++)
5277 {
5278 gimple *next_phi = new_phis[k];
5279 tree second_vect = PHI_RESULT (next_phi);
5280 second_vect = gimple_convert (&stmts, vectype, second_vect);
5281 first_vect = gimple_build (&stmts, code, vectype,
5282 first_vect, second_vect);
5283 }
5284 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5285
5286 new_phi_result = first_vect;
5287 new_phis.truncate (0);
5288 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5289 }
5290 /* Likewise if we couldn't use a single defuse cycle. */
5291 else if (ncopies > 1)
5292 {
5293 gimple_seq stmts = NULL;
5294 tree first_vect = PHI_RESULT (new_phis[0]);
5295 first_vect = gimple_convert (&stmts, vectype, first_vect);
5296 for (int k = 1; k < ncopies; ++k)
5297 {
5298 tree second_vect = PHI_RESULT (new_phis[k]);
5299 second_vect = gimple_convert (&stmts, vectype, second_vect);
5300 first_vect = gimple_build (&stmts, code, vectype,
5301 first_vect, second_vect);
5302 }
5303 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5304 new_phi_result = first_vect;
5305 new_phis.truncate (0);
5306 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5307 }
5308 else
5309 new_phi_result = PHI_RESULT (new_phis[0]);
5310
5311 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5312 && reduc_fn != IFN_LAST)
5313 {
5314 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5315 various data values where the condition matched and another vector
5316 (INDUCTION_INDEX) containing all the indexes of those matches. We
5317 need to extract the last matching index (which will be the index with
5318 highest value) and use this to index into the data vector.
5319 For the case where there were no matches, the data vector will contain
5320 all default values and the index vector will be all zeros. */
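/* A worked example (illustrative): with data vector { 7, -3, 9, 5 } and
   index vector { 1, 0, 4, 0 }, the max index is 4, the compare selects
   { 0, 0, 9, 0 }, and the unsigned MAX reduction below then extracts 9
   as the scalar result.  */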
5321
5322 /* Get various versions of the type of the vector of indexes. */
5323 tree index_vec_type = TREE_TYPE (induction_index);
5324 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5325 tree index_scalar_type = TREE_TYPE (index_vec_type);
5326 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5327
5328 /* Get an unsigned integer version of the type of the data vector. */
5329 int scalar_precision
5330 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5331 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5332 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5333 vectype);
5334
5335 /* First we need to create a vector (ZERO_VEC) of zeros and another
5336 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5337 can create using a MAX reduction and then expanding.
5338 In the case where the loop never made any matches, the max index will
5339 be zero. */
5340
5341 /* Vector of {0, 0, 0,...}. */
5342 tree zero_vec = build_zero_cst (vectype);
5343
5344 gimple_seq stmts = NULL;
5345 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5346 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5347
5348 /* Find maximum value from the vector of found indexes. */
5349 tree max_index = make_ssa_name (index_scalar_type);
5350 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5351 1, induction_index);
5352 gimple_call_set_lhs (max_index_stmt, max_index);
5353 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5354
5355 /* Vector of {max_index, max_index, max_index,...}. */
5356 tree max_index_vec = make_ssa_name (index_vec_type);
5357 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5358 max_index);
5359 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5360 max_index_vec_rhs);
5361 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5362
5363 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5364 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5365 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5366 otherwise. Only one value should match, resulting in a vector
5367 (VEC_COND) with one data value and the rest zeros.
5368 In the case where the loop never made any matches, every index will
5369 match, resulting in a vector with all data values (which will all be
5370 the default value). */
5371
5372 /* Compare the max index vector to the vector of found indexes to find
5373 the position of the max value. */
5374 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5375 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5376 induction_index,
5377 max_index_vec);
5378 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5379
5380 /* Use the compare to choose either values from the data vector or
5381 zero. */
5382 tree vec_cond = make_ssa_name (vectype);
5383 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5384 vec_compare, new_phi_result,
5385 zero_vec);
5386 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5387
5388 /* Finally we need to extract the data value from the vector (VEC_COND)
5389 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5390 reduction, but because this doesn't exist, we can use a MAX reduction
5391 instead. The data value might be signed or a float so we need to cast
5392 it first.
5393 In the case where the loop never made any matches, the data values are
5394 all identical, and so will reduce down correctly. */
5395
5396 /* Make the matched data values unsigned. */
5397 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5398 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5399 vec_cond);
5400 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5401 VIEW_CONVERT_EXPR,
5402 vec_cond_cast_rhs);
5403 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5404
5405 /* Reduce down to a scalar value. */
5406 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5407 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5408 1, vec_cond_cast);
5409 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5410 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5411
5412 /* Convert the reduced value back to the result type and set as the
5413 result. */
5414 stmts = NULL;
5415 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5416 data_reduc);
5417 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5418 scalar_results.safe_push (new_temp);
5419 }
5420 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5421 && reduc_fn == IFN_LAST)
5422 {
5423 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5424 idx_val = induction_index[0];
5425 val = data_reduc[0];
5426 for (i = 1; i < nelts; ++i)
5428 if (induction_index[i] > idx_val)
5429 val = data_reduc[i], idx_val = induction_index[i];
5430 return val; */
5431
5432 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5433 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5434 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5435 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5436 /* Enforced by vectorizable_reduction, which ensures we have target
5437 support before allowing a conditional reduction on variable-length
5438 vectors. */
5439 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5440 tree idx_val = NULL_TREE, val = NULL_TREE;
5441 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5442 {
5443 tree old_idx_val = idx_val;
5444 tree old_val = val;
5445 idx_val = make_ssa_name (idx_eltype);
5446 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5447 build3 (BIT_FIELD_REF, idx_eltype,
5448 induction_index,
5449 bitsize_int (el_size),
5450 bitsize_int (off)));
5451 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5452 val = make_ssa_name (data_eltype);
5453 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5454 build3 (BIT_FIELD_REF,
5455 data_eltype,
5456 new_phi_result,
5457 bitsize_int (el_size),
5458 bitsize_int (off)));
5459 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5460 if (off != 0)
5461 {
5462 tree new_idx_val = idx_val;
5463 if (off != v_size - el_size)
5464 {
5465 new_idx_val = make_ssa_name (idx_eltype);
5466 epilog_stmt = gimple_build_assign (new_idx_val,
5467 MAX_EXPR, idx_val,
5468 old_idx_val);
5469 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5470 }
5471 tree new_val = make_ssa_name (data_eltype);
5472 epilog_stmt = gimple_build_assign (new_val,
5473 COND_EXPR,
5474 build2 (GT_EXPR,
5475 boolean_type_node,
5476 idx_val,
5477 old_idx_val),
5478 val, old_val);
5479 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5480 idx_val = new_idx_val;
5481 val = new_val;
5482 }
5483 }
5484 /* Convert the reduced value back to the result type and set as the
5485 result. */
5486 gimple_seq stmts = NULL;
5487 val = gimple_convert (&stmts, scalar_type, val);
5488 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5489 scalar_results.safe_push (val);
5490 }
5491
5492 /* 2.3 Create the reduction code, using one of the three schemes described
5493 above. In SLP we simply need to extract all the elements from the
5494 vector (without reducing them), so we use scalar shifts. */
5495 else if (reduc_fn != IFN_LAST && !slp_reduc)
5496 {
5497 tree tmp;
5498 tree vec_elem_type;
5499
5500 /* Case 1: Create:
5501 v_out2 = reduc_expr <v_out1> */
5502
5503 if (dump_enabled_p ())
5504 dump_printf_loc (MSG_NOTE, vect_location,
5505 "Reduce using direct vector reduction.\n");
5506
5507 gimple_seq stmts = NULL;
5508 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5509 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5510 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5511 vec_elem_type, new_phi_result);
5512 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5513 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5514
5515 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5516 && induc_val)
5517 {
5518 /* Earlier we set the initial value to be a vector of induc_val
5519 values. Check the result and if it is induc_val then replace it
5520 with the original initial value, unless induc_val is
5521 already the same as initial_def. */
5522 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5523 induc_val);
5524
5525 tmp = make_ssa_name (new_scalar_dest);
5526 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5527 initial_def, new_temp);
5528 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5529 new_temp = tmp;
5530 }
5531
5532 scalar_results.safe_push (new_temp);
5533 }
5534 else if (direct_slp_reduc)
5535 {
5536 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5537 with the elements for other SLP statements replaced with the
5538 neutral value. We can then do a normal reduction on each vector. */
5539
5540 /* Enforced by vectorizable_reduction. */
5541 gcc_assert (new_phis.length () == 1);
5542 gcc_assert (pow2p_hwi (group_size));
5543
5544 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5545 vec<stmt_vec_info> orig_phis
5546 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5547 gimple_seq seq = NULL;
5548
5549 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5550 and the same element size as VECTYPE. */
5551 tree index = build_index_vector (vectype, 0, 1);
5552 tree index_type = TREE_TYPE (index);
5553 tree index_elt_type = TREE_TYPE (index_type);
5554 tree mask_type = truth_type_for (index_type);
5555
5556 /* Create a vector that, for each element, identifies which of
5557 the REDUC_GROUP_SIZE results should use it. */
5558 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5559 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5560 build_vector_from_val (index_type, index_mask));
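/* For example (illustrative): with GROUP_SIZE 2 and an 8-element vector
   the index vector { 0, 1, 2, ..., 7 } is masked down to
   { 0, 1, 0, 1, 0, 1, 0, 1 }, so the lanes alternate between the two
   SLP results.  */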
5561
5562 /* Get a neutral vector value. This is simply a splat of the neutral
5563 scalar value if we have one, otherwise the initial scalar value
5564 is itself a neutral value. */
5565 tree vector_identity = NULL_TREE;
5566 tree neutral_op = NULL_TREE;
5567 if (slp_node)
5568 {
5569 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5570 neutral_op
5571 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5572 vectype, code, first != NULL);
5573 }
5574 if (neutral_op)
5575 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5576 neutral_op);
5577 for (unsigned int i = 0; i < group_size; ++i)
5578 {
5579 /* If there's no universal neutral value, we can use the
5580 initial scalar value from the original PHI. This is used
5581 for MIN and MAX reduction, for example. */
5582 if (!neutral_op)
5583 {
5584 tree scalar_value
5585 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5586 loop_preheader_edge (loop));
5587 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5588 scalar_value);
5589 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5590 scalar_value);
5591 }
5592
5593 /* Calculate the equivalent of:
5594
5595 sel[j] = (index[j] == i);
5596
5597 which selects the elements of NEW_PHI_RESULT that should
5598 be included in the result. */
5599 tree compare_val = build_int_cst (index_elt_type, i);
5600 compare_val = build_vector_from_val (index_type, compare_val);
5601 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5602 index, compare_val);
5603
5604 /* Calculate the equivalent of:
5605
5606 vec = sel ? new_phi_result : vector_identity;
5607
5608 VEC is now suitable for a full vector reduction. */
5609 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5610 sel, new_phi_result, vector_identity);
5611
5612 /* Do the reduction and convert it to the appropriate type. */
5613 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5614 TREE_TYPE (vectype), vec);
5615 scalar = gimple_convert (&seq, scalar_type, scalar);
5616 scalar_results.safe_push (scalar);
5617 }
5618 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5619 }
5620 else
5621 {
5622 bool reduce_with_shift;
5623 tree vec_temp;
5624
5625 gcc_assert (slp_reduc || new_phis.length () == 1);
5626
5627 /* See if the target wants to do the final (shift) reduction
5628 in a vector mode of smaller size and first reduce upper/lower
5629 halves against each other. */
5630 enum machine_mode mode1 = mode;
5631 tree stype = TREE_TYPE (vectype);
5632 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5633 unsigned nunits1 = nunits;
5634 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5635 && new_phis.length () == 1)
5636 {
5637 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5638 /* For SLP reductions we have to make sure lanes match up, but
5639 since we're doing an individual element final reduction, reducing the
5640 vector width here is even more important.
5641 ??? We can also separate lanes with permutes; for the common
5642 case of power-of-two group-size, odd/even extracts would work. */
5643 if (slp_reduc && nunits != nunits1)
5644 {
5645 nunits1 = least_common_multiple (nunits1, group_size);
5646 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5647 }
5648 }
5649 if (!slp_reduc
5650 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5651 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5652
5653 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5654 stype, nunits1);
5655 reduce_with_shift = have_whole_vector_shift (mode1);
5656 if (!VECTOR_MODE_P (mode1))
5657 reduce_with_shift = false;
5658 else
5659 {
5660 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5661 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5662 reduce_with_shift = false;
5663 }
5664
5665 /* First reduce the vector to the desired vector size we should
5666 do shift reduction on by combining upper and lower halves. */
5667 new_temp = new_phi_result;
5668 while (nunits > nunits1)
5669 {
5670 nunits /= 2;
5671 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5672 stype, nunits);
5673 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5674
5675 /* The target has to make sure we support lowpart/highpart
5676 extraction, either via direct vector extract or through
5677 integer mode punning. */
5678 tree dst1, dst2;
5679 if (convert_optab_handler (vec_extract_optab,
5680 TYPE_MODE (TREE_TYPE (new_temp)),
5681 TYPE_MODE (vectype1))
5682 != CODE_FOR_nothing)
5683 {
5684 /* Extract sub-vectors directly once vec_extract becomes
5685 a conversion optab. */
5686 dst1 = make_ssa_name (vectype1);
5687 epilog_stmt
5688 = gimple_build_assign (dst1, BIT_FIELD_REF,
5689 build3 (BIT_FIELD_REF, vectype1,
5690 new_temp, TYPE_SIZE (vectype1),
5691 bitsize_int (0)));
5692 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5693 dst2 = make_ssa_name (vectype1);
5694 epilog_stmt
5695 = gimple_build_assign (dst2, BIT_FIELD_REF,
5696 build3 (BIT_FIELD_REF, vectype1,
5697 new_temp, TYPE_SIZE (vectype1),
5698 bitsize_int (bitsize)));
5699 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5700 }
5701 else
5702 {
5703 /* Extract via punning to appropriately sized integer mode
5704 vector. */
5705 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5706 tree etype = build_vector_type (eltype, 2);
5707 gcc_assert (convert_optab_handler (vec_extract_optab,
5708 TYPE_MODE (etype),
5709 TYPE_MODE (eltype))
5710 != CODE_FOR_nothing);
5711 tree tem = make_ssa_name (etype);
5712 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5713 build1 (VIEW_CONVERT_EXPR,
5714 etype, new_temp));
5715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5716 new_temp = tem;
5717 tem = make_ssa_name (eltype);
5718 epilog_stmt
5719 = gimple_build_assign (tem, BIT_FIELD_REF,
5720 build3 (BIT_FIELD_REF, eltype,
5721 new_temp, TYPE_SIZE (eltype),
5722 bitsize_int (0)));
5723 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5724 dst1 = make_ssa_name (vectype1);
5725 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5726 build1 (VIEW_CONVERT_EXPR,
5727 vectype1, tem));
5728 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5729 tem = make_ssa_name (eltype);
5730 epilog_stmt
5731 = gimple_build_assign (tem, BIT_FIELD_REF,
5732 build3 (BIT_FIELD_REF, eltype,
5733 new_temp, TYPE_SIZE (eltype),
5734 bitsize_int (bitsize)));
5735 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5736 dst2 = make_ssa_name (vectype1);
5737 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5738 build1 (VIEW_CONVERT_EXPR,
5739 vectype1, tem));
5740 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5741 }
5742
5743 new_temp = make_ssa_name (vectype1);
5744 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5745 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5746 new_phis[0] = epilog_stmt;
5747 }
5748
5749 if (reduce_with_shift && !slp_reduc)
5750 {
5751 int element_bitsize = tree_to_uhwi (bitsize);
5752 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5753 for variable-length vectors and also requires direct target support
5754 for loop reductions. */
5755 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5756 int nelements = vec_size_in_bits / element_bitsize;
5757 vec_perm_builder sel;
5758 vec_perm_indices indices;
5759
5760 int elt_offset;
5761
5762 tree zero_vec = build_zero_cst (vectype1);
5763 /* Case 2: Create:
5764 for (offset = nelements/2; offset >= 1; offset/=2)
5765 {
5766 Create: va' = vec_shift <va, offset>
5767 Create: va = vop <va, va'>
5768 } */
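/* For instance (illustrative), reducing a four-element vector
   { a, b, c, d } with a PLUS code proceeds as:
     va' = { c, d, _, _ }       <- shift by two elements
     va  = { a+c, b+d, _, _ }
     va' = { b+d, _, _, _ }     <- shift by one element
     va  = { a+c+b+d, _, _, _ }
   after which element 0 is extracted as the scalar result.  */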
5769
5770 tree rhs;
5771
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_NOTE, vect_location,
5774 "Reduce using vector shifts\n");
5775
5776 gimple_seq stmts = NULL;
5777 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5778 for (elt_offset = nelements / 2;
5779 elt_offset >= 1;
5780 elt_offset /= 2)
5781 {
5782 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5783 indices.new_vector (sel, 2, nelements);
5784 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5785 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5786 new_temp, zero_vec, mask);
5787 new_temp = gimple_build (&stmts, code,
5788 vectype1, new_name, new_temp);
5789 }
5790 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5791
5792 /* 2.4 Extract the final scalar result. Create:
5793 s_out3 = extract_field <v_out2, bitpos> */
5794
5795 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_NOTE, vect_location,
5797 "extract scalar result\n");
5798
5799 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5800 bitsize, bitsize_zero_node);
5801 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5802 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5803 gimple_assign_set_lhs (epilog_stmt, new_temp);
5804 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5805 scalar_results.safe_push (new_temp);
5806 }
5807 else
5808 {
5809 /* Case 3: Create:
5810 s = extract_field <v_out2, 0>
5811 for (offset = element_size;
5812 offset < vector_size;
5813 offset += element_size;)
5814 {
5815 Create: s' = extract_field <v_out2, offset>
5816 Create: s = op <s, s'> // For non SLP cases
5817 } */
5818
5819 if (dump_enabled_p ())
5820 dump_printf_loc (MSG_NOTE, vect_location,
5821 "Reduce using scalar code.\n");
5822
5823 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5824 int element_bitsize = tree_to_uhwi (bitsize);
5825 tree compute_type = TREE_TYPE (vectype);
5826 gimple_seq stmts = NULL;
5827 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5828 {
5829 int bit_offset;
5830 if (gimple_code (new_phi) == GIMPLE_PHI)
5831 vec_temp = PHI_RESULT (new_phi);
5832 else
5833 vec_temp = gimple_assign_lhs (new_phi);
5834 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5835 vec_temp, bitsize, bitsize_zero_node);
5836
5837 /* In SLP we don't need to apply the reduction operation, so we just
5838 collect s' values in SCALAR_RESULTS. */
5839 if (slp_reduc)
5840 scalar_results.safe_push (new_temp);
5841
5842 for (bit_offset = element_bitsize;
5843 bit_offset < vec_size_in_bits;
5844 bit_offset += element_bitsize)
5845 {
5846 tree bitpos = bitsize_int (bit_offset);
5847 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5848 compute_type, vec_temp,
5849 bitsize, bitpos);
5850 if (slp_reduc)
5851 {
5852 /* In SLP we don't need to apply the reduction operation, so
5853 we just collect s' values in SCALAR_RESULTS. */
5854 new_temp = new_name;
5855 scalar_results.safe_push (new_name);
5856 }
5857 else
5858 new_temp = gimple_build (&stmts, code, compute_type,
5859 new_name, new_temp);
5860 }
5861 }
5862
5863 /* The only case in which we need to reduce scalar results in SLP is
5864 unrolling. If the size of SCALAR_RESULTS is greater than
5865 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5866 REDUC_GROUP_SIZE. */
5867 if (slp_reduc)
5868 {
5869 tree res, first_res, new_res;
5870
5871 /* Reduce multiple scalar results in case of SLP unrolling. */
5872 for (j = group_size; scalar_results.iterate (j, &res);
5873 j++)
5874 {
5875 first_res = scalar_results[j % group_size];
5876 new_res = gimple_build (&stmts, code, compute_type,
5877 first_res, res);
5878 scalar_results[j % group_size] = new_res;
5879 }
5880 for (k = 0; k < group_size; k++)
5881 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5882 scalar_results[k]);
5883 }
5884 else
5885 {
5886 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5887 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5888 scalar_results.safe_push (new_temp);
5889 }
5890
5891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5892 }
5893
5894 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5895 && induc_val)
5896 {
5897 /* Earlier we set the initial value to be a vector of induc_val
5898 values. Check the result and if it is induc_val then replace it
5899 with the original initial value, unless induc_val is
5900 already the same as initial_def. */
5901 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5902 induc_val);
5903
5904 tree tmp = make_ssa_name (new_scalar_dest);
5905 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5906 initial_def, new_temp);
5907 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5908 scalar_results[0] = tmp;
5909 }
5910 }
5911
5912 /* 2.5 Adjust the final result by the initial value of the reduction
5913 variable. (When such adjustment is not needed, then
5914 'adjustment_def' is zero). For example, if code is PLUS we create:
5915 new_temp = loop_exit_def + adjustment_def */
5916
5917 if (adjustment_def)
5918 {
5919 gcc_assert (!slp_reduc);
5920 gimple_seq stmts = NULL;
5921 if (nested_in_vect_loop)
5922 {
5923 new_phi = new_phis[0];
5924 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5925 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5926 new_temp = gimple_build (&stmts, code, vectype,
5927 PHI_RESULT (new_phi), adjustment_def);
5928 }
5929 else
5930 {
5931 new_temp = scalar_results[0];
5932 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5933 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5934 new_temp = gimple_build (&stmts, code, scalar_type,
5935 new_temp, adjustment_def);
5936 }
5937
5938 epilog_stmt = gimple_seq_last_stmt (stmts);
5939 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5940 if (nested_in_vect_loop)
5941 {
5942 if (!double_reduc)
5943 scalar_results.quick_push (new_temp);
5944 else
5945 scalar_results[0] = new_temp;
5946 }
5947 else
5948 scalar_results[0] = new_temp;
5949
5950 new_phis[0] = epilog_stmt;
5951 }
5952
5953 if (double_reduc)
5954 loop = loop->inner;
5955
5956 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5957 phis with new adjusted scalar results, i.e., replace use <s_out0>
5958 with use <s_out4>.
5959
5960 Transform:
5961 loop_exit:
5962 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5963 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5964 v_out2 = reduce <v_out1>
5965 s_out3 = extract_field <v_out2, 0>
5966 s_out4 = adjust_result <s_out3>
5967 use <s_out0>
5968 use <s_out0>
5969
5970 into:
5971
5972 loop_exit:
5973 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5974 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5975 v_out2 = reduce <v_out1>
5976 s_out3 = extract_field <v_out2, 0>
5977 s_out4 = adjust_result <s_out3>
5978 use <s_out4>
5979 use <s_out4> */
5980
5981
5982 /* In an SLP reduction chain we reduce the vector results into one vector if
5983 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5984 LHS of the last stmt in the reduction chain, since we are looking for
5985 the loop exit phi node. */
5986 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5987 {
5988 stmt_vec_info dest_stmt_info
5989 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5990 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5991 group_size = 1;
5992 }
5993
5994 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5995 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5996 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5997 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5998 correspond to the first vector stmt, etc.
5999 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
6000 if (group_size > new_phis.length ())
6001 gcc_assert (!(group_size % new_phis.length ()));
6002
6003 for (k = 0; k < group_size; k++)
6004 {
6005 if (slp_reduc)
6006 {
6007 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6008
6009 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
6010 /* SLP statements can't participate in patterns. */
6011 gcc_assert (!orig_stmt_info);
6012 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6013 }
6014
6015 if (nested_in_vect_loop)
6016 {
6017 if (double_reduc)
6018 loop = outer_loop;
6019 else
6020 gcc_unreachable ();
6021 }
6022
6023 phis.create (3);
6024 /* Find the loop-closed-use at the loop exit of the original scalar
6025 result. (The reduction result is expected to have two immediate uses,
6026 one at the latch block, and one at the loop exit). For double
6027 reductions we are looking for exit phis of the outer loop. */
6028 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6029 {
6030 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6031 {
6032 if (!is_gimple_debug (USE_STMT (use_p)))
6033 phis.safe_push (USE_STMT (use_p));
6034 }
6035 else
6036 {
6037 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6038 {
6039 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6040
6041 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6042 {
6043 if (!flow_bb_inside_loop_p (loop,
6044 gimple_bb (USE_STMT (phi_use_p)))
6045 && !is_gimple_debug (USE_STMT (phi_use_p)))
6046 phis.safe_push (USE_STMT (phi_use_p));
6047 }
6048 }
6049 }
6050 }
6051
6052 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6053 {
6054 /* Replace the uses: */
6055 orig_name = PHI_RESULT (exit_phi);
6056 scalar_result = scalar_results[k];
6057 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6058 {
6059 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6060 SET_USE (use_p, scalar_result);
6061 update_stmt (use_stmt);
6062 }
6063 }
6064
6065 phis.release ();
6066 }
6067 }
6068
6069 /* Return a vector of type VECTYPE that is equal to the vector select
6070 operation "MASK ? VEC : IDENTITY". Insert the select statements
6071 before GSI. */
6072
6073 static tree
6074 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6075 tree vec, tree identity)
6076 {
6077 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6078 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6079 mask, vec, identity);
6080 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6081 return cond;
6082 }
6083
6084 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6085 order, starting with LHS. Insert the extraction statements before GSI and
6086 associate the new scalar SSA names with variable SCALAR_DEST.
6087 Return the SSA name for the result. */
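/* For example (illustrative), with CODE = PLUS_EXPR, LHS = s0 and a
   four-element VECTOR_RHS v of element size SZ bits, this emits roughly:
     e0 = BIT_FIELD_REF <v, SZ, 0>;      s1 = s0 + e0;
     e1 = BIT_FIELD_REF <v, SZ, SZ>;     s2 = s1 + e1;
     e2 = BIT_FIELD_REF <v, SZ, 2*SZ>;   s3 = s2 + e2;
     e3 = BIT_FIELD_REF <v, SZ, 3*SZ>;   s4 = s3 + e3;
   and returns s4.  */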
6088
6089 static tree
6090 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6091 tree_code code, tree lhs, tree vector_rhs)
6092 {
6093 tree vectype = TREE_TYPE (vector_rhs);
6094 tree scalar_type = TREE_TYPE (vectype);
6095 tree bitsize = TYPE_SIZE (scalar_type);
6096 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6097 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6098
6099 for (unsigned HOST_WIDE_INT bit_offset = 0;
6100 bit_offset < vec_size_in_bits;
6101 bit_offset += element_bitsize)
6102 {
6103 tree bitpos = bitsize_int (bit_offset);
6104 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6105 bitsize, bitpos);
6106
6107 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6108 rhs = make_ssa_name (scalar_dest, stmt);
6109 gimple_assign_set_lhs (stmt, rhs);
6110 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6111
6112 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6113 tree new_name = make_ssa_name (scalar_dest, stmt);
6114 gimple_assign_set_lhs (stmt, new_name);
6115 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6116 lhs = new_name;
6117 }
6118 return lhs;
6119 }
6120
6121 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6122 type of the vector input. */
6123
6124 static internal_fn
6125 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6126 {
6127 internal_fn mask_reduc_fn;
6128
6129 switch (reduc_fn)
6130 {
6131 case IFN_FOLD_LEFT_PLUS:
6132 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6133 break;
6134
6135 default:
6136 return IFN_LAST;
6137 }
6138
6139 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6140 OPTIMIZE_FOR_SPEED))
6141 return mask_reduc_fn;
6142 return IFN_LAST;
6143 }
6144
6145 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6146 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6147 statement. CODE is the operation performed by STMT_INFO and OPS are
6148 its scalar operands. REDUC_INDEX is the index of the operand in
6149 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6150 implements in-order reduction, or IFN_LAST if we should open-code it.
6151 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6152 that should be used to control the operation in a fully-masked loop. */
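/* For instance (illustrative), an in-order PLUS reduction of a
   four-element vector V into an accumulator RES computes
     RES = (((RES + V[0]) + V[1]) + V[2]) + V[3]
   either with a single IFN_FOLD_LEFT_PLUS call or, when REDUC_FN is
   IFN_LAST, by open-coding the chain via vect_expand_fold_left.  */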
6153
6154 static bool
6155 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6156 stmt_vec_info stmt_info,
6157 gimple_stmt_iterator *gsi,
6158 gimple **vec_stmt, slp_tree slp_node,
6159 gimple *reduc_def_stmt,
6160 tree_code code, internal_fn reduc_fn,
6161 tree ops[3], tree vectype_in,
6162 int reduc_index, vec_loop_masks *masks)
6163 {
6164 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6165 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6166 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6167
6168 int ncopies;
6169 if (slp_node)
6170 ncopies = 1;
6171 else
6172 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6173
6174 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6175 gcc_assert (ncopies == 1);
6176 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6177
6178 if (slp_node)
6179 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6180 TYPE_VECTOR_SUBPARTS (vectype_in)));
6181
6182 tree op0 = ops[1 - reduc_index];
6183
6184 int group_size = 1;
6185 stmt_vec_info scalar_dest_def_info;
6186 auto_vec<tree> vec_oprnds0;
6187 if (slp_node)
6188 {
6189 auto_vec<vec<tree> > vec_defs (2);
6190 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6191 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6192 vec_defs[0].release ();
6193 vec_defs[1].release ();
6194 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6195 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6196 }
6197 else
6198 {
6199 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6200 op0, &vec_oprnds0);
6201 scalar_dest_def_info = stmt_info;
6202 }
6203
6204 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6205 tree scalar_type = TREE_TYPE (scalar_dest);
6206 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6207
6208 int vec_num = vec_oprnds0.length ();
6209 gcc_assert (vec_num == 1 || slp_node);
6210 tree vec_elem_type = TREE_TYPE (vectype_out);
6211 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6212
6213 tree vector_identity = NULL_TREE;
6214 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6215 vector_identity = build_zero_cst (vectype_out);
6216
6217 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6218 int i;
6219 tree def0;
6220 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6221 {
6222 gimple *new_stmt;
6223 tree mask = NULL_TREE;
6224 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6225 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6226
6227 /* Handle MINUS by adding the negative. */
6228 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6229 {
6230 tree negated = make_ssa_name (vectype_out);
6231 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6232 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6233 def0 = negated;
6234 }
6235
6236 if (mask && mask_reduc_fn == IFN_LAST)
6237 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6238 vector_identity);
6239
6240 /* On the first iteration the input is simply the scalar phi
6241 result, and for subsequent iterations it is the output of
6242 the preceding operation. */
6243 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6244 {
6245 if (mask && mask_reduc_fn != IFN_LAST)
6246 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6247 def0, mask);
6248 else
6249 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6250 def0);
6251 /* For chained SLP reductions the output of the previous reduction
6252 operation serves as the input of the next. For the final statement
6253 the output cannot be a temporary - we reuse the original
6254 scalar destination of the last statement. */
6255 if (i != vec_num - 1)
6256 {
6257 gimple_set_lhs (new_stmt, scalar_dest_var);
6258 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6259 gimple_set_lhs (new_stmt, reduc_var);
6260 }
6261 }
6262 else
6263 {
6264 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6265 reduc_var, def0);
6266 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6267 /* Remove the statement, so that we can use the same code paths
6268 as for statements that we've just created. */
6269 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6270 gsi_remove (&tmp_gsi, true);
6271 }
6272
6273 if (i == vec_num - 1)
6274 {
6275 gimple_set_lhs (new_stmt, scalar_dest);
6276 vect_finish_replace_stmt (loop_vinfo,
6277 scalar_dest_def_info,
6278 new_stmt);
6279 }
6280 else
6281 vect_finish_stmt_generation (loop_vinfo,
6282 scalar_dest_def_info,
6283 new_stmt, gsi);
6284
6285 if (slp_node)
6286 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6287 else
6288 {
6289 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6290 *vec_stmt = new_stmt;
6291 }
6292 }
6293
6294 return true;
6295 }
6296
6297 /* Function is_nonwrapping_integer_induction.
6298
6299 Check if STMT_VINFO (which is part of loop LOOP) is an integer induction
6300 that increments and does not overflow (wrap). */
6301
6302 static bool
6303 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6304 {
6305 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6306 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6307 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6308 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6309 widest_int ni, max_loop_value, lhs_max;
6310 wi::overflow_type overflow = wi::OVF_NONE;
6311
6312 /* Make sure the loop is integer based. */
6313 if (TREE_CODE (base) != INTEGER_CST
6314 || TREE_CODE (step) != INTEGER_CST)
6315 return false;
6316
6317 /* Check that the max size of the loop will not wrap. */
6318
6319 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6320 return true;
6321
6322 if (! max_stmt_executions (loop, &ni))
6323 return false;
6324
6325 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6326 &overflow);
6327 if (overflow)
6328 return false;
6329
6330 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6331 TYPE_SIGN (lhs_type), &overflow);
6332 if (overflow)
6333 return false;
6334
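  /* The induction cannot wrap if its final value BASE + STEP * NI is
     still representable in LHS_TYPE.  */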
6335 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6336 <= TYPE_PRECISION (lhs_type));
6337 }
6338
6339 /* Check if masking can be supported by inserting a conditional expression.
6340 CODE is the code for the operation. COND_FN is the conditional internal
6341 function, if it exists. VECTYPE_IN is the type of the vector input. */
6342 static bool
6343 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6344 tree vectype_in)
6345 {
6346 if (cond_fn != IFN_LAST
6347 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6348 OPTIMIZE_FOR_SPEED))
6349 return false;
6350
6351 switch (code)
6352 {
6353 case DOT_PROD_EXPR:
6354 case SAD_EXPR:
6355 return true;
6356
6357 default:
6358 return false;
6359 }
6360 }
6361
6362 /* Insert a conditional expression to enable masked vectorization. CODE is the
6363 code for the operation. VOP is the array of operands. MASK is the loop
6364 mask. GSI is a statement iterator used to place the new conditional
6365 expression. */
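/* For DOT_PROD_EXPR the masked-off lanes of operand 1 are replaced by
   zero, so their products add nothing to the accumulator; for SAD_EXPR
   they are replaced by operand 0, making the absolute difference zero.  */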
6366 static void
6367 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6368 gimple_stmt_iterator *gsi)
6369 {
6370 switch (code)
6371 {
6372 case DOT_PROD_EXPR:
6373 {
6374 tree vectype = TREE_TYPE (vop[1]);
6375 tree zero = build_zero_cst (vectype);
6376 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6377 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6378 mask, vop[1], zero);
6379 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6380 vop[1] = masked_op1;
6381 break;
6382 }
6383
6384 case SAD_EXPR:
6385 {
6386 tree vectype = TREE_TYPE (vop[1]);
6387 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6388 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6389 mask, vop[1], vop[0]);
6390 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6391 vop[1] = masked_op1;
6392 break;
6393 }
6394
6395 default:
6396 gcc_unreachable ();
6397 }
6398 }
6399
6400 /* Function vectorizable_reduction.
6401
6402 Check if STMT_INFO performs a reduction operation that can be vectorized.
6403 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6404 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6405 Return true if STMT_INFO is vectorizable in this way.
6406
6407 This function also handles reduction idioms (patterns) that have been
6408 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6409 may be of this form:
6410 X = pattern_expr (arg0, arg1, ..., X)
6411 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6412 sequence that had been detected and replaced by the pattern-stmt
6413 (STMT_INFO).
6414
6415 This function also handles reduction of condition expressions, for example:
6416 for (int i = 0; i < N; i++)
6417 if (a[i] < value)
6418 last = a[i];
6419 This is handled by vectorising the loop and creating an additional vector
6420 containing the loop indexes for which "a[i] < value" was true. In the
6421 function epilogue this is reduced to a single max value and then used to
6422 index into the vector of results.
6423
6424 In some cases of reduction patterns, the type of the reduction variable X is
6425 different than the type of the other arguments of STMT_INFO.
6426 In such cases, the vectype that is used when transforming STMT_INFO into
6427 a vector stmt is different than the vectype that is used to determine the
6428 vectorization factor, because it consists of a different number of elements
6429 than the actual number of elements that are being operated upon in parallel.
6430
6431 For example, consider an accumulation of shorts into an int accumulator.
6432 On some targets it's possible to vectorize this pattern operating on 8
6433 shorts at a time (hence, the vectype for purposes of determining the
6434 vectorization factor should be V8HI); on the other hand, the vectype that
6435 is used to create the vector form is actually V4SI (the type of the result).
6436
6437 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6438 indicates what is the actual level of parallelism (V8HI in the example), so
6439 that the right vectorization factor would be derived. This vectype
6440 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6441 be used to create the vectorized stmt. The right vectype for the vectorized
6442 stmt is obtained from the type of the result X:
6443 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6444
6445 This means that, contrary to "regular" reductions (or "regular" stmts in
6446 general), the following equation:
6447 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6448 does *NOT* necessarily hold for reduction patterns. */
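/* As a concrete example (a sketch of the case described above), the
   short-into-int accumulation corresponds to source code like
     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];
   which the pattern recognizer may turn into a WIDEN_SUM_EXPR.  The
   vectorization factor is then derived from V8HI, while the vectorized
   statement itself produces a V4SI result.  */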
6449
6450 bool
6451 vectorizable_reduction (loop_vec_info loop_vinfo,
6452 stmt_vec_info stmt_info, slp_tree slp_node,
6453 slp_instance slp_node_instance,
6454 stmt_vector_for_cost *cost_vec)
6455 {
6456 tree scalar_dest;
6457 tree vectype_in = NULL_TREE;
6458 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6459 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6460 stmt_vec_info cond_stmt_vinfo = NULL;
6461 tree scalar_type;
6462 int i;
6463 int ncopies;
6464 bool single_defuse_cycle = false;
6465 bool nested_cycle = false;
6466 bool double_reduc = false;
6467 int vec_num;
6468 tree tem;
6469 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6470 tree cond_reduc_val = NULL_TREE;
6471
6472 /* Make sure it was already recognized as a reduction computation. */
6473 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6474 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6475 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6476 return false;
6477
6478 /* The stmt we store reduction analysis meta on. */
6479 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6480 reduc_info->is_reduc_info = true;
6481
6482 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6483 {
6484 if (is_a <gphi *> (stmt_info->stmt))
6485 {
6486 if (slp_node)
6487 {
6488 /* We eventually need to set a vector type on invariant
6489 arguments. */
6490 unsigned j;
6491 slp_tree child;
6492 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6493 if (!vect_maybe_update_slp_op_vectype
6494 (child, SLP_TREE_VECTYPE (slp_node)))
6495 {
6496 if (dump_enabled_p ())
6497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6498 "incompatible vector types for "
6499 "invariants\n");
6500 return false;
6501 }
6502 }
6503 /* Analysis for double-reduction is done on the outer
6504 loop PHI; nested cycles have no further restrictions. */
6505 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6506 }
6507 else
6508 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6509 return true;
6510 }
6511
6512 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6513 stmt_vec_info phi_info = stmt_info;
6514 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6515 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6516 {
6517 if (!is_a <gphi *> (stmt_info->stmt))
6518 {
6519 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6520 return true;
6521 }
6522 if (slp_node)
6523 {
6524 slp_node_instance->reduc_phis = slp_node;
6525 /* ??? We're leaving slp_node to point to the PHIs; we only
6526 need it to get at the number of vector stmts which wasn't
6527 yet initialized for the instance root. */
6528 }
6529 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6530 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6531 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6532 {
6533 use_operand_p use_p;
6534 gimple *use_stmt;
6535 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6536 &use_p, &use_stmt);
6537 gcc_assert (res);
6538 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6539 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6540 }
6541 }
6542
6543 /* PHIs should not participate in patterns. */
6544 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6545 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6546
6547 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6548 and compute the reduction chain length. Discover the real
6549 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
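/* For instance, for a chain like (sketch)
     sum_0 = PHI <init_5 (preheader), sum_3 (latch)>
     t_2 = ...;
     sum_3 = sum_0 + t_2;
   the walk starts at the latch definition sum_3 and follows the operand
   selected by STMT_VINFO_REDUC_IDX until it reaches the PHI result sum_0,
   counting the statements visited on the way.  */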
6550 tree reduc_def
6551 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6552 loop_latch_edge
6553 (gimple_bb (reduc_def_phi)->loop_father));
6554 unsigned reduc_chain_length = 0;
6555 bool only_slp_reduc_chain = true;
6556 stmt_info = NULL;
6557 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6558 while (reduc_def != PHI_RESULT (reduc_def_phi))
6559 {
6560 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6561 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6562 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6563 {
6564 if (dump_enabled_p ())
6565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6566 "reduction chain broken by patterns.\n");
6567 return false;
6568 }
6569 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6570 only_slp_reduc_chain = false;
6571 /* ??? For epilogue generation live members of the chain need
6572 to point back to the PHI via their original stmt for
6573 info_for_reduction to work. */
6574 if (STMT_VINFO_LIVE_P (vdef))
6575 STMT_VINFO_REDUC_DEF (def) = phi_info;
6576 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6577 if (!assign)
6578 {
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "reduction chain includes calls.\n");
6582 return false;
6583 }
6584 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6585 {
6586 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6587 TREE_TYPE (gimple_assign_rhs1 (assign))))
6588 {
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6591 "conversion in the reduction chain.\n");
6592 return false;
6593 }
6594 }
6595 else if (!stmt_info)
6596 /* First non-conversion stmt. */
6597 stmt_info = vdef;
6598 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6599 reduc_chain_length++;
6600 if (!stmt_info && slp_node)
6601 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6602 }
6603 /* PHIs should not participate in patterns. */
6604 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6605
6606 if (nested_in_vect_loop_p (loop, stmt_info))
6607 {
6608 loop = loop->inner;
6609 nested_cycle = true;
6610 }
6611
6612 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6613 element. */
6614 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6615 {
6616 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6617 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6618 }
6619 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6620 gcc_assert (slp_node
6621 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6622
6623 /* 1. Is vectorizable reduction? */
6624 /* Not supportable if the reduction variable is used in the loop, unless
6625 it's a reduction chain. */
6626 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6627 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6628 return false;
6629
6630 /* Reductions that are not used even in an enclosing outer-loop
6631 are expected to be "live" (used out of the loop). */
6632 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6633 && !STMT_VINFO_LIVE_P (stmt_info))
6634 return false;
6635
6636 /* 2. Has this been recognized as a reduction pattern?
6637
6638 Check if STMT represents a pattern that has been recognized
6639 in earlier analysis stages. For stmts that represent a pattern,
6640 the STMT_VINFO_RELATED_STMT field records the last stmt in
6641 the original sequence that constitutes the pattern. */
6642
6643 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6644 if (orig_stmt_info)
6645 {
6646 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6647 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6648 }
6649
6650 /* 3. Check the operands of the operation. The first operands are defined
6651 inside the loop body. The last operand is the reduction variable,
6652 which is defined by the loop-header-phi. */
6653
6654 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6655 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6656 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6657 enum tree_code code = gimple_assign_rhs_code (stmt);
6658 bool lane_reduc_code_p
6659 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6660 int op_type = TREE_CODE_LENGTH (code);
6661
6662 scalar_dest = gimple_assign_lhs (stmt);
6663 scalar_type = TREE_TYPE (scalar_dest);
6664 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6665 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6666 return false;
6667
6668 /* Do not try to vectorize bit-precision reductions. */
6669 if (!type_has_mode_precision_p (scalar_type))
6670 return false;
6671
6672 /* For lane-reducing ops we're reducing the number of reduction PHIs
6673 which means the only use of the reduction PHI may be in the lane-reducing operation. */
6674 if (lane_reduc_code_p
6675 && reduc_chain_length != 1
6676 && !only_slp_reduc_chain)
6677 {
6678 if (dump_enabled_p ())
6679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680 "lane-reducing reduction with extra stmts.\n");
6681 return false;
6682 }
6683
6684 /* All uses but the last are expected to be defined in the loop.
6685 The last use is the reduction variable. In case of nested cycle this
6686 assumption is not true: we use reduc_index to record the index of the
6687 reduction variable. */
6688 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6689 /* We need to skip an extra operand for COND_EXPRs with embedded
6690 comparison. */
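/* E.g. for x_1 = a_2 < b_3 ? c_4 : x_0 the embedded comparison occupies
   extra operand slots, so the value operands c_4 and x_0 are reached one
   position later than for a plain ternary operation (a sketch of the
   indexing only).  */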
6691 unsigned opno_adjust = 0;
6692 if (code == COND_EXPR
6693 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6694 opno_adjust = 1;
6695 for (i = 0; i < op_type; i++)
6696 {
6697 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6698 if (i == 0 && code == COND_EXPR)
6699 continue;
6700
6701 stmt_vec_info def_stmt_info;
6702 enum vect_def_type dt;
6703 tree op;
6704 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6705 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6706 &def_stmt_info))
6707 {
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "use not simple.\n");
6711 return false;
6712 }
6713 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6714 continue;
6715
6716 /* There should be only one cycle def in the stmt, the one
6717 leading to reduc_def. */
6718 if (VECTORIZABLE_CYCLE_DEF (dt))
6719 return false;
6720
6721 /* To properly compute ncopies we are interested in the widest
6722 non-reduction input type in case we're looking at a widening
6723 accumulation that we later handle in vect_transform_reduction. */
6724 if (lane_reduc_code_p
6725 && tem
6726 && (!vectype_in
6727 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6728 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6729 vectype_in = tem;
6730
6731 if (code == COND_EXPR)
6732 {
6733 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6734 if (dt == vect_constant_def)
6735 {
6736 cond_reduc_dt = dt;
6737 cond_reduc_val = op;
6738 }
6739 if (dt == vect_induction_def
6740 && def_stmt_info
6741 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6742 {
6743 cond_reduc_dt = dt;
6744 cond_stmt_vinfo = def_stmt_info;
6745 }
6746 }
6747 }
6748 if (!vectype_in)
6749 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6750 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6751
6752 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6753 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6754 /* If we have a condition reduction, see if we can simplify it further. */
6755 if (v_reduc_type == COND_REDUCTION)
6756 {
6757 if (slp_node)
6758 return false;
6759
6760 /* When the condition uses the reduction value in the condition, fail. */
6761 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6762 {
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "condition depends on previous iteration\n");
6766 return false;
6767 }
6768
6769 if (reduc_chain_length == 1
6770 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6771 vectype_in, OPTIMIZE_FOR_SPEED))
6772 {
6773 if (dump_enabled_p ())
6774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6775 "optimizing condition reduction with"
6776 " FOLD_EXTRACT_LAST.\n");
6777 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6778 }
6779 else if (cond_reduc_dt == vect_induction_def)
6780 {
6781 tree base
6782 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6784
6785 gcc_assert (TREE_CODE (base) == INTEGER_CST
6786 && TREE_CODE (step) == INTEGER_CST);
6787 cond_reduc_val = NULL_TREE;
6788 enum tree_code cond_reduc_op_code = ERROR_MARK;
6789 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6790 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6791 ;
6792 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6793 above base; punt if base is the minimum value of the type for
6794 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6795 else if (tree_int_cst_sgn (step) == -1)
6796 {
6797 cond_reduc_op_code = MIN_EXPR;
6798 if (tree_int_cst_sgn (base) == -1)
6799 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6800 else if (tree_int_cst_lt (base,
6801 TYPE_MAX_VALUE (TREE_TYPE (base))))
6802 cond_reduc_val
6803 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6804 }
6805 else
6806 {
6807 cond_reduc_op_code = MAX_EXPR;
6808 if (tree_int_cst_sgn (base) == 1)
6809 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6810 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6811 base))
6812 cond_reduc_val
6813 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6814 }
6815 if (cond_reduc_val)
6816 {
6817 if (dump_enabled_p ())
6818 dump_printf_loc (MSG_NOTE, vect_location,
6819 "condition expression based on "
6820 "integer induction.\n");
6821 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6822 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6823 = cond_reduc_val;
6824 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6825 }
6826 }
6827 else if (cond_reduc_dt == vect_constant_def)
6828 {
6829 enum vect_def_type cond_initial_dt;
6830 tree cond_initial_val
6831 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6832
6833 gcc_assert (cond_reduc_val != NULL_TREE);
6834 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6835 if (cond_initial_dt == vect_constant_def
6836 && types_compatible_p (TREE_TYPE (cond_initial_val),
6837 TREE_TYPE (cond_reduc_val)))
6838 {
6839 tree e = fold_binary (LE_EXPR, boolean_type_node,
6840 cond_initial_val, cond_reduc_val);
6841 if (e && (integer_onep (e) || integer_zerop (e)))
6842 {
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_NOTE, vect_location,
6845 "condition expression based on "
6846 "compile time constant.\n");
6847 /* Record reduction code at analysis stage. */
6848 STMT_VINFO_REDUC_CODE (reduc_info)
6849 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6850 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6851 }
6852 }
6853 }
6854 }
6855
6856 if (STMT_VINFO_LIVE_P (phi_info))
6857 return false;
6858
6859 if (slp_node)
6860 ncopies = 1;
6861 else
6862 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6863
6864 gcc_assert (ncopies >= 1);
6865
6866 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6867
6868 if (nested_cycle)
6869 {
6870 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6871 == vect_double_reduction_def);
6872 double_reduc = true;
6873 }
6874
6875 /* 4.2. Check support for the epilog operation.
6876
6877 If STMT represents a reduction pattern, then the type of the
6878 reduction variable may be different than the type of the rest
6879 of the arguments. For example, consider the case of accumulation
6880 of shorts into an int accumulator; The original code:
6881 S1: int_a = (int) short_a;
6882 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6883
6884 was replaced with:
6885 STMT: int_acc = widen_sum <short_a, int_acc>
6886
6887 This means that:
6888 1. The tree-code that is used to create the vector operation in the
6889 epilog code (that reduces the partial results) is not the
6890 tree-code of STMT, but is rather the tree-code of the original
6891 stmt from the pattern that STMT is replacing. I.e, in the example
6892 above we want to use 'widen_sum' in the loop, but 'plus' in the
6893 epilog.
6894 2. The type (mode) we use to check available target support
6895 for the vector operation to be created in the *epilog*, is
6896 determined by the type of the reduction variable (in the example
6897 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6898 However the type (mode) we use to check available target support
6899 for the vector operation to be created *inside the loop*, is
6900 determined by the type of the other arguments to STMT (in the
6901 example we'd check this: optab_handler (widen_sum_optab,
6902 vect_short_mode)).
6903
6904 This is contrary to "regular" reductions, in which the types of all
6905 the arguments are the same as the type of the reduction variable.
6906 For "regular" reductions we can therefore use the same vector type
6907 (and also the same tree-code) when generating the epilog code and
6908 when generating the code inside the loop. */
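/* Concretely (a sketch), for the widen_sum example the loop body uses
     vec_acc_1 = WIDEN_SUM_EXPR <vec_short_a, vec_acc_0>;
   on a V8HI input and a V4SI accumulator, while the epilogue reduces the
   V4SI partial sums with PLUS_EXPR, e.g. via a REDUC_PLUS-style
   reduction, to obtain the final scalar result.  */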
6909
6910 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6911 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6912
6913 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6914 if (reduction_type == TREE_CODE_REDUCTION)
6915 {
6916 /* Check whether it's ok to change the order of the computation.
6917 Generally, when vectorizing a reduction we change the order of the
6918 computation. This may change the behavior of the program in some
6919 cases, so we need to check that this is ok. One exception is when
6920 vectorizing an outer-loop: the inner-loop is executed sequentially,
6921 and therefore vectorizing reductions in the inner-loop during
6922 outer-loop vectorization is safe. Likewise when we are vectorizing
6923 a series of reductions using SLP and the VF is one, the reductions
6924 are performed in scalar order. */
6925 if (slp_node
6926 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6927 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6928 ;
6929 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6930 {
6931 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6932 is not directly used in stmt. */
6933 if (!only_slp_reduc_chain
6934 && reduc_chain_length != 1)
6935 {
6936 if (dump_enabled_p ())
6937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6938 "in-order reduction chain without SLP.\n");
6939 return false;
6940 }
6941 STMT_VINFO_REDUC_TYPE (reduc_info)
6942 = reduction_type = FOLD_LEFT_REDUCTION;
6943 }
6944 else if (!commutative_tree_code (orig_code)
6945 || !associative_tree_code (orig_code))
6946 {
6947 if (dump_enabled_p ())
6948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6949 "reduction: not commutative/associative");
6950 return false;
6951 }
6952 }
6953
6954 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6955 && ncopies > 1)
6956 {
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959 "multiple types in double reduction or condition "
6960 "reduction or fold-left reduction.\n");
6961 return false;
6962 }
6963
6964 internal_fn reduc_fn = IFN_LAST;
6965 if (reduction_type == TREE_CODE_REDUCTION
6966 || reduction_type == FOLD_LEFT_REDUCTION
6967 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6968 || reduction_type == CONST_COND_REDUCTION)
6969 {
6970 if (reduction_type == FOLD_LEFT_REDUCTION
6971 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6972 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6973 {
6974 if (reduc_fn != IFN_LAST
6975 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6976 OPTIMIZE_FOR_SPEED))
6977 {
6978 if (dump_enabled_p ())
6979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6980 "reduc op not supported by target.\n");
6981
6982 reduc_fn = IFN_LAST;
6983 }
6984 }
6985 else
6986 {
6987 if (!nested_cycle || double_reduc)
6988 {
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6991 "no reduc code for scalar code.\n");
6992
6993 return false;
6994 }
6995 }
6996 }
6997 else if (reduction_type == COND_REDUCTION)
6998 {
6999 int scalar_precision
7000 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7001 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7002 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7003 vectype_out);
7004
7005 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7006 OPTIMIZE_FOR_SPEED))
7007 reduc_fn = IFN_REDUC_MAX;
7008 }
7009 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7010
7011 if (reduction_type != EXTRACT_LAST_REDUCTION
7012 && (!nested_cycle || double_reduc)
7013 && reduc_fn == IFN_LAST
7014 && !nunits_out.is_constant ())
7015 {
7016 if (dump_enabled_p ())
7017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7018 "missing target support for reduction on"
7019 " variable-length vectors.\n");
7020 return false;
7021 }
7022
7023 /* For SLP reductions, see if there is a neutral value we can use. */
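/* E.g. 0 for PLUS_EXPR, 1 for MULT_EXPR, all-ones for BIT_AND_EXPR
   (a sketch; the exact choice is made by neutral_op_for_slp_reduction
   below).  */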
7024 tree neutral_op = NULL_TREE;
7025 if (slp_node)
7026 neutral_op = neutral_op_for_slp_reduction
7027 (slp_node_instance->reduc_phis, vectype_out, orig_code,
7028 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7029
7030 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7031 {
7032 /* We can't support in-order reductions of code such as this:
7033
7034 for (int i = 0; i < n1; ++i)
7035 for (int j = 0; j < n2; ++j)
7036 l += a[j];
7037
7038 since GCC effectively transforms the loop when vectorizing:
7039
7040 for (int i = 0; i < n1 / VF; ++i)
7041 for (int j = 0; j < n2; ++j)
7042 for (int k = 0; k < VF; ++k)
7043 l += a[j];
7044
7045 which is a reassociation of the original operation. */
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7048 "in-order double reduction not supported.\n");
7049
7050 return false;
7051 }
7052
7053 if (reduction_type == FOLD_LEFT_REDUCTION
7054 && slp_node
7055 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7056 {
7057 /* We cannot use in-order reductions in this case because there is
7058 an implicit reassociation of the operations involved. */
7059 if (dump_enabled_p ())
7060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7061 "in-order unchained SLP reductions not supported.\n");
7062 return false;
7063 }
7064
7065 /* For double reductions, and for SLP reductions with a neutral value,
7066 we construct a variable-length initial vector by loading a vector
7067 full of the neutral value and then shift-and-inserting the start
7068 values into the low-numbered elements. */
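/* E.g. for a summation with start value s and neutral value 0 the
   initial vector contains s in a single low-numbered lane and 0 in all
   other lanes (a sketch of what IFN_VEC_SHL_INSERT produces).  */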
7069 if ((double_reduc || neutral_op)
7070 && !nunits_out.is_constant ()
7071 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7072 vectype_out, OPTIMIZE_FOR_SPEED))
7073 {
7074 if (dump_enabled_p ())
7075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7076 "reduction on variable-length vectors requires"
7077 " target support for a vector-shift-and-insert"
7078 " operation.\n");
7079 return false;
7080 }
7081
7082 /* Check extra constraints for variable-length unchained SLP reductions. */
7083 if (STMT_SLP_TYPE (stmt_info)
7084 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7085 && !nunits_out.is_constant ())
7086 {
7087 /* We checked above that we could build the initial vector when
7088 there's a neutral element value. Check here for the case in
7089 which each SLP statement has its own initial value and in which
7090 that value needs to be repeated for every instance of the
7091 statement within the initial vector. */
7092 unsigned int group_size = SLP_TREE_LANES (slp_node);
7093 if (!neutral_op
7094 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7095 TREE_TYPE (vectype_out)))
7096 {
7097 if (dump_enabled_p ())
7098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7099 "unsupported form of SLP reduction for"
7100 " variable-length vectors: cannot build"
7101 " initial vector.\n");
7102 return false;
7103 }
7104 /* The epilogue code relies on the number of elements being a multiple
7105 of the group size. The duplicate-and-interleave approach to setting
7106 up the initial vector does too. */
7107 if (!multiple_p (nunits_out, group_size))
7108 {
7109 if (dump_enabled_p ())
7110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111 "unsupported form of SLP reduction for"
7112 " variable-length vectors: the vector size"
7113 " is not a multiple of the number of results.\n");
7114 return false;
7115 }
7116 }
7117
7118 if (reduction_type == COND_REDUCTION)
7119 {
7120 widest_int ni;
7121
7122 if (! max_loop_iterations (loop, &ni))
7123 {
7124 if (dump_enabled_p ())
7125 dump_printf_loc (MSG_NOTE, vect_location,
7126 "loop count not known, cannot create cond "
7127 "reduction.\n");
7128 return false;
7129 }
7130 /* Convert backedges to iterations. */
7131 ni += 1;
7132
7133 /* The additional index will be the same type as the condition. Check
7134 that the loop can fit into this less one (because we'll use up the
7135 zero slot for when there are no matches). */
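/* E.g. with a 16-bit scalar type the index type is 16-bit unsigned, so
   the loop must run fewer than 65535 iterations for all indexes plus the
   reserved zero slot to fit (sketch).  */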
7136 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7137 if (wi::geu_p (ni, wi::to_widest (max_index)))
7138 {
7139 if (dump_enabled_p ())
7140 dump_printf_loc (MSG_NOTE, vect_location,
7141 "loop size is greater than data size.\n");
7142 return false;
7143 }
7144 }
7145
7146 /* In case the vectorization factor (VF) is bigger than the number
7147 of elements that we can fit in a vectype (nunits), we have to generate
7148 more than one vector stmt, i.e. we need to "unroll" the
7149 vector stmt by a factor VF/nunits. For more details see documentation
7150 in vectorizable_operation. */
7151
7152 /* If the reduction is used in an outer loop we need to generate
7153 VF intermediate results, like so (e.g. for ncopies=2):
7154 r0 = phi (init, r0)
7155 r1 = phi (init, r1)
7156 r0 = x0 + r0;
7157 r1 = x1 + r1;
7158 (i.e. we generate VF results in 2 registers).
7159 In this case we have a separate def-use cycle for each copy, and therefore
7160 for each copy we get the vector def for the reduction variable from the
7161 respective phi node created for this copy.
7162
7163 Otherwise (the reduction is unused in the loop nest), we can combine
7164 together intermediate results, like so (e.g. for ncopies=2):
7165 r = phi (init, r)
7166 r = x0 + r;
7167 r = x1 + r;
7168 (i.e. we generate VF/2 results in a single register).
7169 In this case for each copy we get the vector def for the reduction variable
7170 from the vectorized reduction operation generated in the previous iteration.
7171
7172 This only works when we see both the reduction PHI and its only consumer
7173 in vectorizable_reduction and there are no intermediate stmts
7174 participating. */
7175 if (ncopies > 1
7176 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7177 && reduc_chain_length == 1)
7178 single_defuse_cycle = true;
7179
7180 if (single_defuse_cycle || lane_reduc_code_p)
7181 {
7182 gcc_assert (code != COND_EXPR);
7183
7184 /* 4. Supportable by target? */
7185 bool ok = true;
7186
7187 /* 4.1. check support for the operation in the loop */
7188 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7189 if (!optab)
7190 {
7191 if (dump_enabled_p ())
7192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7193 "no optab.\n");
7194 ok = false;
7195 }
7196
7197 machine_mode vec_mode = TYPE_MODE (vectype_in);
7198 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7199 {
7200 if (dump_enabled_p ())
7201 dump_printf (MSG_NOTE, "op not supported by target.\n");
7202 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7203 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7204 ok = false;
7205 else
7206 if (dump_enabled_p ())
7207 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7208 }
7209
7210 /* Worthwhile without SIMD support? */
7211 if (ok
7212 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7213 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7214 {
7215 if (dump_enabled_p ())
7216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7217 "not worthwhile without SIMD support.\n");
7218 ok = false;
7219 }
7220
7221 /* lane-reducing operations have to go through vect_transform_reduction.
7222 For the other cases try without the single cycle optimization. */
7223 if (!ok)
7224 {
7225 if (lane_reduc_code_p)
7226 return false;
7227 else
7228 single_defuse_cycle = false;
7229 }
7230 }
7231 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7232
7233 /* If the reduction stmt is one of the patterns that have lane
7234 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7235 if ((ncopies > 1 && ! single_defuse_cycle)
7236 && lane_reduc_code_p)
7237 {
7238 if (dump_enabled_p ())
7239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7240 "multi def-use cycle not possible for lane-reducing "
7241 "reduction operation\n");
7242 return false;
7243 }
7244
7245 if (slp_node
7246 && !(!single_defuse_cycle
7247 && code != DOT_PROD_EXPR
7248 && code != WIDEN_SUM_EXPR
7249 && code != SAD_EXPR
7250 && reduction_type != FOLD_LEFT_REDUCTION))
7251 for (i = 0; i < op_type; i++)
7252 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7253 {
7254 if (dump_enabled_p ())
7255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7256 "incompatible vector types for invariants\n");
7257 return false;
7258 }
7259
7260 if (slp_node)
7261 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7262 else
7263 vec_num = 1;
7264
7265 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7266 reduction_type, ncopies, cost_vec);
7267 /* Cost the reduction op inside the loop if transformed via
7268 vect_transform_reduction. Otherwise this is costed by the
7269 separate vectorizable_* routines. */
7270 if (single_defuse_cycle
7271 || code == DOT_PROD_EXPR
7272 || code == WIDEN_SUM_EXPR
7273 || code == SAD_EXPR)
7274 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7275
7276 if (dump_enabled_p ()
7277 && reduction_type == FOLD_LEFT_REDUCTION)
7278 dump_printf_loc (MSG_NOTE, vect_location,
7279 "using an in-order (fold-left) reduction.\n");
7280 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7281 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7282 reductions go through their own vectorizable_* routines. */
7283 if (!single_defuse_cycle
7284 && code != DOT_PROD_EXPR
7285 && code != WIDEN_SUM_EXPR
7286 && code != SAD_EXPR
7287 && reduction_type != FOLD_LEFT_REDUCTION)
7288 {
7289 stmt_vec_info tem
7290 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7291 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7292 {
7293 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7294 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7295 }
7296 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7297 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7298 }
7299 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7300 {
7301 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7302 internal_fn cond_fn = get_conditional_internal_fn (code);
7303
7304 if (reduction_type != FOLD_LEFT_REDUCTION
7305 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7306 && (cond_fn == IFN_LAST
7307 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7308 OPTIMIZE_FOR_SPEED)))
7309 {
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 "can't operate on partial vectors because"
7313 " no conditional operation is available.\n");
7314 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7315 }
7316 else if (reduction_type == FOLD_LEFT_REDUCTION
7317 && reduc_fn == IFN_LAST
7318 && !expand_vec_cond_expr_p (vectype_in,
7319 truth_type_for (vectype_in),
7320 SSA_NAME))
7321 {
7322 if (dump_enabled_p ())
7323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7324 "can't operate on partial vectors because"
7325 " no conditional operation is available.\n");
7326 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7327 }
7328 else
7329 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7330 vectype_in, NULL);
7331 }
7332 return true;
7333 }
7334
7335 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7336 value. */
7337
7338 bool
7339 vect_transform_reduction (loop_vec_info loop_vinfo,
7340 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7341 gimple **vec_stmt, slp_tree slp_node)
7342 {
7343 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7344 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7345 int i;
7346 int ncopies;
7347 int vec_num;
7348
7349 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7350 gcc_assert (reduc_info->is_reduc_info);
7351
7352 if (nested_in_vect_loop_p (loop, stmt_info))
7353 {
7354 loop = loop->inner;
7355 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7356 }
7357
7358 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7359 enum tree_code code = gimple_assign_rhs_code (stmt);
7360 int op_type = TREE_CODE_LENGTH (code);
7361
7362 /* Flatten RHS. */
7363 tree ops[3];
7364 switch (get_gimple_rhs_class (code))
7365 {
7366 case GIMPLE_TERNARY_RHS:
7367 ops[2] = gimple_assign_rhs3 (stmt);
7368 /* Fall thru. */
7369 case GIMPLE_BINARY_RHS:
7370 ops[0] = gimple_assign_rhs1 (stmt);
7371 ops[1] = gimple_assign_rhs2 (stmt);
7372 break;
7373 default:
7374 gcc_unreachable ();
7375 }
7376
7377 /* All uses but the last are expected to be defined in the loop.
7378 The last use is the reduction variable. In case of nested cycle this
7379 assumption is not true: we use reduc_index to record the index of the
7380 reduction variable. */
7381 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7382 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7383 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7384 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7385
7386 if (slp_node)
7387 {
7388 ncopies = 1;
7389 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7390 }
7391 else
7392 {
7393 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7394 vec_num = 1;
7395 }
7396
7397 internal_fn cond_fn = get_conditional_internal_fn (code);
7398 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7399 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7400
7401 /* Transform. */
7402 tree new_temp = NULL_TREE;
7403 auto_vec<tree> vec_oprnds0;
7404 auto_vec<tree> vec_oprnds1;
7405 auto_vec<tree> vec_oprnds2;
7406 tree def0;
7407
7408 if (dump_enabled_p ())
7409 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7410
7411 /* FORNOW: Multiple types are not supported for condition. */
7412 if (code == COND_EXPR)
7413 gcc_assert (ncopies == 1);
7414
7415 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7416
7417 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7418 if (reduction_type == FOLD_LEFT_REDUCTION)
7419 {
7420 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7421 return vectorize_fold_left_reduction
7422 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7423 reduc_fn, ops, vectype_in, reduc_index, masks);
7424 }
7425
7426 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7427 gcc_assert (single_defuse_cycle
7428 || code == DOT_PROD_EXPR
7429 || code == WIDEN_SUM_EXPR
7430 || code == SAD_EXPR);
7431
7432 /* Create the destination vector */
7433 tree scalar_dest = gimple_assign_lhs (stmt);
7434 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7435
7436 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7437 single_defuse_cycle && reduc_index == 0
7438 ? NULL_TREE : ops[0], &vec_oprnds0,
7439 single_defuse_cycle && reduc_index == 1
7440 ? NULL_TREE : ops[1], &vec_oprnds1,
7441 op_type == ternary_op
7442 && !(single_defuse_cycle && reduc_index == 2)
7443 ? ops[2] : NULL_TREE, &vec_oprnds2);
7444 if (single_defuse_cycle)
7445 {
7446 gcc_assert (!slp_node);
7447 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7448 ops[reduc_index],
7449 reduc_index == 0 ? &vec_oprnds0
7450 : (reduc_index == 1 ? &vec_oprnds1
7451 : &vec_oprnds2));
7452 }
7453
7454 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7455 {
7456 gimple *new_stmt;
7457 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7458 if (masked_loop_p && !mask_by_cond_expr)
7459 {
7460 /* Make sure that the reduction accumulator is vop[0]. */
7461 if (reduc_index == 1)
7462 {
7463 gcc_assert (commutative_tree_code (code));
7464 std::swap (vop[0], vop[1]);
7465 }
7466 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7467 vectype_in, i);
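/* Inactive lanes pass the accumulator through unchanged, e.g. (sketch)
   acc_1 = .COND_ADD (loop_mask, acc_0, x_1, acc_0);  */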
7468 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7469 vop[0], vop[1], vop[0]);
7470 new_temp = make_ssa_name (vec_dest, call);
7471 gimple_call_set_lhs (call, new_temp);
7472 gimple_call_set_nothrow (call, true);
7473 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7474 new_stmt = call;
7475 }
7476 else
7477 {
7478 if (op_type == ternary_op)
7479 vop[2] = vec_oprnds2[i];
7480
7481 if (masked_loop_p && mask_by_cond_expr)
7482 {
7483 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7484 vectype_in, i);
7485 build_vect_cond_expr (code, vop, mask, gsi);
7486 }
7487
7488 new_stmt = gimple_build_assign (vec_dest, code,
7489 vop[0], vop[1], vop[2]);
7490 new_temp = make_ssa_name (vec_dest, new_stmt);
7491 gimple_assign_set_lhs (new_stmt, new_temp);
7492 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7493 }
7494
7495 if (slp_node)
7496 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7497 else if (single_defuse_cycle
7498 && i < ncopies - 1)
7499 {
7500 if (reduc_index == 0)
7501 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7502 else if (reduc_index == 1)
7503 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7504 else if (reduc_index == 2)
7505 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7506 }
7507 else
7508 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7509 }
7510
7511 if (!slp_node)
7512 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7513
7514 return true;
7515 }
7516
7517 /* Transform phase of a cycle PHI. */
7518
7519 bool
7520 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7521 stmt_vec_info stmt_info, gimple **vec_stmt,
7522 slp_tree slp_node, slp_instance slp_node_instance)
7523 {
7524 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7525 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7526 int i;
7527 int ncopies;
7528 int j;
7529 bool nested_cycle = false;
7530 int vec_num;
7531
7532 if (nested_in_vect_loop_p (loop, stmt_info))
7533 {
7534 loop = loop->inner;
7535 nested_cycle = true;
7536 }
7537
7538 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7539 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7540 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7541 gcc_assert (reduc_info->is_reduc_info);
7542
7543 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7544 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7545 /* Leave the scalar phi in place. */
7546 return true;
7547
7548 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7549 /* For a nested cycle we do not fill the above. */
7550 if (!vectype_in)
7551 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7552 gcc_assert (vectype_in);
7553
7554 if (slp_node)
7555 {
7556 /* The size vect_schedule_slp_instance computes is off for us. */
7557 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7558 * SLP_TREE_LANES (slp_node), vectype_in);
7559 ncopies = 1;
7560 }
7561 else
7562 {
7563 vec_num = 1;
7564 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7565 }
7566
7567 /* Check whether we should use a single PHI node and accumulate
7568 vectors to one before the backedge. */
7569 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7570 ncopies = 1;
7571
7572 /* Create the destination vector */
7573 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7574 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7575 vectype_out);
7576
7577 /* Get the loop-entry arguments. */
7578 tree vec_initial_def;
7579 auto_vec<tree> vec_initial_defs;
7580 if (slp_node)
7581 {
7582 vec_initial_defs.reserve (vec_num);
7583 if (nested_cycle)
7584 {
7585 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7586 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7587 &vec_initial_defs);
7588 }
7589 else
7590 {
7591 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7592 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7593 tree neutral_op
7594 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7595 STMT_VINFO_REDUC_CODE (reduc_info),
7596 first != NULL);
7597 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7598 &vec_initial_defs, vec_num,
7599 first != NULL, neutral_op);
7600 }
7601 }
7602 else
7603 {
7604 /* Get at the scalar def before the loop, that defines the initial
7605 value of the reduction variable. */
7606 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7607 loop_preheader_edge (loop));
7608 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7609 and we can't use zero for induc_val, use initial_def. Similarly
7610 for REDUC_MIN and initial_def larger than the base. */
7611 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7612 {
7613 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7614 if (TREE_CODE (initial_def) == INTEGER_CST
7615 && !integer_zerop (induc_val)
7616 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7617 && tree_int_cst_lt (initial_def, induc_val))
7618 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7619 && tree_int_cst_lt (induc_val, initial_def))))
7620 {
7621 induc_val = initial_def;
7622 /* Communicate that we used the initial_def to epilogue
7623 generation. */
7624 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7625 }
7626 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7627 vec_initial_defs.create (ncopies);
7628 for (i = 0; i < ncopies; ++i)
7629 vec_initial_defs.quick_push (vec_initial_def);
7630 }
7631 else if (nested_cycle)
7632 {
7633 /* Do not use an adjustment def as that case is not supported
7634 correctly if ncopies is not one. */
7635 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7636 ncopies, initial_def,
7637 &vec_initial_defs);
7638 }
7639 else
7640 {
7641 tree adjustment_def = NULL_TREE;
7642 tree *adjustment_defp = &adjustment_def;
7643 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7644 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7645 adjustment_defp = NULL;
7646 vec_initial_def
7647 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7648 initial_def, adjustment_defp);
7649 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7650 vec_initial_defs.create (ncopies);
7651 for (i = 0; i < ncopies; ++i)
7652 vec_initial_defs.quick_push (vec_initial_def);
7653 }
7654 }
7655
7656 /* Generate the reduction PHIs upfront. */
7657 for (i = 0; i < vec_num; i++)
7658 {
7659 tree vec_init_def = vec_initial_defs[i];
7660 for (j = 0; j < ncopies; j++)
7661 {
7662 /* Create the reduction-phi that defines the reduction
7663 operand. */
7664 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7665
7666 /* Set the loop-entry arg of the reduction-phi. */
7667 if (j != 0 && nested_cycle)
7668 vec_init_def = vec_initial_defs[j];
7669 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7670 UNKNOWN_LOCATION);
7671
7672 /* The loop-latch arg is set in epilogue processing. */
7673
7674 if (slp_node)
7675 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7676 else
7677 {
7678 if (j == 0)
7679 *vec_stmt = new_phi;
7680 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7681 }
7682 }
7683 }
7684
7685 return true;
7686 }
7687
7688 /* Vectorizes LC PHIs. */
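/* A loop-closed (LC) PHI has a single argument, e.g. (sketch)
     x_4 = PHI <x_3(loop exit edge)>
   and is vectorized by creating an equivalent single-argument vector PHI
   fed by the vectorized definitions of x_3.  */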
7689
7690 bool
7691 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7692 stmt_vec_info stmt_info, gimple **vec_stmt,
7693 slp_tree slp_node)
7694 {
7695 if (!loop_vinfo
7696 || !is_a <gphi *> (stmt_info->stmt)
7697 || gimple_phi_num_args (stmt_info->stmt) != 1)
7698 return false;
7699
7700 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7701 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7702 return false;
7703
7704 if (!vec_stmt) /* transformation not required. */
7705 {
7706 /* Deal with copies from externs or constants that are disguised as
7707 loop-closed PHI nodes (PR97886). */
7708 if (slp_node
7709 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7710 SLP_TREE_VECTYPE (slp_node)))
7711 {
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714 "incompatible vector types for invariants\n");
7715 return false;
7716 }
7717 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7718 return true;
7719 }
7720
7721 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7722 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7723 basic_block bb = gimple_bb (stmt_info->stmt);
7724 edge e = single_pred_edge (bb);
7725 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7726 auto_vec<tree> vec_oprnds;
7727 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7728 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7729 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7730 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7731 {
7732 /* Create the vectorized LC PHI node. */
7733 gphi *new_phi = create_phi_node (vec_dest, bb);
7734 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7735 if (slp_node)
7736 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7737 else
7738 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7739 }
7740 if (!slp_node)
7741 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7742
7743 return true;
7744 }
7745
7746 /* Vectorizes PHIs. */
7747
7748 bool
7749 vectorizable_phi (vec_info *,
7750 stmt_vec_info stmt_info, gimple **vec_stmt,
7751 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7752 {
7753 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7754 return false;
7755
7756 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7757 return false;
7758
7759 tree vectype = SLP_TREE_VECTYPE (slp_node);
7760
7761 if (!vec_stmt) /* transformation not required. */
7762 {
7763 slp_tree child;
7764 unsigned i;
7765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7766 if (!child)
7767 {
7768 if (dump_enabled_p ())
7769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7770 "PHI node with unvectorized backedge def\n");
7771 return false;
7772 }
7773 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7774 {
7775 if (dump_enabled_p ())
7776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7777 "incompatible vector types for invariants\n");
7778 return false;
7779 }
7780 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7781 vector_stmt, stmt_info, vectype, 0, vect_body);
7782 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7783 return true;
7784 }
7785
7786 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7787 basic_block bb = gimple_bb (stmt_info->stmt);
7788 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7789 auto_vec<gphi *> new_phis;
7790 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7791 {
7792 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7793
7794 /* Skip not yet vectorized defs. */
7795 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7796 && SLP_TREE_VEC_STMTS (child).is_empty ())
7797 continue;
7798
7799 auto_vec<tree> vec_oprnds;
7800 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7801 if (!new_phis.exists ())
7802 {
7803 new_phis.create (vec_oprnds.length ());
7804 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7805 {
7806 /* Create the vectorized PHI node. */
7807 new_phis.quick_push (create_phi_node (vec_dest, bb));
7808 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7809 }
7810 }
7811 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7812 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7813 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7814 }
7815 /* We should have at least one already vectorized child. */
7816 gcc_assert (new_phis.exists ());
7817
7818 return true;
7819 }
7820
7821
7822 /* Function vect_min_worthwhile_factor.
7823
7824 For a loop where we could vectorize the operation indicated by CODE,
7825 return the minimum vectorization factor that makes it worthwhile
7826 to use generic vectors. */
7827 static unsigned int
7828 vect_min_worthwhile_factor (enum tree_code code)
7829 {
7830 switch (code)
7831 {
7832 case PLUS_EXPR:
7833 case MINUS_EXPR:
7834 case NEGATE_EXPR:
7835 return 4;
7836
7837 case BIT_AND_EXPR:
7838 case BIT_IOR_EXPR:
7839 case BIT_XOR_EXPR:
7840 case BIT_NOT_EXPR:
7841 return 2;
7842
7843 default:
7844 return INT_MAX;
7845 }
7846 }
7847
7848 /* Return true if VINFO indicates we are doing loop vectorization and if
7849 it is worth decomposing CODE operations into scalar operations for
7850 that loop's vectorization factor. */
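/* For example (a sketch of the heuristic above), with a constant
   vectorization factor of 4 a PLUS_EXPR is considered worthwhile
   (4 >= 4), whereas with a factor of 2 it is not.  */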
7851
7852 bool
7853 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7854 {
7855 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7856 unsigned HOST_WIDE_INT value;
7857 return (loop_vinfo
7858 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7859 && value >= vect_min_worthwhile_factor (code));
7860 }
7861
7862 /* Function vectorizable_induction
7863
7864 Check if STMT_INFO performs an induction computation that can be vectorized.
7865 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7866 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7867 Return true if STMT_INFO is vectorizable in this way. */
7868
7869 bool
7870 vectorizable_induction (loop_vec_info loop_vinfo,
7871 stmt_vec_info stmt_info,
7872 gimple **vec_stmt, slp_tree slp_node,
7873 stmt_vector_for_cost *cost_vec)
7874 {
7875 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7876 unsigned ncopies;
7877 bool nested_in_vect_loop = false;
7878 class loop *iv_loop;
7879 tree vec_def;
7880 edge pe = loop_preheader_edge (loop);
7881 basic_block new_bb;
7882 tree new_vec, vec_init, vec_step, t;
7883 tree new_name;
7884 gimple *new_stmt;
7885 gphi *induction_phi;
7886 tree induc_def, vec_dest;
7887 tree init_expr, step_expr;
7888 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7889 unsigned i;
7890 tree expr;
7891 gimple_stmt_iterator si;
7892
7893 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7894 if (!phi)
7895 return false;
7896
7897 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7898 return false;
7899
7900 /* Make sure it was recognized as induction computation. */
7901 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7902 return false;
7903
7904 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7905 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7906
7907 if (slp_node)
7908 ncopies = 1;
7909 else
7910 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7911 gcc_assert (ncopies >= 1);
7912
7913 /* FORNOW. These restrictions should be relaxed. */
7914 if (nested_in_vect_loop_p (loop, stmt_info))
7915 {
7916 imm_use_iterator imm_iter;
7917 use_operand_p use_p;
7918 gimple *exit_phi;
7919 edge latch_e;
7920 tree loop_arg;
7921
7922 if (ncopies > 1)
7923 {
7924 if (dump_enabled_p ())
7925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7926 "multiple types in nested loop.\n");
7927 return false;
7928 }
7929
7930 exit_phi = NULL;
7931 latch_e = loop_latch_edge (loop->inner);
7932 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7933 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7934 {
7935 gimple *use_stmt = USE_STMT (use_p);
7936 if (is_gimple_debug (use_stmt))
7937 continue;
7938
7939 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7940 {
7941 exit_phi = use_stmt;
7942 break;
7943 }
7944 }
7945 if (exit_phi)
7946 {
7947 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7948 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7949 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7950 {
7951 if (dump_enabled_p ())
7952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7953 "inner-loop induction only used outside "
7954 "of the outer vectorized loop.\n");
7955 return false;
7956 }
7957 }
7958
7959 nested_in_vect_loop = true;
7960 iv_loop = loop->inner;
7961 }
7962 else
7963 iv_loop = loop;
7964 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7965
7966 if (slp_node && !nunits.is_constant ())
7967 {
7968 /* The current SLP code creates the step value element-by-element. */
7969 if (dump_enabled_p ())
7970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7971 "SLP induction not supported for variable-length"
7972 " vectors.\n");
7973 return false;
7974 }
7975
7976 if (!vec_stmt) /* transformation not required. */
7977 {
7978 unsigned inside_cost = 0, prologue_cost = 0;
7979 if (slp_node)
7980 {
7981 /* We eventually need to set a vector type on invariant
7982 arguments. */
7983 unsigned j;
7984 slp_tree child;
7985 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7986 if (!vect_maybe_update_slp_op_vectype
7987 (child, SLP_TREE_VECTYPE (slp_node)))
7988 {
7989 if (dump_enabled_p ())
7990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7991 "incompatible vector types for "
7992 "invariants\n");
7993 return false;
7994 }
7995 /* loop cost for vec_loop. */
7996 inside_cost
7997 = record_stmt_cost (cost_vec,
7998 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7999 vector_stmt, stmt_info, 0, vect_body);
8000 /* prologue cost for vec_init (if not nested) and step. */
8001 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8002 scalar_to_vec,
8003 stmt_info, 0, vect_prologue);
8004 }
8005 else /* if (!slp_node) */
8006 {
8007 /* loop cost for vec_loop. */
8008 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8009 stmt_info, 0, vect_body);
8010 /* prologue cost for vec_init and vec_step. */
8011 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8012 stmt_info, 0, vect_prologue);
8013 }
8014 if (dump_enabled_p ())
8015 dump_printf_loc (MSG_NOTE, vect_location,
8016 "vect_model_induction_cost: inside_cost = %d, "
8017 "prologue_cost = %d .\n", inside_cost,
8018 prologue_cost);
8019
8020 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8021 DUMP_VECT_SCOPE ("vectorizable_induction");
8022 return true;
8023 }
8024
8025 /* Transform. */
8026
8027 /* Compute a vector variable, initialized with the first VF values of
8028 the induction variable. E.g., for an iv with IV_PHI='X' and
8029 evolution S, for a vector of 4 units, we want to compute:
8030 [X, X + S, X + 2*S, X + 3*S]. */
8031
8032 if (dump_enabled_p ())
8033 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8034
8035 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8036 gcc_assert (step_expr != NULL_TREE);
8037 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8038
8039 pe = loop_preheader_edge (iv_loop);
8040 /* Find the first insertion point in the BB. */
8041 basic_block bb = gimple_bb (phi);
8042 si = gsi_after_labels (bb);
8043
8044 /* For SLP induction we have to generate several IVs as for example
8045 with group size 3 we need
8046 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8047 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8048 if (slp_node)
8049 {
8050 /* Enforced above. */
8051 unsigned int const_nunits = nunits.to_constant ();
8052
8053 /* The initial values are vectorized, but any lanes > group_size
8054 need adjustment. */
8055 slp_tree init_node
8056 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8057
8058 /* Gather steps. Since we do not vectorize inductions as
8059 cycles we have to reconstruct the step from SCEV data. */
8060 unsigned group_size = SLP_TREE_LANES (slp_node);
8061 tree *steps = XALLOCAVEC (tree, group_size);
8062 tree *inits = XALLOCAVEC (tree, group_size);
8063 stmt_vec_info phi_info;
8064 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8065 {
8066 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8067 if (!init_node)
8068 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8069 pe->dest_idx);
8070 }
8071
8072 /* Now generate the IVs. */
8073 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8074 gcc_assert ((const_nunits * nvects) % group_size == 0);
8075 unsigned nivs;
8076 if (nested_in_vect_loop)
8077 nivs = nvects;
8078 else
8079 {
8080 /* Compute the number of distinct IVs we need. First reduce
8081 group_size if it is a multiple of const_nunits so we get
8082 one IV for a group_size of 4 but const_nunits 2. */
8083 unsigned group_sizep = group_size;
8084 if (group_sizep % const_nunits == 0)
8085 group_sizep = group_sizep / const_nunits;
8086 nivs = least_common_multiple (group_sizep,
8087 const_nunits) / const_nunits;
8088 }
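/* As an illustration: for the group-size-3, four-element-vector example
   above, group_size is not a multiple of const_nunits, so
   nivs = lcm (3, 4) / 4 = 3 distinct IVs; for group_size 8 and
   const_nunits 4 the reduction gives group_sizep 2 and nivs = 1,
   i.e. a single IV.  */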
8089 tree stept = TREE_TYPE (step_vectype);
8090 tree lupdate_mul = NULL_TREE;
8091 if (!nested_in_vect_loop)
8092 {
8093 /* The number of iterations covered in one vector iteration. */
8094 unsigned lup_mul = (nvects * const_nunits) / group_size;
8095 lupdate_mul
8096 = build_vector_from_val (step_vectype,
8097 SCALAR_FLOAT_TYPE_P (stept)
8098 ? build_real_from_wide (stept, lup_mul,
8099 UNSIGNED)
8100 : build_int_cstu (stept, lup_mul));
8101 }
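/* For the same example (nvects 3, const_nunits 4, group_size 3) one
   vector iteration covers 3 * 4 / 3 = 4 scalar iterations, so the
   in-loop IV update adds 4 * step.  */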
8102 tree peel_mul = NULL_TREE;
8103 gimple_seq init_stmts = NULL;
8104 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8105 {
8106 if (SCALAR_FLOAT_TYPE_P (stept))
8107 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8108 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8109 else
8110 peel_mul = gimple_convert (&init_stmts, stept,
8111 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8112 peel_mul = gimple_build_vector_from_val (&init_stmts,
8113 step_vectype, peel_mul);
8114 }
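/* peel_mul, if set, is the number of scalar iterations skipped via
   masking for alignment, broadcast as a vector; it is added to step_mul
   below so the first active lane of each IV starts at the right value.  */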
8115 unsigned ivn;
8116 auto_vec<tree> vec_steps;
8117 for (ivn = 0; ivn < nivs; ++ivn)
8118 {
8119 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8120 tree_vector_builder init_elts (vectype, const_nunits, 1);
8121 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8122 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8123 {
8124 /* The scalar steps of the IVs. */
8125 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8126 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8127 step_elts.quick_push (elt);
8128 if (!init_node)
8129 {
8130 /* The scalar inits of the IVs if not vectorized. */
8131 elt = inits[(ivn*const_nunits + eltn) % group_size];
8132 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8133 TREE_TYPE (elt)))
8134 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8135 TREE_TYPE (vectype), elt);
8136 init_elts.quick_push (elt);
8137 }
8138 /* The number of steps to add to the initial values. */
8139 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8140 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8141 ? build_real_from_wide (stept,
8142 mul_elt, UNSIGNED)
8143 : build_int_cstu (stept, mul_elt));
8144 }
8145 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8146 vec_steps.safe_push (vec_step);
8147 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8148 if (peel_mul)
8149 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8150 step_mul, peel_mul);
8151 if (!init_node)
8152 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8153
8154 /* Create the induction-phi that defines the induction-operand. */
8155 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8156 "vec_iv_");
8157 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8158 induc_def = PHI_RESULT (induction_phi);
8159
8160 /* Create the iv update inside the loop */
8161 tree up = vec_step;
8162 if (lupdate_mul)
8163 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8164 vec_step, lupdate_mul);
8165 gimple_seq stmts = NULL;
8166 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8167 vec_def = gimple_build (&stmts,
8168 PLUS_EXPR, step_vectype, vec_def, up);
8169 vec_def = gimple_convert (&stmts, vectype, vec_def);
8170 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8171 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8172 UNKNOWN_LOCATION);
8173
8174 if (init_node)
8175 vec_init = vect_get_slp_vect_def (init_node, ivn);
8176 if (!nested_in_vect_loop
8177 && !integer_zerop (step_mul))
8178 {
8179 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8180 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8181 vec_step, step_mul);
8182 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8183 vec_def, up);
8184 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8185 }
8186
8187 /* Set the arguments of the phi node: */
8188 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8189
8190 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8191 }
8192 if (!nested_in_vect_loop)
8193 {
8194 /* Fill up to the number of vectors we need for the whole group. */
8195 nivs = least_common_multiple (group_size,
8196 const_nunits) / const_nunits;
8197 for (; ivn < nivs; ++ivn)
8198 {
8199 SLP_TREE_VEC_STMTS (slp_node)
8200 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8201 vec_steps.safe_push (vec_steps[0]);
8202 }
8203 }
8204
8205 /* Re-use IVs when we can. We are generating further vector
8206 stmts by adding VF' * stride to the IVs generated above. */
8207 if (ivn < nvects)
8208 {
8209 unsigned vfp
8210 = least_common_multiple (group_size, const_nunits) / group_size;
8211 tree lupdate_mul
8212 = build_vector_from_val (step_vectype,
8213 SCALAR_FLOAT_TYPE_P (stept)
8214 ? build_real_from_wide (stept,
8215 vfp, UNSIGNED)
8216 : build_int_cstu (stept, vfp));
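/* E.g. with group_size 3 and const_nunits 4, vfp = lcm (3, 4) / 3 = 4:
   the nivs vectors generated above cover 4 occurrences of each scalar
   IV, so each reused vector stmt adds 4 * step to the vector stmt nivs
   positions earlier.  */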
8217 for (; ivn < nvects; ++ivn)
8218 {
8219 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8220 tree def = gimple_get_lhs (iv);
8221 if (ivn < 2*nivs)
8222 vec_steps[ivn - nivs]
8223 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8224 vec_steps[ivn - nivs], lupdate_mul);
8225 gimple_seq stmts = NULL;
8226 def = gimple_convert (&stmts, step_vectype, def);
8227 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8228 def, vec_steps[ivn % nivs]);
8229 def = gimple_convert (&stmts, vectype, def);
8230 if (gimple_code (iv) == GIMPLE_PHI)
8231 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8232 else
8233 {
8234 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8235 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8236 }
8237 SLP_TREE_VEC_STMTS (slp_node)
8238 .quick_push (SSA_NAME_DEF_STMT (def));
8239 }
8240 }
8241
8242 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8243 gcc_assert (!new_bb);
8244
8245 return true;
8246 }
8247
8248 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8249 loop_preheader_edge (iv_loop));
8250
8251 gimple_seq stmts = NULL;
8252 if (!nested_in_vect_loop)
8253 {
8254 /* Convert the initial value to the IV update type. */
8255 tree new_type = TREE_TYPE (step_expr);
8256 init_expr = gimple_convert (&stmts, new_type, init_expr);
8257
8258 /* If we are using the loop mask to "peel" for alignment then we need
8259 to adjust the start value here. */
8260 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8261 if (skip_niters != NULL_TREE)
8262 {
8263 if (FLOAT_TYPE_P (vectype))
8264 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8265 skip_niters);
8266 else
8267 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8268 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8269 skip_niters, step_expr);
8270 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8271 init_expr, skip_step);
8272 }
8273 }
8274
8275 if (stmts)
8276 {
8277 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8278 gcc_assert (!new_bb);
8279 }
8280
8281 /* Create the vector that holds the initial_value of the induction. */
8282 if (nested_in_vect_loop)
8283 {
8284 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8285 been created during vectorization of previous stmts. We obtain it
8286 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8287 auto_vec<tree> vec_inits;
8288 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8289 init_expr, &vec_inits);
8290 vec_init = vec_inits[0];
8291 /* If the initial value is not of proper type, convert it. */
8292 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8293 {
8294 new_stmt
8295 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8296 vect_simple_var,
8297 "vec_iv_"),
8298 VIEW_CONVERT_EXPR,
8299 build1 (VIEW_CONVERT_EXPR, vectype,
8300 vec_init));
8301 vec_init = gimple_assign_lhs (new_stmt);
8302 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8303 new_stmt);
8304 gcc_assert (!new_bb);
8305 }
8306 }
8307 else
8308 {
8309 /* iv_loop is the loop to be vectorized. Create:
8310 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8311 stmts = NULL;
8312 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8313
8314 unsigned HOST_WIDE_INT const_nunits;
8315 if (nunits.is_constant (&const_nunits))
8316 {
8317 tree_vector_builder elts (step_vectype, const_nunits, 1);
8318 elts.quick_push (new_name);
8319 for (i = 1; i < const_nunits; i++)
8320 {
8321 /* Create: new_name_i = new_name + step_expr */
8322 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8323 new_name, step_expr);
8324 elts.quick_push (new_name);
8325 }
8326 /* Create a vector from [new_name_0, new_name_1, ...,
8327 new_name_nunits-1] */
8328 vec_init = gimple_build_vector (&stmts, &elts);
8329 }
8330 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8331 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8332 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8333 new_name, step_expr);
8334 else
8335 {
8336 /* Build:
8337 [base, base, base, ...]
8338 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8339 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8340 gcc_assert (flag_associative_math);
8341 tree index = build_index_vector (step_vectype, 0, 1);
8342 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8343 new_name);
8344 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8345 step_expr);
8346 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8347 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8348 vec_init, step_vec);
8349 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8350 vec_init, base_vec);
8351 }
8352 vec_init = gimple_convert (&stmts, vectype, vec_init);
8353
8354 if (stmts)
8355 {
8356 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8357 gcc_assert (!new_bb);
8358 }
8359 }
8360
8361
8362 /* Create the vector that holds the step of the induction. */
8363 if (nested_in_vect_loop)
8364 /* iv_loop is nested in the loop to be vectorized. Generate:
8365 vec_step = [S, S, S, S] */
8366 new_name = step_expr;
8367 else
8368 {
8369 /* iv_loop is the loop to be vectorized. Generate:
8370 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8371 gimple_seq seq = NULL;
8372 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8373 {
8374 expr = build_int_cst (integer_type_node, vf);
8375 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8376 }
8377 else
8378 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8379 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8380 expr, step_expr);
8381 if (seq)
8382 {
8383 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8384 gcc_assert (!new_bb);
8385 }
8386 }
8387
8388 t = unshare_expr (new_name);
8389 gcc_assert (CONSTANT_CLASS_P (new_name)
8390 || TREE_CODE (new_name) == SSA_NAME);
8391 new_vec = build_vector_from_val (step_vectype, t);
8392 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8393 new_vec, step_vectype, NULL);
8394
8395
8396 /* Create the following def-use cycle:
8397 loop prolog:
8398 vec_init = ...
8399 vec_step = ...
8400 loop:
8401 vec_iv = PHI <vec_init, vec_loop>
8402 ...
8403 STMT
8404 ...
8405 vec_loop = vec_iv + vec_step; */
8406
8407 /* Create the induction-phi that defines the induction-operand. */
8408 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8409 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8410 induc_def = PHI_RESULT (induction_phi);
8411
8412 /* Create the iv update inside the loop */
8413 stmts = NULL;
8414 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8415 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8416 vec_def = gimple_convert (&stmts, vectype, vec_def);
8417 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8418 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8419
8420 /* Set the arguments of the phi node: */
8421 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8422 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8423 UNKNOWN_LOCATION);
8424
8425 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8426 *vec_stmt = induction_phi;
8427
8428 /* In case the vectorization factor (VF) is bigger than the number
8429 of elements that we can fit in a vectype (nunits), we have to generate
8430 more than one vector stmt - i.e., we need to "unroll" the
8431 vector stmt by a factor VF/nunits. For more details see the
8432 documentation in vectorizable_operation. */
8433
8434 if (ncopies > 1)
8435 {
8436 gimple_seq seq = NULL;
8437 /* FORNOW. This restriction should be relaxed. */
8438 gcc_assert (!nested_in_vect_loop);
8439
8440 /* Create the vector that holds the step of the induction. */
8441 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8442 {
8443 expr = build_int_cst (integer_type_node, nunits);
8444 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8445 }
8446 else
8447 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8448 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8449 expr, step_expr);
8450 if (seq)
8451 {
8452 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8453 gcc_assert (!new_bb);
8454 }
8455
8456 t = unshare_expr (new_name);
8457 gcc_assert (CONSTANT_CLASS_P (new_name)
8458 || TREE_CODE (new_name) == SSA_NAME);
8459 new_vec = build_vector_from_val (step_vectype, t);
8460 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8461 new_vec, step_vectype, NULL);
8462
8463 vec_def = induc_def;
8464 for (i = 1; i < ncopies; i++)
8465 {
8466 /* vec_i = vec_prev + vec_step */
8467 gimple_seq stmts = NULL;
8468 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8469 vec_def = gimple_build (&stmts,
8470 PLUS_EXPR, step_vectype, vec_def, vec_step);
8471 vec_def = gimple_convert (&stmts, vectype, vec_def);
8472
8473 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8474 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8475 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8476 }
8477 }
8478
8479 if (dump_enabled_p ())
8480 dump_printf_loc (MSG_NOTE, vect_location,
8481 "transform induction: created def-use cycle: %G%G",
8482 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8483
8484 return true;
8485 }
8486
8487 /* Function vectorizable_live_operation.
8488
8489 STMT_INFO computes a value that is used outside the loop. Check if
8490 it can be supported. */
8491
8492 bool
8493 vectorizable_live_operation (vec_info *vinfo,
8494 stmt_vec_info stmt_info,
8495 gimple_stmt_iterator *gsi,
8496 slp_tree slp_node, slp_instance slp_node_instance,
8497 int slp_index, bool vec_stmt_p,
8498 stmt_vector_for_cost *cost_vec)
8499 {
8500 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8501 imm_use_iterator imm_iter;
8502 tree lhs, lhs_type, bitsize;
8503 tree vectype = (slp_node
8504 ? SLP_TREE_VECTYPE (slp_node)
8505 : STMT_VINFO_VECTYPE (stmt_info));
8506 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8507 int ncopies;
8508 gimple *use_stmt;
8509 auto_vec<tree> vec_oprnds;
8510 int vec_entry = 0;
8511 poly_uint64 vec_index = 0;
8512
8513 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8514
8515 /* If a stmt of a reduction is live, vectorize it via
8516 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8517 validity so just trigger the transform here. */
8518 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8519 {
8520 if (!vec_stmt_p)
8521 return true;
8522 if (slp_node)
8523 {
8524 /* For reduction chains the meta-info is attached to
8525 the group leader. */
8526 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8527 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8528 /* For SLP reductions we vectorize the epilogue for
8529 all involved stmts together. */
8530 else if (slp_index != 0)
8531 return true;
8532 else
8533 /* For SLP reductions the meta-info is attached to
8534 the representative. */
8535 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8536 }
8537 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8538 gcc_assert (reduc_info->is_reduc_info);
8539 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8540 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8541 return true;
8542 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8543 slp_node_instance);
8544 return true;
8545 }
8546
8547 /* If STMT is not relevant and it is a simple assignment and its inputs are
8548 invariant then it can remain in place, unvectorized. The original last
8549 scalar value that it computes will be used. */
8550 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8551 {
8552 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8553 if (dump_enabled_p ())
8554 dump_printf_loc (MSG_NOTE, vect_location,
8555 "statement is simple and uses invariant. Leaving in "
8556 "place.\n");
8557 return true;
8558 }
8559
8560 if (slp_node)
8561 ncopies = 1;
8562 else
8563 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8564
8565 if (slp_node)
8566 {
8567 gcc_assert (slp_index >= 0);
8568
8569 /* Get the last occurrence of the scalar index from the concatenation of
8570 all the slp vectors. Calculate which slp vector it is and the index
8571 within. */
8572 int num_scalar = SLP_TREE_LANES (slp_node);
8573 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8574 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
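/* For instance, with two four-lane vectors (num_vec 2, nunits 4),
   num_scalar 3 and slp_index 2 this gives pos = 2 * 4 - 3 + 2 = 7,
   i.e. lane 3 of the second vector (vec_entry 1, vec_index 3).  */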
8575
8576 /* Calculate which vector contains the result, and which lane of
8577 that vector we need. */
8578 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8579 {
8580 if (dump_enabled_p ())
8581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8582 "Cannot determine which vector holds the"
8583 " final result.\n");
8584 return false;
8585 }
8586 }
8587
8588 if (!vec_stmt_p)
8589 {
8590 /* No transformation required. */
8591 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8592 {
8593 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8594 OPTIMIZE_FOR_SPEED))
8595 {
8596 if (dump_enabled_p ())
8597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8598 "can't operate on partial vectors "
8599 "because the target doesn't support extract "
8600 "last reduction.\n");
8601 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8602 }
8603 else if (slp_node)
8604 {
8605 if (dump_enabled_p ())
8606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8607 "can't operate on partial vectors "
8608 "because an SLP statement is live after "
8609 "the loop.\n");
8610 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8611 }
8612 else if (ncopies > 1)
8613 {
8614 if (dump_enabled_p ())
8615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8616 "can't operate on partial vectors "
8617 "because ncopies is greater than 1.\n");
8618 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8619 }
8620 else
8621 {
8622 gcc_assert (ncopies == 1 && !slp_node);
8623 vect_record_loop_mask (loop_vinfo,
8624 &LOOP_VINFO_MASKS (loop_vinfo),
8625 1, vectype, NULL);
8626 }
8627 }
8628 /* ??? Enable for loop costing as well. */
8629 if (!loop_vinfo)
8630 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8631 0, vect_epilogue);
8632 return true;
8633 }
8634
8635 /* Use the lhs of the original scalar statement. */
8636 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8637 if (dump_enabled_p ())
8638 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8639 "stmt %G", stmt);
8640
8641 lhs = gimple_get_lhs (stmt);
8642 lhs_type = TREE_TYPE (lhs);
8643
8644 bitsize = vector_element_bits_tree (vectype);
8645
8646 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8647 tree vec_lhs, bitstart;
8648 gimple *vec_stmt;
8649 if (slp_node)
8650 {
8651 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8652
8653 /* Get the correct slp vectorized stmt. */
8654 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8655 vec_lhs = gimple_get_lhs (vec_stmt);
8656
8657 /* Get entry to use. */
8658 bitstart = bitsize_int (vec_index);
8659 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8660 }
8661 else
8662 {
8663 /* For multiple copies, get the last copy. */
8664 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8665 vec_lhs = gimple_get_lhs (vec_stmt);
8666
8667 /* Get the last lane in the vector. */
8668 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
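/* E.g. for four 32-bit elements the last lane starts at bit 3 * 32 = 96.  */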
8669 }
8670
8671 if (loop_vinfo)
8672 {
8673 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
8674 PHI requirement, insert one phi node for it. It looks like:
8675 loop;
8676 BB:
8677 # lhs' = PHI <lhs>
8678 ==>
8679 loop;
8680 BB:
8681 # vec_lhs' = PHI <vec_lhs>
8682 new_tree = lane_extract <vec_lhs', ...>;
8683 lhs' = new_tree; */
8684
8685 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8686 basic_block exit_bb = single_exit (loop)->dest;
8687 gcc_assert (single_pred_p (exit_bb));
8688
8689 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8690 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8691 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8692
8693 gimple_seq stmts = NULL;
8694 tree new_tree;
8695 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8696 {
8697 /* Emit:
8698
8699 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8700
8701 where VEC_LHS is the vectorized live-out result and MASK is
8702 the loop mask for the final iteration. */
8703 gcc_assert (ncopies == 1 && !slp_node);
8704 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8705 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8706 1, vectype, 0);
8707 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8708 mask, vec_lhs_phi);
8709
8710 /* Convert the extracted vector element to the scalar type. */
8711 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8712 }
8713 else
8714 {
8715 tree bftype = TREE_TYPE (vectype);
8716 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8717 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8718 new_tree = build3 (BIT_FIELD_REF, bftype,
8719 vec_lhs_phi, bitsize, bitstart);
8720 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8721 &stmts, true, NULL_TREE);
8722 }
8723
8724 if (stmts)
8725 {
8726 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8727 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8728
8729 /* Remove existing phi from lhs and create one copy from new_tree. */
8730 tree lhs_phi = NULL_TREE;
8731 gimple_stmt_iterator gsi;
8732 for (gsi = gsi_start_phis (exit_bb);
8733 !gsi_end_p (gsi); gsi_next (&gsi))
8734 {
8735 gimple *phi = gsi_stmt (gsi);
8736 if ((gimple_phi_arg_def (phi, 0) == lhs))
8737 {
8738 remove_phi_node (&gsi, false);
8739 lhs_phi = gimple_phi_result (phi);
8740 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8741 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8742 break;
8743 }
8744 }
8745 }
8746
8747 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8748 single-arg PHI, just replace all uses of the PHI result. This is necessary
8749 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8750 use_operand_p use_p;
8751 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8752 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8753 && !is_gimple_debug (use_stmt))
8754 {
8755 if (gimple_code (use_stmt) == GIMPLE_PHI
8756 && gimple_phi_num_args (use_stmt) == 1)
8757 {
8758 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8759 }
8760 else
8761 {
8762 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8763 SET_USE (use_p, new_tree);
8764 }
8765 update_stmt (use_stmt);
8766 }
8767 }
8768 else
8769 {
8770 /* For basic-block vectorization simply insert the lane-extraction. */
8771 tree bftype = TREE_TYPE (vectype);
8772 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8773 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8774 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8775 vec_lhs, bitsize, bitstart);
8776 gimple_seq stmts = NULL;
8777 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8778 &stmts, true, NULL_TREE);
8779 if (TREE_CODE (new_tree) == SSA_NAME
8780 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8781 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8782 if (is_a <gphi *> (vec_stmt))
8783 {
8784 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8785 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8786 }
8787 else
8788 {
8789 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8790 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8791 }
8792
8793 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8794 single-arg PHI, just replace all uses of the PHI result. This is necessary
8795 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8796 use_operand_p use_p;
8797 stmt_vec_info use_stmt_info;
8798 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8799 if (!is_gimple_debug (use_stmt)
8800 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8801 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8802 {
8803 /* ??? This can happen when the live lane ends up being
8804 used in a vector construction code-generated by an
8805 external SLP node (and code-generation for that already
8806 happened). See gcc.dg/vect/bb-slp-47.c.
8807 Doing this is what would happen if that vector CTOR
8808 were not code-generated yet so it is not too bad.
8809 ??? In fact we'd likely want to avoid this situation
8810 in the first place. */
8811 if (TREE_CODE (new_tree) == SSA_NAME
8812 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8813 && gimple_code (use_stmt) != GIMPLE_PHI
8814 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8815 use_stmt))
8816 {
8817 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8818 gcc_assert (code == CONSTRUCTOR
8819 || code == VIEW_CONVERT_EXPR
8820 || CONVERT_EXPR_CODE_P (code));
8821 if (dump_enabled_p ())
8822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8823 "Using original scalar computation for "
8824 "live lane because use preceeds vector "
8825 "def\n");
8826 continue;
8827 }
8828 /* ??? It can also happen that we end up pulling a def into
8829 a loop where replacing out-of-loop uses would require
8830 a new LC SSA PHI node. Retain the original scalar in
8831 those cases as well. PR98064. */
8832 if (TREE_CODE (new_tree) == SSA_NAME
8833 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8834 && (gimple_bb (use_stmt)->loop_father
8835 != gimple_bb (vec_stmt)->loop_father)
8836 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8837 gimple_bb (use_stmt)->loop_father))
8838 {
8839 if (dump_enabled_p ())
8840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8841 "Using original scalar computation for "
8842 "live lane because there is an out-of-loop "
8843 "definition for it\n");
8844 continue;
8845 }
8846 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8847 SET_USE (use_p, new_tree);
8848 update_stmt (use_stmt);
8849 }
8850 }
8851
8852 return true;
8853 }
8854
8855 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8856
8857 static void
8858 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8859 {
8860 ssa_op_iter op_iter;
8861 imm_use_iterator imm_iter;
8862 def_operand_p def_p;
8863 gimple *ustmt;
8864
8865 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8866 {
8867 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8868 {
8869 basic_block bb;
8870
8871 if (!is_gimple_debug (ustmt))
8872 continue;
8873
8874 bb = gimple_bb (ustmt);
8875
8876 if (!flow_bb_inside_loop_p (loop, bb))
8877 {
8878 if (gimple_debug_bind_p (ustmt))
8879 {
8880 if (dump_enabled_p ())
8881 dump_printf_loc (MSG_NOTE, vect_location,
8882 "killing debug use\n");
8883
8884 gimple_debug_bind_reset_value (ustmt);
8885 update_stmt (ustmt);
8886 }
8887 else
8888 gcc_unreachable ();
8889 }
8890 }
8891 }
8892 }
8893
8894 /* Given loop represented by LOOP_VINFO, return true if computation of
8895 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8896 otherwise. */
8897
8898 static bool
8899 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8900 {
8901 /* Constant case. */
8902 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8903 {
8904 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8905 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8906
8907 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8908 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8909 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8910 return true;
8911 }
8912
8913 widest_int max;
8914 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8915 /* Check the upper bound of loop niters. */
8916 if (get_max_loop_iterations (loop, &max))
8917 {
8918 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8919 signop sgn = TYPE_SIGN (type);
8920 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8921 if (max < type_max)
8922 return true;
8923 }
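/* Otherwise we cannot rule out that NITERSM1 equals the maximum value of
   its type, in which case NITERSM1 + 1 would wrap around to zero.  */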
8924 return false;
8925 }
8926
8927 /* Return a mask type with half the number of elements as OLD_TYPE,
8928 given that it should have mode NEW_MODE. */
8929
8930 tree
8931 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8932 {
8933 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8934 return build_truth_vector_type_for_mode (nunits, new_mode);
8935 }
8936
8937 /* Return a mask type with twice as many elements as OLD_TYPE,
8938 given that it should have mode NEW_MODE. */
8939
8940 tree
8941 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8942 {
8943 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8944 return build_truth_vector_type_for_mode (nunits, new_mode);
8945 }
8946
8947 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8948 contain a sequence of NVECTORS masks that each control a vector of type
8949 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8950 these vector masks with the vector version of SCALAR_MASK. */
8951
8952 void
8953 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8954 unsigned int nvectors, tree vectype, tree scalar_mask)
8955 {
8956 gcc_assert (nvectors != 0);
8957 if (masks->length () < nvectors)
8958 masks->safe_grow_cleared (nvectors, true);
8959 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8960 /* The number of scalars per iteration and the number of vectors are
8961 both compile-time constants. */
8962 unsigned int nscalars_per_iter
8963 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8964 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
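/* For example, an rgroup of two 8-element mask vectors with a
   vectorization factor of 8 controls 2 * 8 / 8 = 2 scalars per iteration.  */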
8965
8966 if (scalar_mask)
8967 {
8968 scalar_cond_masked_key cond (scalar_mask, nvectors);
8969 loop_vinfo->scalar_cond_masked_set.add (cond);
8970 }
8971
8972 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8973 {
8974 rgm->max_nscalars_per_iter = nscalars_per_iter;
8975 rgm->type = truth_type_for (vectype);
8976 rgm->factor = 1;
8977 }
8978 }
8979
8980 /* Given a complete set of masks MASKS, extract mask number INDEX
8981 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8982 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8983
8984 See the comment above vec_loop_masks for more details about the mask
8985 arrangement. */
8986
8987 tree
8988 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8989 unsigned int nvectors, tree vectype, unsigned int index)
8990 {
8991 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8992 tree mask_type = rgm->type;
8993
8994 /* Populate the rgroup's mask array, if this is the first time we've
8995 used it. */
8996 if (rgm->controls.is_empty ())
8997 {
8998 rgm->controls.safe_grow_cleared (nvectors, true);
8999 for (unsigned int i = 0; i < nvectors; ++i)
9000 {
9001 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9002 /* Provide a dummy definition until the real one is available. */
9003 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9004 rgm->controls[i] = mask;
9005 }
9006 }
9007
9008 tree mask = rgm->controls[index];
9009 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9010 TYPE_VECTOR_SUBPARTS (vectype)))
9011 {
9012 /* A loop mask for data type X can be reused for data type Y
9013 if X has N times more elements than Y and if Y's elements
9014 are N times bigger than X's. In this case each sequence
9015 of N elements in the loop mask will be all-zero or all-one.
9016 We can then view-convert the mask so that each sequence of
9017 N elements is replaced by a single element. */
9018 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9019 TYPE_VECTOR_SUBPARTS (vectype)));
9020 gimple_seq seq = NULL;
9021 mask_type = truth_type_for (vectype);
9022 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9023 if (seq)
9024 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9025 }
9026 return mask;
9027 }
9028
9029 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9030 lengths for controlling an operation on VECTYPE. The operation splits
9031 each element of VECTYPE into FACTOR separate subelements, measuring the
9032 length as a number of these subelements. */
9033
9034 void
9035 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9036 unsigned int nvectors, tree vectype, unsigned int factor)
9037 {
9038 gcc_assert (nvectors != 0);
9039 if (lens->length () < nvectors)
9040 lens->safe_grow_cleared (nvectors, true);
9041 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9042
9043 /* The number of scalars per iteration, the bytes occupied per scalar,
9044 and the number of vectors are all compile-time constants. */
9045 unsigned int nscalars_per_iter
9046 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9047 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
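/* E.g. two 4-element vectors with a vectorization factor of 8 give one
   scalar per iteration; a load falling back to a VnQI view of 4-byte
   elements would pass FACTOR 4, so its lengths are measured in bytes.  */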
9048
9049 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9050 {
9051 /* For now, we only support cases in which all loads and stores fall back
9052 to VnQI or none do. */
9053 gcc_assert (!rgl->max_nscalars_per_iter
9054 || (rgl->factor == 1 && factor == 1)
9055 || (rgl->max_nscalars_per_iter * rgl->factor
9056 == nscalars_per_iter * factor));
9057 rgl->max_nscalars_per_iter = nscalars_per_iter;
9058 rgl->type = vectype;
9059 rgl->factor = factor;
9060 }
9061 }
9062
9063 /* Given a complete set of length LENS, extract length number INDEX for an
9064 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9065
9066 tree
9067 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9068 unsigned int nvectors, unsigned int index)
9069 {
9070 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9071
9072 /* Populate the rgroup's len array, if this is the first time we've
9073 used it. */
9074 if (rgl->controls.is_empty ())
9075 {
9076 rgl->controls.safe_grow_cleared (nvectors, true);
9077 for (unsigned int i = 0; i < nvectors; ++i)
9078 {
9079 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9080 gcc_assert (len_type != NULL_TREE);
9081 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9082
9083 /* Provide a dummy definition until the real one is available. */
9084 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9085 rgl->controls[i] = len;
9086 }
9087 }
9088
9089 return rgl->controls[index];
9090 }
9091
9092 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
9093 according to the estimated iteration count. */
9094
9095 static void
9096 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9097 {
9098 edge preheader = loop_preheader_edge (loop);
9099 /* Reduce loop iterations by the vectorization factor. */
9100 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9101 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9102
9103 if (freq_h.nonzero_p ())
9104 {
9105 profile_probability p;
9106
9107 /* Avoid dropping loop body profile counter to 0 because of zero count
9108 in loop's preheader. */
9109 if (!(freq_e == profile_count::zero ()))
9110 freq_e = freq_e.force_nonzero ();
9111 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9112 scale_loop_frequencies (loop, p);
9113 }
9114
9115 edge exit_e = single_exit (loop);
9116 exit_e->probability = profile_probability::always ()
9117 .apply_scale (1, new_est_niter + 1);
9118
9119 edge exit_l = single_pred_edge (loop->latch);
9120 profile_probability prob = exit_l->probability;
9121 exit_l->probability = exit_e->probability.invert ();
9122 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9123 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9124 }
9125
9126 /* For a vectorized stmt DEF_STMT_INFO, fill in the latch edge values of all
9127 vectorized PHIs whose original latch value was defined by it. */
9128
9129 static void
9130 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9131 stmt_vec_info def_stmt_info)
9132 {
9133 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9134 if (!def || TREE_CODE (def) != SSA_NAME)
9135 return;
9136 stmt_vec_info phi_info;
9137 imm_use_iterator iter;
9138 use_operand_p use_p;
9139 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9140 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9141 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9142 && (phi_info = loop_vinfo->lookup_stmt (phi))
9143 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9144 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9145 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9146 {
9147 loop_p loop = gimple_bb (phi)->loop_father;
9148 edge e = loop_latch_edge (loop);
9149 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9150 {
9151 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9152 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9153 gcc_assert (phi_defs.length () == latch_defs.length ());
9154 for (unsigned i = 0; i < phi_defs.length (); ++i)
9155 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9156 gimple_get_lhs (latch_defs[i]), e,
9157 gimple_phi_arg_location (phi, e->dest_idx));
9158 }
9159 }
9160 }
9161
9162 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9163 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9164 stmt_vec_info. */
9165
9166 static bool
9167 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9168 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9169 {
9170 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9171 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9172
9173 if (dump_enabled_p ())
9174 dump_printf_loc (MSG_NOTE, vect_location,
9175 "------>vectorizing statement: %G", stmt_info->stmt);
9176
9177 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9178 vect_loop_kill_debug_uses (loop, stmt_info);
9179
9180 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9181 && !STMT_VINFO_LIVE_P (stmt_info))
9182 return false;
9183
9184 if (STMT_VINFO_VECTYPE (stmt_info))
9185 {
9186 poly_uint64 nunits
9187 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9188 if (!STMT_SLP_TYPE (stmt_info)
9189 && maybe_ne (nunits, vf)
9190 && dump_enabled_p ())
9191 /* For SLP the VF is set according to the unrolling factor, not
9192 the vector size, hence this message is not valid for SLP. */
9193 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9194 }
9195
9196 /* Pure SLP statements have already been vectorized. We still need
9197 to apply loop vectorization to hybrid SLP statements. */
9198 if (PURE_SLP_STMT (stmt_info))
9199 return false;
9200
9201 if (dump_enabled_p ())
9202 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9203
9204 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9205 *seen_store = stmt_info;
9206
9207 return true;
9208 }
9209
9210 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9211 in the hash_map with their corresponding values. */
9212
9213 static tree
9214 find_in_mapping (tree t, void *context)
9215 {
9216 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9217
9218 tree *value = mapping->get (t);
9219 return value ? *value : t;
9220 }
9221
9222 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9223 original loop that has now been vectorized.
9224
9225 The inits of the data_references need to be advanced by the number of
9226 iterations of the main loop. This has been computed in vect_do_peeling and
9227 is stored in the parameter ADVANCE. We first restore the data_references'
9228 initial offsets with the values recorded in ORIG_DRS_INIT.
9229
9230 Since the loop_vec_info of this EPILOGUE was constructed for the original
9231 loop, its stmt_vec_infos all point to the original statements. These need
9232 to be updated to point to their corresponding copies as well as the SSA_NAMES
9233 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9234
9235 The data_reference's connections also need to be updated. Their
9236 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9237 stmt_vec_infos, their statements need to point to their corresponding copy,
9238 if they are gather loads or scatter stores then their reference needs to be
9239 updated to point to its corresponding copy and finally we set
9240 'base_misaligned' to false as we have already peeled for alignment in the
9241 prologue of the main loop. */
9242
9243 static void
9244 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9245 {
9246 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9247 auto_vec<gimple *> stmt_worklist;
9248 hash_map<tree,tree> mapping;
9249 gimple *orig_stmt, *new_stmt;
9250 gimple_stmt_iterator epilogue_gsi;
9251 gphi_iterator epilogue_phi_gsi;
9252 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9253 basic_block *epilogue_bbs = get_loop_body (epilogue);
9254 unsigned i;
9255
9256 free (LOOP_VINFO_BBS (epilogue_vinfo));
9257 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9258
9259 /* Advance the data_references by the number of iterations of the previous
9260 loop and its prologue. */
9261 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9262
9263
9264 /* The EPILOGUE loop is a copy of the original loop, so they share the same
9265 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9266 point to the copied statements. We also create a mapping from all LHSs in
9267 the original loop to the LHSs in the EPILOGUE and create worklists to
9268 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9269 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9270 {
9271 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9272 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9273 {
9274 new_stmt = epilogue_phi_gsi.phi ();
9275
9276 gcc_assert (gimple_uid (new_stmt) > 0);
9277 stmt_vinfo
9278 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9279
9280 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9281 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9282
9283 mapping.put (gimple_phi_result (orig_stmt),
9284 gimple_phi_result (new_stmt));
9285 /* PHI nodes cannot have patterns or related statements. */
9286 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9287 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9288 }
9289
9290 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9291 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9292 {
9293 new_stmt = gsi_stmt (epilogue_gsi);
9294 if (is_gimple_debug (new_stmt))
9295 continue;
9296
9297 gcc_assert (gimple_uid (new_stmt) > 0);
9298 stmt_vinfo
9299 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9300
9301 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9302 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9303
9304 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9305 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9306
9307 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9308 {
9309 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9310 for (gimple_stmt_iterator gsi = gsi_start (seq);
9311 !gsi_end_p (gsi); gsi_next (&gsi))
9312 stmt_worklist.safe_push (gsi_stmt (gsi));
9313 }
9314
9315 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9316 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9317 {
9318 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9319 stmt_worklist.safe_push (stmt);
9320 /* Set BB such that the assert in
9321 'get_initial_def_for_reduction' is able to determine that
9322 the BB of the related stmt is inside this loop. */
9323 gimple_set_bb (stmt,
9324 gimple_bb (new_stmt));
9325 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9326 gcc_assert (related_vinfo == NULL
9327 || related_vinfo == stmt_vinfo);
9328 }
9329 }
9330 }
9331
9332 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9333 using the original main loop and thus need to be updated to refer to the
9334 cloned variables used in the epilogue. */
9335 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9336 {
9337 gimple *stmt = stmt_worklist[i];
9338 tree *new_op;
9339
9340 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9341 {
9342 tree op = gimple_op (stmt, j);
9343 if ((new_op = mapping.get(op)))
9344 gimple_set_op (stmt, j, *new_op);
9345 else
9346 {
9347 /* PR92429: The last argument of simplify_replace_tree disables
9348 folding when replacing arguments. This is required as
9349 otherwise you might end up with different statements than the
9350 ones analyzed in vect_loop_analyze, leading to different
9351 vectorization. */
9352 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9353 &find_in_mapping, &mapping, false);
9354 gimple_set_op (stmt, j, op);
9355 }
9356 }
9357 }
9358
9359 struct data_reference *dr;
9360 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9361 FOR_EACH_VEC_ELT (datarefs, i, dr)
9362 {
9363 orig_stmt = DR_STMT (dr);
9364 gcc_assert (gimple_uid (orig_stmt) > 0);
9365 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9366 /* Data references for gather loads and scatter stores do not use the
9367 updated offset we set using ADVANCE. Instead we have to make sure the
9368 reference in each data reference points to the corresponding copy of
9369 the original in the epilogue. */
9370 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9371 == VMAT_GATHER_SCATTER)
9372 {
9373 DR_REF (dr)
9374 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9375 &find_in_mapping, &mapping);
9376 DR_BASE_ADDRESS (dr)
9377 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9378 &find_in_mapping, &mapping);
9379 }
9380 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9381 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9382 /* The vector size of the epilogue is smaller than that of the main loop,
9383 so the alignment requirement is either the same or lower. This means
9384 the dr is by definition aligned. */
9385 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9386 }
9387
9388 epilogue_vinfo->shared->datarefs_copy.release ();
9389 epilogue_vinfo->shared->save_datarefs ();
9390 }
9391
9392 /* Function vect_transform_loop.
9393
9394 The analysis phase has determined that the loop is vectorizable.
9395 Vectorize the loop - create vectorized stmts to replace the scalar
9396 stmts in the loop, and update the loop exit condition.
9397 Returns the scalar epilogue loop, if any. */
9398
9399 class loop *
9400 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9401 {
9402 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9403 class loop *epilogue = NULL;
9404 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9405 int nbbs = loop->num_nodes;
9406 int i;
9407 tree niters_vector = NULL_TREE;
9408 tree step_vector = NULL_TREE;
9409 tree niters_vector_mult_vf = NULL_TREE;
9410 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9411 unsigned int lowest_vf = constant_lower_bound (vf);
9412 gimple *stmt;
9413 bool check_profitability = false;
9414 unsigned int th;
9415
9416 DUMP_VECT_SCOPE ("vec_transform_loop");
9417
9418 loop_vinfo->shared->check_datarefs ();
9419
9420 /* Use the more conservative vectorization threshold. If the number
9421 of iterations is constant, assume the cost check has been performed
9422 by our caller. If the threshold makes all loops profitable that
9423 run at least the (estimated) vectorization factor number of times,
9424 checking is pointless, too. */
9425 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9426 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9427 {
9428 if (dump_enabled_p ())
9429 dump_printf_loc (MSG_NOTE, vect_location,
9430 "Profitability threshold is %d loop iterations.\n",
9431 th);
9432 check_profitability = true;
9433 }
9434
9435 /* Make sure there exists a single-predecessor exit bb. Do this before
9436 versioning. */
9437 edge e = single_exit (loop);
9438 if (! single_pred_p (e->dest))
9439 {
9440 split_loop_exit_edge (e, true);
9441 if (dump_enabled_p ())
9442 dump_printf (MSG_NOTE, "split exit edge\n");
9443 }
9444
9445 /* Version the loop first, if required, so the profitability check
9446 comes first. */
9447
9448 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9449 {
9450 class loop *sloop
9451 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9452 sloop->force_vectorize = false;
9453 check_profitability = false;
9454 }
9455
9456 /* Make sure there exists a single-predecessor exit bb also on the
9457 scalar loop copy. Do this after versioning but before peeling
9458 so the CFG structure is fine for both the scalar and the if-converted
9459 loop, letting slpeel_duplicate_current_defs_from_edges see matched
9460 loop-closed PHI nodes on the exit. */
9461 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9462 {
9463 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9464 if (! single_pred_p (e->dest))
9465 {
9466 split_loop_exit_edge (e, true);
9467 if (dump_enabled_p ())
9468 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9469 }
9470 }
9471
9472 tree niters = vect_build_loop_niters (loop_vinfo);
9473 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9474 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9475 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9476 tree advance;
9477 drs_init_vec orig_drs_init;
9478
9479 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9480 &step_vector, &niters_vector_mult_vf, th,
9481 check_profitability, niters_no_overflow,
9482 &advance);
9483
9484 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9485 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9486 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9487 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9488
9489 if (niters_vector == NULL_TREE)
9490 {
9491 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9492 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9493 && known_eq (lowest_vf, vf))
9494 {
9495 niters_vector
9496 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9497 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9498 step_vector = build_one_cst (TREE_TYPE (niters));
9499 }
9500 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9501 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9502 &step_vector, niters_no_overflow);
9503 else
9504 /* vect_do_peeling subtracted the number of peeled prologue
9505 iterations from LOOP_VINFO_NITERS. */
9506 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9507 &niters_vector, &step_vector,
9508 niters_no_overflow);
9509 }
9510
9511 /* 1) Make sure the loop header has exactly two entries
9512 2) Make sure we have a preheader basic block. */
9513
9514 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9515
9516 split_edge (loop_preheader_edge (loop));
9517
9518 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9519 /* This will deal with any possible peeling. */
9520 vect_prepare_for_masked_peels (loop_vinfo);
9521
9522 /* Schedule the SLP instances first, then handle loop vectorization
9523 below. */
9524 if (!loop_vinfo->slp_instances.is_empty ())
9525 {
9526 DUMP_VECT_SCOPE ("scheduling SLP instances");
9527 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9528 }
9529
9530 /* FORNOW: the vectorizer supports only loops whose body consists
9531 of one basic block (header + empty latch). When the vectorizer
9532 supports more involved loop forms, the order in which the BBs are
9533 traversed will need to be reconsidered. */
9534
9535 for (i = 0; i < nbbs; i++)
9536 {
9537 basic_block bb = bbs[i];
9538 stmt_vec_info stmt_info;
9539
9540 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9541 gsi_next (&si))
9542 {
9543 gphi *phi = si.phi ();
9544 if (dump_enabled_p ())
9545 dump_printf_loc (MSG_NOTE, vect_location,
9546 "------>vectorizing phi: %G", phi);
9547 stmt_info = loop_vinfo->lookup_stmt (phi);
9548 if (!stmt_info)
9549 continue;
9550
9551 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9552 vect_loop_kill_debug_uses (loop, stmt_info);
9553
9554 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9555 && !STMT_VINFO_LIVE_P (stmt_info))
9556 continue;
9557
9558 if (STMT_VINFO_VECTYPE (stmt_info)
9559 && (maybe_ne
9560 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9561 && dump_enabled_p ())
9562 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9563
9564 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9565 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9566 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9567 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9568 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9569 && ! PURE_SLP_STMT (stmt_info))
9570 {
9571 if (dump_enabled_p ())
9572 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9573 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9574 }
9575 }
9576
9577 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9578 gsi_next (&si))
9579 {
9580 gphi *phi = si.phi ();
9581 stmt_info = loop_vinfo->lookup_stmt (phi);
9582 if (!stmt_info)
9583 continue;
9584
9585 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9586 && !STMT_VINFO_LIVE_P (stmt_info))
9587 continue;
9588
9589 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9590 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9591 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9592 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9593 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9594 && ! PURE_SLP_STMT (stmt_info))
9595 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9596 }
9597
9598 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9599 !gsi_end_p (si);)
9600 {
9601 stmt = gsi_stmt (si);
9602 /* During vectorization remove existing clobber stmts. */
9603 if (gimple_clobber_p (stmt))
9604 {
9605 unlink_stmt_vdef (stmt);
9606 gsi_remove (&si, true);
9607 release_defs (stmt);
9608 }
9609 else
9610 {
9611 /* Ignore vector stmts created in the outer loop. */
9612 stmt_info = loop_vinfo->lookup_stmt (stmt);
9613
9614 /* vector stmts created in the outer-loop during vectorization of
9615 stmts in an inner-loop may not have a stmt_info, and do not
9616 need to be vectorized. */
9617 stmt_vec_info seen_store = NULL;
9618 if (stmt_info)
9619 {
9620 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9621 {
9622 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9623 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9624 !gsi_end_p (subsi); gsi_next (&subsi))
9625 {
9626 stmt_vec_info pat_stmt_info
9627 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9628 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9629 &si, &seen_store);
9630 }
9631 stmt_vec_info pat_stmt_info
9632 = STMT_VINFO_RELATED_STMT (stmt_info);
9633 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9634 &si, &seen_store))
9635 maybe_set_vectorized_backedge_value (loop_vinfo,
9636 pat_stmt_info);
9637 }
9638 else
9639 {
9640 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9641 &seen_store))
9642 maybe_set_vectorized_backedge_value (loop_vinfo,
9643 stmt_info);
9644 }
9645 }
9646 gsi_next (&si);
9647 if (seen_store)
9648 {
9649 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9650 /* Interleaving. The vectorization of the
9651 interleaving chain was completed - free
9652 all the stores in the chain. */
9653 vect_remove_stores (loop_vinfo,
9654 DR_GROUP_FIRST_ELEMENT (seen_store));
9655 else
9656 /* Free the attached stmt_vec_info and remove the stmt. */
9657 loop_vinfo->remove_stmt (stmt_info);
9658 }
9659 }
9660 }
9661
9662 /* Stub out scalar statements that must not survive vectorization.
9663 Doing this here helps with grouped statements, or statements that
9664 are involved in patterns. */
9665 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9666 !gsi_end_p (gsi); gsi_next (&gsi))
9667 {
9668 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9669 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9670 {
9671 tree lhs = gimple_get_lhs (call);
9672 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9673 {
9674 tree zero = build_zero_cst (TREE_TYPE (lhs));
9675 gimple *new_stmt = gimple_build_assign (lhs, zero);
9676 gsi_replace (&gsi, new_stmt, true);
9677 }
9678 }
9679 }
9680 } /* BBs in loop */
9681
9682 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9683 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9684 if (integer_onep (step_vector))
9685 niters_no_overflow = true;
9686 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9687 niters_vector_mult_vf, !niters_no_overflow);
9688
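 /* Each vector iteration now stands for ASSUMED_VF scalar iterations,
 so scale the loop body's profile accordingly. */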
9689 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9690 scale_profile_for_vect_loop (loop, assumed_vf);
9691
9692 /* True if the final iteration might not handle a full vector's
9693 worth of scalar iterations. */
9694 bool final_iter_may_be_partial
9695 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9696 /* The minimum number of iterations performed by the epilogue. This
9697 is 1 when peeling for gaps because we always need a final scalar
9698 iteration. */
9699 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9700 /* +1 to convert latch counts to loop iteration counts,
9701 -min_epilogue_iters to remove iterations that cannot be performed
9702 by the vector code. */
9703 int bias_for_lowest = 1 - min_epilogue_iters;
9704 int bias_for_assumed = bias_for_lowest;
9705 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9706 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9707 {
9708 /* When the amount of peeling is known at compile time, the first
9709 iteration will have exactly alignment_npeels active elements.
9710 In the worst case it will have at least one. */
9711 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9712 bias_for_lowest += lowest_vf - min_first_active;
9713 bias_for_assumed += assumed_vf - min_first_active;
9714 }
9715 /* In these calculations the "- 1" converts loop iteration counts
9716 back to latch counts. */
9717 if (loop->any_upper_bound)
9718 loop->nb_iterations_upper_bound
9719 = (final_iter_may_be_partial
9720 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9721 lowest_vf) - 1
9722 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9723 lowest_vf) - 1);
9724 if (loop->any_likely_upper_bound)
9725 loop->nb_iterations_likely_upper_bound
9726 = (final_iter_may_be_partial
9727 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9728 + bias_for_lowest, lowest_vf) - 1
9729 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9730 + bias_for_lowest, lowest_vf) - 1);
9731 if (loop->any_estimate)
9732 loop->nb_iterations_estimate
9733 = (final_iter_may_be_partial
9734 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9735 assumed_vf) - 1
9736 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9737 assumed_vf) - 1);
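 /* For example, with no peeling for gaps and no partial vectors,
 BIAS_FOR_LOWEST is 1; a scalar latch-count bound of 10 (at most 11
 iterations) and a LOWEST_VF of 4 then give (10 + 1) / 4 - 1 = 1,
 i.e. at most two vector iterations, with the remaining scalar
 iterations left to the epilogue. */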
9738
9739 if (dump_enabled_p ())
9740 {
9741 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9742 {
9743 dump_printf_loc (MSG_NOTE, vect_location,
9744 "LOOP VECTORIZED\n");
9745 if (loop->inner)
9746 dump_printf_loc (MSG_NOTE, vect_location,
9747 "OUTER LOOP VECTORIZED\n");
9748 dump_printf (MSG_NOTE, "\n");
9749 }
9750 else
9751 dump_printf_loc (MSG_NOTE, vect_location,
9752 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9753 GET_MODE_NAME (loop_vinfo->vector_mode));
9754 }
9755
9756 /* Loops vectorized with a variable factor won't benefit from
9757 unrolling/peeling. */
9758 if (!vf.is_constant ())
9759 {
9760 loop->unroll = 1;
9761 if (dump_enabled_p ())
9762 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9763 " variable-length vectorization factor\n");
9764 }
9765 /* Free SLP instances here because otherwise stmt reference counting
9766 won't work. */
9767 slp_instance instance;
9768 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9769 vect_free_slp_instance (instance);
9770 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9771 /* Clear the safelen field since its value is invalid after vectorization:
9772 the vectorized loop can have loop-carried dependencies. */
9773 loop->safelen = 0;
9774
9775 if (epilogue)
9776 {
9777 update_epilogue_loop_vinfo (epilogue, advance);
9778
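 /* Give the epilogue a chance to be vectorized in turn, inheriting
 the user-facing settings of the main loop. */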
9779 epilogue->simduid = loop->simduid;
9780 epilogue->force_vectorize = loop->force_vectorize;
9781 epilogue->dont_vectorize = false;
9782 }
9783
9784 return epilogue;
9785 }
9786
9787 /* The code below performs a simple optimization - it reverts
9788 if-conversion for masked stores: if the mask of a store is zero, the
9789 store and, where possible, the producers of the stored value are skipped.
9790 For example,
9791 for (i=0; i<n; i++)
9792 if (c[i])
9793 {
9794 p1[i] += 1;
9795 p2[i] = p3[i] + 2;
9796 }
9797 this transformation will produce the following semi-hammock:
9798
9799 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9800 {
9801 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9802 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9803 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9804 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9805 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9806 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9807 }
9808 */
9809
9810 void
9811 optimize_mask_stores (class loop *loop)
9812 {
9813 basic_block *bbs = get_loop_body (loop);
9814 unsigned nbbs = loop->num_nodes;
9815 unsigned i;
9816 basic_block bb;
9817 class loop *bb_loop;
9818 gimple_stmt_iterator gsi;
9819 gimple *stmt;
9820 auto_vec<gimple *> worklist;
9821 auto_purge_vect_location sentinel;
9822
9823 vect_location = find_loop_location (loop);
9824 /* Pick up all masked stores in the loop, if any. */
9825 for (i = 0; i < nbbs; i++)
9826 {
9827 bb = bbs[i];
9828 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9829 gsi_next (&gsi))
9830 {
9831 stmt = gsi_stmt (gsi);
9832 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9833 worklist.safe_push (stmt);
9834 }
9835 }
9836
9837 free (bbs);
9838 if (worklist.is_empty ())
9839 return;
9840
9841 /* Loop has masked stores. */
9842 while (!worklist.is_empty ())
9843 {
9844 gimple *last, *last_store;
9845 edge e, efalse;
9846 tree mask;
9847 basic_block store_bb, join_bb;
9848 gimple_stmt_iterator gsi_to;
9849 tree vdef, new_vdef;
9850 gphi *phi;
9851 tree vectype;
9852 tree zero;
9853
9854 last = worklist.pop ();
9855 mask = gimple_call_arg (last, 2);
9856 bb = gimple_bb (last);
9857 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9858 to the same loop as if_bb. That loop can differ from LOOP when a
9859 two-level loop nest is vectorized and the mask_store belongs to the
9860 inner loop. */
9861 e = split_block (bb, last);
9862 bb_loop = bb->loop_father;
9863 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9864 join_bb = e->dest;
9865 store_bb = create_empty_bb (bb);
9866 add_bb_to_loop (store_bb, bb_loop);
9867 e->flags = EDGE_TRUE_VALUE;
9868 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9869 /* Put STORE_BB on the unlikely path. */
9870 efalse->probability = profile_probability::unlikely ();
9871 store_bb->count = efalse->count ();
9872 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9873 if (dom_info_available_p (CDI_DOMINATORS))
9874 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9875 if (dump_enabled_p ())
9876 dump_printf_loc (MSG_NOTE, vect_location,
9877 "Create new block %d to sink mask stores.",
9878 store_bb->index);
9879 /* Create vector comparison with boolean result. */
9880 vectype = TREE_TYPE (mask);
9881 zero = build_zero_cst (vectype);
9882 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9883 gsi = gsi_last_bb (bb);
9884 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
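 /* The true edge E (MASK all zeros) bypasses STORE_BB and goes
 straight to JOIN_BB; the false edge EFALSE enters STORE_BB. */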
9885 /* Create a new PHI node for the vdef of the last masked store:
9886 .MEM_2 = VDEF <.MEM_1>
9887 will be converted to
9888 .MEM_3 = VDEF <.MEM_1>
9889 and a new PHI node will be created in the join bb:
9890 .MEM_2 = PHI <.MEM_1, .MEM_3>
9891 */
9892 vdef = gimple_vdef (last);
9893 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9894 gimple_set_vdef (last, new_vdef);
9895 phi = create_phi_node (vdef, join_bb);
9896 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9897
9898 /* Put all masked stores with the same mask to STORE_BB if possible. */
9899 while (true)
9900 {
9901 gimple_stmt_iterator gsi_from;
9902 gimple *stmt1 = NULL;
9903
9904 /* Move masked store to STORE_BB. */
9905 last_store = last;
9906 gsi = gsi_for_stmt (last);
9907 gsi_from = gsi;
9908 /* Shift GSI to the previous stmt for further traversal. */
9909 gsi_prev (&gsi);
9910 gsi_to = gsi_start_bb (store_bb);
9911 gsi_move_before (&gsi_from, &gsi_to);
9912 /* Set GSI_TO to the start of the (now non-empty) STORE_BB. */
9913 gsi_to = gsi_start_bb (store_bb);
9914 if (dump_enabled_p ())
9915 dump_printf_loc (MSG_NOTE, vect_location,
9916 "Move stmt to created bb\n%G", last);
9917 /* Move all stored value producers if possible. */
9918 while (!gsi_end_p (gsi))
9919 {
9920 tree lhs;
9921 imm_use_iterator imm_iter;
9922 use_operand_p use_p;
9923 bool res;
9924
9925 /* Skip debug statements. */
9926 if (is_gimple_debug (gsi_stmt (gsi)))
9927 {
9928 gsi_prev (&gsi);
9929 continue;
9930 }
9931 stmt1 = gsi_stmt (gsi);
9932 /* Do not consider statements writing to memory or having a
9933 volatile operand. */
9934 if (gimple_vdef (stmt1)
9935 || gimple_has_volatile_ops (stmt1))
9936 break;
9937 gsi_from = gsi;
9938 gsi_prev (&gsi);
9939 lhs = gimple_get_lhs (stmt1);
9940 if (!lhs)
9941 break;
9942
9943 /* LHS of vectorized stmt must be SSA_NAME. */
9944 if (TREE_CODE (lhs) != SSA_NAME)
9945 break;
9946
9947 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9948 {
9949 /* Remove dead scalar statement. */
9950 if (has_zero_uses (lhs))
9951 {
9952 gsi_remove (&gsi_from, true);
9953 continue;
9954 }
9955 }
9956
9957 /* Check that LHS does not have uses outside of STORE_BB. */
9958 res = true;
9959 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9960 {
9961 gimple *use_stmt;
9962 use_stmt = USE_STMT (use_p);
9963 if (is_gimple_debug (use_stmt))
9964 continue;
9965 if (gimple_bb (use_stmt) != store_bb)
9966 {
9967 res = false;
9968 break;
9969 }
9970 }
9971 if (!res)
9972 break;
9973
9974 if (gimple_vuse (stmt1)
9975 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9976 break;
9977
9978 /* Can move STMT1 to STORE_BB. */
9979 if (dump_enabled_p ())
9980 dump_printf_loc (MSG_NOTE, vect_location,
9981 "Move stmt to created bb\n%G", stmt1);
9982 gsi_move_before (&gsi_from, &gsi_to);
9983 /* Shift GSI_TO for further insertion. */
9984 gsi_prev (&gsi_to);
9985 }
9986 /* Put other masked stores with the same mask to STORE_BB. */
9987 if (worklist.is_empty ()
9988 || gimple_call_arg (worklist.last (), 2) != mask
9989 || worklist.last () != stmt1)
9990 break;
9991 last = worklist.pop ();
9992 }
9993 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9994 }
9995 }
9996
9997 /* Decide whether it is possible to use a zero-based induction variable
9998 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9999 the value that the induction variable must be able to hold in order
10000 to ensure that the rgroups eventually have no active vector elements.
10001 Return -1 otherwise. */
10002
10003 widest_int
10004 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10005 {
10006 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10007 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10008 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10009
10010 /* Calculate the value that the induction variable must be able
10011 to hit in order to ensure that we end the loop with an all-false mask.
10012 This involves adding the maximum number of inactive trailing scalar
10013 iterations. */
10014 widest_int iv_limit = -1;
10015 if (max_loop_iterations (loop, &iv_limit))
10016 {
10017 if (niters_skip)
10018 {
10019 /* Add the maximum number of skipped iterations to the
10020 maximum iteration count. */
10021 if (TREE_CODE (niters_skip) == INTEGER_CST)
10022 iv_limit += wi::to_widest (niters_skip);
10023 else
10024 iv_limit += max_vf - 1;
10025 }
10026 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10027 /* Make a conservatively-correct assumption. */
10028 iv_limit += max_vf - 1;
10029
10030 /* IV_LIMIT is the maximum number of latch iterations, which is also
10031 the maximum in-range IV value. Round this value down to the previous
10032 vector alignment boundary and then add an extra full iteration. */
10033 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10034 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10035 }
10036 return iv_limit;
10037 }
10038
10039 /* For the given rgroup_controls RGC, check whether an induction variable
10040 would ever hit a value that produces a set of all-false masks or zero
10041 lengths before wrapping around. Return true if it's possible to wrap
10042 around before hitting the desirable value, otherwise return false. */
10043
10044 bool
10045 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10046 {
10047 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10048
10049 if (iv_limit == -1)
10050 return true;
10051
10052 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10053 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10054 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10055
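 /* The IV advances by NITEMS scalar items per iteration, so its final
 value can be as large as IV_LIMIT * NITEMS; if representing that
 value needs more bits than COMPARE_TYPE provides, the IV might wrap
 before reaching an all-false mask or zero length. */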
10056 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10057 return true;
10058
10059 return false;
10060 }