gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it had been manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different vector sizes will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
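/* As an illustration of the target-support check described above (a
   sketch only, not part of the pass itself), asking whether V8HImode
   addition is supported amounts to:

     enum insn_code icode = optab_handler (add_optab, V8HImode);
     if (icode == CODE_FOR_nothing)
       ;  /* No target support: the stmt cannot be vectorized.  */

   optab_handler and CODE_FOR_nothing are the interfaces the comment
   above refers to.  */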
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case in which a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4byte elements,
258 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
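/* A small worked example (illustrative only): with 2-byte short
   elements and a 16-byte vector register, each vector holds
   16 / 2 == 8 elements, so the vectorization factor is 8 and the
   strip-mined loop above steps by 8.  */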
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
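/* For example (illustrative only), for the induction variable in

     for (i = 0; i < n; i++)
       ...

   scev describes the evolution of i as the chrec {0, +, 1}_loop, so
   *INIT is 0 and *STEP is 1.  A step that varies inside the loop, or a
   chrec of degree >= 2 such as {0, +, {0, +, 1}_loop}_loop, is rejected
   as not "simple".  */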
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner loop, if one exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such an inner loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
670 {
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
673 {
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If not all stmts in the chain are patterns, or if we failed
680 to update STMT_VINFO_REDUC_IDX, try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
684 {
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
688 }
689 }
690 }
691
692 /* Function vect_get_loop_niters.
693
694 Determine how many iterations the loop is executed and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
698
699 Return the loop exit condition. */
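/* For example (illustrative only), for

     for (i = 0; i < n; i++)
       a[i] = 0;

   with n known to be non-zero, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS (the number
   of times the loop header executes) is n.  */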
700
701
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
705 {
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
710
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
715
716 if (!exit)
717 return cond;
718
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
723
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
727
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
730
731 if (may_be_zero)
732 {
733 if (COMPARISON_CLASS_P (may_be_zero))
734 {
735 /* Try to combine may_be_zero with assumptions, this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
747
748 may_be_zero = NULL_TREE;
749 }
750 else if (integer_nonzerop (may_be_zero))
751 {
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
755 }
756 else
757 return cond;
758 }
759
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
762
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
771
772 return cond;
773 }
774
775 /* Function bb_in_loop_p
776
777 Used as predicate for dfs order traversal of the loop bbs. */
778
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
781 {
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
786 }
787
788
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
791
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 rgroup_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_use_partial_vectors_p (true),
818 using_partial_vectors_p (false),
819 epil_using_partial_vectors_p (false),
820 peeling_for_gaps (false),
821 peeling_for_niter (false),
822 no_data_dependencies (false),
823 has_mask_store (false),
824 scalar_loop_scaling (profile_probability::uninitialized ()),
825 scalar_loop (NULL),
826 orig_loop_info (NULL)
827 {
828 /* CHECKME: We want to visit all BBs before their successors (except for
829 latch blocks, for which this assertion wouldn't hold). In the simple
830 case of the loop forms we allow, a dfs order of the BBs would be the same
831 as reversed postorder traversal, so we are safe. */
832
833 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
834 bbs, loop->num_nodes, loop);
835 gcc_assert (nbbs == loop->num_nodes);
836
837 for (unsigned int i = 0; i < nbbs; i++)
838 {
839 basic_block bb = bbs[i];
840 gimple_stmt_iterator si;
841
842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
843 {
844 gimple *phi = gsi_stmt (si);
845 gimple_set_uid (phi, 0);
846 add_stmt (phi);
847 }
848
849 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
850 {
851 gimple *stmt = gsi_stmt (si);
852 gimple_set_uid (stmt, 0);
853 if (is_gimple_debug (stmt))
854 continue;
855 add_stmt (stmt);
856 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
857 third argument is the #pragma omp simd if (x) condition: when it is 0,
858 the loop shouldn't be vectorized; when it is a non-zero constant, it
859 should be vectorized normally; otherwise the loop is versioned, with
860 the vectorized version used if the condition is non-zero at runtime. */
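/* For example (illustrative only), a loop written as
     #pragma omp simd if (x)
     for (i = 0; i < n; i++)
       ...
   reaches this point with a three-argument .GOMP_SIMD_LANE call whose
   third argument holds the "if" condition, either as a constant or as
   an SSA name, which is what the check below looks for.  */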
861 if (loop_in->simduid
862 && is_gimple_call (stmt)
863 && gimple_call_internal_p (stmt)
864 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
865 && gimple_call_num_args (stmt) >= 3
866 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
867 && (loop_in->simduid
868 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
869 {
870 tree arg = gimple_call_arg (stmt, 2);
871 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
872 simd_if_cond = arg;
873 else
874 gcc_assert (integer_nonzerop (arg));
875 }
876 }
877 }
878
879 epilogue_vinfos.create (6);
880 }
881
882 /* Free all levels of rgroup CONTROLS. */
883
884 void
885 release_vec_loop_controls (vec<rgroup_controls> *controls)
886 {
887 rgroup_controls *rgc;
888 unsigned int i;
889 FOR_EACH_VEC_ELT (*controls, i, rgc)
890 rgc->controls.release ();
891 controls->release ();
892 }
893
894 /* Free all memory used by the _loop_vec_info, as well as all the
895 stmt_vec_info structs of all the stmts in the loop. */
896
897 _loop_vec_info::~_loop_vec_info ()
898 {
899 free (bbs);
900
901 release_vec_loop_controls (&masks);
902 release_vec_loop_controls (&lens);
903 delete ivexpr_map;
904 delete scan_map;
905 epilogue_vinfos.release ();
906
907 loop->aux = NULL;
908 }
909
910 /* Return an invariant or register for EXPR and emit necessary
911 computations in the LOOP_VINFO loop preheader. */
912
913 tree
914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
915 {
916 if (is_gimple_reg (expr)
917 || is_gimple_min_invariant (expr))
918 return expr;
919
920 if (! loop_vinfo->ivexpr_map)
921 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
922 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
923 if (! cached)
924 {
925 gimple_seq stmts = NULL;
926 cached = force_gimple_operand (unshare_expr (expr),
927 &stmts, true, NULL_TREE);
928 if (stmts)
929 {
930 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
931 gsi_insert_seq_on_edge_immediate (e, stmts);
932 }
933 }
934 return cached;
935 }
936
937 /* Return true if we can use CMP_TYPE as the comparison type to produce
938 all masks required to mask LOOP_VINFO. */
939
940 static bool
941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
942 {
943 rgroup_controls *rgm;
944 unsigned int i;
945 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
946 if (rgm->type != NULL_TREE
947 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
948 cmp_type, rgm->type,
949 OPTIMIZE_FOR_SPEED))
950 return false;
951 return true;
952 }
953
954 /* Calculate the maximum number of scalars per iteration for every
955 rgroup in LOOP_VINFO. */
956
957 static unsigned int
958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
959 {
960 unsigned int res = 1;
961 unsigned int i;
962 rgroup_controls *rgm;
963 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
964 res = MAX (res, rgm->max_nscalars_per_iter);
965 return res;
966 }
967
968 /* Calculate the minimum precision necessary to represent:
969
970 MAX_NITERS * FACTOR
971
972 as an unsigned integer, where MAX_NITERS is the maximum number of
973 loop header iterations for the original scalar form of LOOP_VINFO. */
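/* A small worked example (illustrative only): if the loop is known to
   run at most 1000 iterations and FACTOR is 2, the product is at most
   2000, which needs min_precision (2000, UNSIGNED) == 11 bits, since
   2^10 == 1024 < 2000 <= 2^11 == 2048.  */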
974
975 static unsigned
976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
977 {
978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
979
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
984
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
989
990 /* Work out how many bits we need to represent the limit. */
991 return wi::min_precision (max_ni * factor, UNSIGNED);
992 }
993
994 /* Each statement in LOOP_VINFO can be masked where necessary. Check
995 whether we can actually generate the masks required. Return true if so,
996 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
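/* Conceptually (an illustrative sketch, not the generated IL), a
   fully-masked loop with vectorization factor 4 executes:

     for (i = 0; i < n; i += 4)
       {
         mask = WHILE_ULT (i, n, ...);   /* Lane j active iff i + j < n.  */
         ... loads, computation and stores predicated on MASK ...
       }

   so the final, partial iteration needs no scalar epilogue.  The checks
   below make sure the target's WHILE_ULT supports the comparison type
   we would use for the scalar IV i.  */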
997
998 static bool
999 vect_verify_full_masking (loop_vec_info loop_vinfo)
1000 {
1001 unsigned int min_ni_width;
1002 unsigned int max_nscalars_per_iter
1003 = vect_get_max_nscalars_per_iter (loop_vinfo);
1004
1005 /* Use a normal loop if there are no statements that need masking.
1006 This only happens in rare degenerate cases: it means that the loop
1007 has no loads, no stores, and no live-out values. */
1008 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1009 return false;
1010
1011 /* Work out how many bits we need to represent the limit. */
1012 min_ni_width
1013 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1014
1015 /* Find a scalar mode for which WHILE_ULT is supported. */
1016 opt_scalar_int_mode cmp_mode_iter;
1017 tree cmp_type = NULL_TREE;
1018 tree iv_type = NULL_TREE;
1019 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1020 unsigned int iv_precision = UINT_MAX;
1021
1022 if (iv_limit != -1)
1023 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1024 UNSIGNED);
1025
1026 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1027 {
1028 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1029 if (cmp_bits >= min_ni_width
1030 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1031 {
1032 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1033 if (this_type
1034 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1035 {
1036 /* Although we could stop as soon as we find a valid mode,
1037 there are at least two reasons why that's not always the
1038 best choice:
1039
1040 - An IV that's Pmode or wider is more likely to be reusable
1041 in address calculations than an IV that's narrower than
1042 Pmode.
1043
1044 - Doing the comparison in IV_PRECISION or wider allows
1045 a natural 0-based IV, whereas using a narrower comparison
1046 type requires mitigations against wrap-around.
1047
1048 Conversely, if the IV limit is variable, doing the comparison
1049 in a wider type than the original type can introduce
1050 unnecessary extensions, so picking the widest valid mode
1051 is not always a good choice either.
1052
1053 Here we prefer the first IV type that's Pmode or wider,
1054 and the first comparison type that's IV_PRECISION or wider.
1055 (The comparison type must be no wider than the IV type,
1056 to avoid extensions in the vector loop.)
1057
1058 ??? We might want to try continuing beyond Pmode for ILP32
1059 targets if CMP_BITS < IV_PRECISION. */
1060 iv_type = this_type;
1061 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1065 }
1066 }
1067 }
1068
1069 if (!cmp_type)
1070 return false;
1071
1072 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1074 return true;
1075 }
1076
1077 /* Check whether we can use vector accesses with length, based on a
1078 precision comparison. So far, to keep it simple, we only allow the
1079 case in which the precision of the target-supported length is larger
1080 than the precision required by the loop niters. */
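/* Conceptually (an illustrative sketch, not the generated IL), a loop
   using length-based partial vectors with vectorization factor 4
   executes:

     for (i = 0; i < n; i += 4)
       {
         len = MIN (n - i, 4);
         ... loads, computation and stores limited to the first LEN
             elements ...
       }

   so, as with masking, no scalar epilogue is needed for the final,
   partial iteration.  */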
1081
1082 static bool
1083 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1084 {
1085 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1086 return false;
1087
1088 unsigned int max_nitems_per_iter = 1;
1089 unsigned int i;
1090 rgroup_controls *rgl;
1091 /* Find the maximum number of items per iteration for every rgroup. */
1092 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1093 {
1094 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1095 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1096 }
1097
1098 /* Work out how many bits we need to represent the length limit. */
1099 unsigned int min_ni_prec
1100 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1101
1102 /* Now use the maximum of the precisions below for one suitable IV type:
1103 - the IV's natural precision
1104 - the precision needed to hold: the maximum number of scalar
1105 iterations multiplied by the scale factor (min_ni_prec above)
1106 - the Pmode precision
1107
1108 If min_ni_prec is less than the precision of the current niters,
1109 we prefer to still use the niters type. Prefer to use Pmode or a
1110 wider IV to avoid narrow conversions. */
1111
1112 unsigned int ni_prec
1113 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1114 min_ni_prec = MAX (min_ni_prec, ni_prec);
1115 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1116
1117 tree iv_type = NULL_TREE;
1118 opt_scalar_int_mode tmode_iter;
1119 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1120 {
1121 scalar_mode tmode = tmode_iter.require ();
1122 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1123
1124 /* ??? Do we really want to construct one IV whose precision exceeds
1125 BITS_PER_WORD? */
1126 if (tbits > BITS_PER_WORD)
1127 break;
1128
1129 /* Find the first available standard integral type. */
1130 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1131 {
1132 iv_type = build_nonstandard_integer_type (tbits, true);
1133 break;
1134 }
1135 }
1136
1137 if (!iv_type)
1138 {
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "can't vectorize with length-based partial vectors"
1142 " because there is no suitable iv type.\n");
1143 return false;
1144 }
1145
1146 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1147 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1148
1149 return true;
1150 }
1151
1152 /* Calculate the cost of one scalar iteration of the loop. */
1153 static void
1154 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1155 {
1156 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1157 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1158 int nbbs = loop->num_nodes, factor;
1159 int innerloop_iters, i;
1160
1161 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1162
1163 /* Gather costs for statements in the scalar loop. */
1164
1165 /* FORNOW. */
1166 innerloop_iters = 1;
1167 if (loop->inner)
1168 innerloop_iters = 50; /* FIXME */
1169
1170 for (i = 0; i < nbbs; i++)
1171 {
1172 gimple_stmt_iterator si;
1173 basic_block bb = bbs[i];
1174
1175 if (bb->loop_father == loop->inner)
1176 factor = innerloop_iters;
1177 else
1178 factor = 1;
1179
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1181 {
1182 gimple *stmt = gsi_stmt (si);
1183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1184
1185 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1186 continue;
1187
1188 /* Skip stmts that are not vectorized inside the loop. */
1189 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1190 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1191 && (!STMT_VINFO_LIVE_P (vstmt_info)
1192 || !VECTORIZABLE_CYCLE_DEF
1193 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1194 continue;
1195
1196 vect_cost_for_stmt kind;
1197 if (STMT_VINFO_DATA_REF (stmt_info))
1198 {
1199 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1200 kind = scalar_load;
1201 else
1202 kind = scalar_store;
1203 }
1204 else if (vect_nop_conversion_p (stmt_info))
1205 continue;
1206 else
1207 kind = scalar_stmt;
1208
1209 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1210 factor, kind, stmt_info, 0, vect_prologue);
1211 }
1212 }
1213
1214 /* Now accumulate cost. */
1215 void *target_cost_data = init_cost (loop);
1216 stmt_info_for_cost *si;
1217 int j;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1219 j, si)
1220 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1221 si->kind, si->stmt_info, si->vectype,
1222 si->misalign, vect_body);
1223 unsigned dummy, body_cost = 0;
1224 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1225 destroy_cost_data (target_cost_data);
1226 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1227 }
1228
1229
1230 /* Function vect_analyze_loop_form_1.
1231
1232 Verify that certain CFG restrictions hold, including:
1233 - the loop has a pre-header
1234 - the loop has a single entry and exit
1235 - the loop exit condition is simple enough
1236 - the number of iterations can be analyzed, i.e., a countable loop. The
1237 niter could be analyzed under some assumptions. */
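/* For example (illustrative only), this inner-most loop satisfies the
   restrictions above:

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   whereas a search loop with an early exit does not, because it has
   more than one exit edge:

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;
   */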
1238
1239 opt_result
1240 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1241 tree *assumptions, tree *number_of_iterationsm1,
1242 tree *number_of_iterations, gcond **inner_loop_cond)
1243 {
1244 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1245
1246 /* Different restrictions apply when we are considering an inner-most loop,
1247 vs. an outer (nested) loop.
1248 (FORNOW. May want to relax some of these restrictions in the future). */
1249
1250 if (!loop->inner)
1251 {
1252 /* Inner-most loop. We currently require that the number of BBs is
1253 exactly 2 (the header and latch). Vectorizable inner-most loops
1254 look like this:
1255
1256 (pre-header)
1257 |
1258 header <--------+
1259 | | |
1260 | +--> latch --+
1261 |
1262 (exit-bb) */
1263
1264 if (loop->num_nodes != 2)
1265 return opt_result::failure_at (vect_location,
1266 "not vectorized:"
1267 " control flow in loop.\n");
1268
1269 if (empty_block_p (loop->header))
1270 return opt_result::failure_at (vect_location,
1271 "not vectorized: empty loop.\n");
1272 }
1273 else
1274 {
1275 class loop *innerloop = loop->inner;
1276 edge entryedge;
1277
1278 /* Nested loop. We currently require that the loop is doubly-nested,
1279 contains a single inner loop, and the number of BBs is exactly 5.
1280 Vectorizable outer-loops look like this:
1281
1282 (pre-header)
1283 |
1284 header <---+
1285 | |
1286 inner-loop |
1287 | |
1288 tail ------+
1289 |
1290 (exit-bb)
1291
1292 The inner-loop has the properties expected of inner-most loops
1293 as described above. */
1294
1295 if ((loop->inner)->inner || (loop->inner)->next)
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized:"
1298 " multiple nested loops.\n");
1299
1300 if (loop->num_nodes != 5)
1301 return opt_result::failure_at (vect_location,
1302 "not vectorized:"
1303 " control flow in loop.\n");
1304
1305 entryedge = loop_preheader_edge (innerloop);
1306 if (entryedge->src != loop->header
1307 || !single_exit (innerloop)
1308 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1309 return opt_result::failure_at (vect_location,
1310 "not vectorized:"
1311 " unsupported outerloop form.\n");
1312
1313 /* Analyze the inner-loop. */
1314 tree inner_niterm1, inner_niter, inner_assumptions;
1315 opt_result res
1316 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1317 &inner_assumptions, &inner_niterm1,
1318 &inner_niter, NULL);
1319 if (!res)
1320 {
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: Bad inner loop.\n");
1324 return res;
1325 }
1326
1327 /* Don't support analyzing niter under assumptions for inner
1328 loop. */
1329 if (!integer_onep (inner_assumptions))
1330 return opt_result::failure_at (vect_location,
1331 "not vectorized: Bad inner loop.\n");
1332
1333 if (!expr_invariant_in_loop_p (loop, inner_niter))
1334 return opt_result::failure_at (vect_location,
1335 "not vectorized: inner-loop count not"
1336 " invariant.\n");
1337
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_NOTE, vect_location,
1340 "Considering outer-loop vectorization.\n");
1341 }
1342
1343 if (!single_exit (loop))
1344 return opt_result::failure_at (vect_location,
1345 "not vectorized: multiple exits.\n");
1346 if (EDGE_COUNT (loop->header->preds) != 2)
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized:"
1349 " too many incoming edges.\n");
1350
1351 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1352 that the loop is represented as a do-while (with a proper if-guard
1353 before the loop if needed), where the loop header contains all the
1354 executable statements, and the latch is empty. */
1355 if (!empty_block_p (loop->latch)
1356 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1357 return opt_result::failure_at (vect_location,
1358 "not vectorized: latch block not empty.\n");
1359
1360 /* Make sure the exit is not abnormal. */
1361 edge e = single_exit (loop);
1362 if (e->flags & EDGE_ABNORMAL)
1363 return opt_result::failure_at (vect_location,
1364 "not vectorized:"
1365 " abnormal loop exit edge.\n");
1366
1367 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1368 number_of_iterationsm1);
1369 if (!*loop_cond)
1370 return opt_result::failure_at
1371 (vect_location,
1372 "not vectorized: complicated exit condition.\n");
1373
1374 if (integer_zerop (*assumptions)
1375 || !*number_of_iterations
1376 || chrec_contains_undetermined (*number_of_iterations))
1377 return opt_result::failure_at
1378 (*loop_cond,
1379 "not vectorized: number of iterations cannot be computed.\n");
1380
1381 if (integer_zerop (*number_of_iterations))
1382 return opt_result::failure_at
1383 (*loop_cond,
1384 "not vectorized: number of iterations = 0.\n");
1385
1386 return opt_result::success ();
1387 }
1388
1389 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1390
1391 opt_loop_vec_info
1392 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1393 {
1394 tree assumptions, number_of_iterations, number_of_iterationsm1;
1395 gcond *loop_cond, *inner_loop_cond = NULL;
1396
1397 opt_result res
1398 = vect_analyze_loop_form_1 (loop, &loop_cond,
1399 &assumptions, &number_of_iterationsm1,
1400 &number_of_iterations, &inner_loop_cond);
1401 if (!res)
1402 return opt_loop_vec_info::propagate_failure (res);
1403
1404 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1405 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1406 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1407 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1408 if (!integer_onep (assumptions))
1409 {
1410 /* We consider vectorizing this loop by versioning it under
1411 some assumptions. In order to do this, we need to clear
1412 existing information computed by scev and niter analyzer. */
1413 scev_reset_htab ();
1414 free_numbers_of_iterations_estimates (loop);
1415 /* Also set flag for this loop so that following scev and niter
1416 analysis are done under the assumptions. */
1417 loop_constraint_set (loop, LOOP_C_FINITE);
1418 /* Also record the assumptions for versioning. */
1419 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1420 }
1421
1422 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1423 {
1424 if (dump_enabled_p ())
1425 {
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Symbolic number of iterations is ");
1428 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1429 dump_printf (MSG_NOTE, "\n");
1430 }
1431 }
1432
1433 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1434 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1435 if (inner_loop_cond)
1436 {
1437 stmt_vec_info inner_loop_cond_info
1438 = loop_vinfo->lookup_stmt (inner_loop_cond);
1439 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1440 }
1441
1442 gcc_assert (!loop->aux);
1443 loop->aux = loop_vinfo;
1444 return opt_loop_vec_info::success (loop_vinfo);
1445 }
1446
1447
1448
1449 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1450 statements, update the vectorization factor. */
1451
1452 static void
1453 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1454 {
1455 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1456 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1457 int nbbs = loop->num_nodes;
1458 poly_uint64 vectorization_factor;
1459 int i;
1460
1461 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1462
1463 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1464 gcc_assert (known_ne (vectorization_factor, 0U));
1465
1466 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1467 the vectorization factor of the loop is the unrolling factor required
1468 by the SLP instances. If that unrolling factor is 1, we say that we
1469 perform pure SLP on the loop - cross-iteration parallelism is not
1470 exploited. */
1471 bool only_slp_in_loop = true;
1472 for (i = 0; i < nbbs; i++)
1473 {
1474 basic_block bb = bbs[i];
1475 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1476 gsi_next (&si))
1477 {
1478 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1479 if (!stmt_info)
1480 continue;
1481 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1482 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1483 && !PURE_SLP_STMT (stmt_info))
1484 /* STMT needs both SLP and loop-based vectorization. */
1485 only_slp_in_loop = false;
1486 }
1487 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1488 gsi_next (&si))
1489 {
1490 if (is_gimple_debug (gsi_stmt (si)))
1491 continue;
1492 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1493 stmt_info = vect_stmt_to_vectorize (stmt_info);
1494 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1495 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1496 && !PURE_SLP_STMT (stmt_info))
1497 /* STMT needs both SLP and loop-based vectorization. */
1498 only_slp_in_loop = false;
1499 }
1500 }
1501
1502 if (only_slp_in_loop)
1503 {
1504 if (dump_enabled_p ())
1505 dump_printf_loc (MSG_NOTE, vect_location,
1506 "Loop contains only SLP stmts\n");
1507 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1508 }
1509 else
1510 {
1511 if (dump_enabled_p ())
1512 dump_printf_loc (MSG_NOTE, vect_location,
1513 "Loop contains SLP and non-SLP stmts\n");
1514 /* Both the vectorization factor and unroll factor have the form
1515 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1516 so they must have a common multiple. */
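/* A small worked example (illustrative only): if the non-SLP statements
   require a vectorization factor of 4 and the SLP instances require an
   unrolling factor of 2, force_common_multiple yields 4; with factors
   4 and 3 it would yield their least common multiple, 12.  */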
1517 vectorization_factor
1518 = force_common_multiple (vectorization_factor,
1519 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1520 }
1521
1522 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1523 if (dump_enabled_p ())
1524 {
1525 dump_printf_loc (MSG_NOTE, vect_location,
1526 "Updating vectorization factor to ");
1527 dump_dec (MSG_NOTE, vectorization_factor);
1528 dump_printf (MSG_NOTE, ".\n");
1529 }
1530 }
1531
1532 /* Return true if STMT_INFO describes a double reduction phi and if
1533 the other phi in the reduction is also relevant for vectorization.
1534 This rejects cases such as:
1535
1536 outer1:
1537 x_1 = PHI <x_3(outer2), ...>;
1538 ...
1539
1540 inner:
1541 x_2 = ...;
1542 ...
1543
1544 outer2:
1545 x_3 = PHI <x_2(inner)>;
1546
1547 if nothing in x_2 or elsewhere makes x_1 relevant. */
1548
1549 static bool
1550 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1551 {
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1553 return false;
1554
1555 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1556 }
1557
1558 /* Function vect_analyze_loop_operations.
1559
1560 Scan the loop stmts and make sure they are all vectorizable. */
1561
1562 static opt_result
1563 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1564 {
1565 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567 int nbbs = loop->num_nodes;
1568 int i;
1569 stmt_vec_info stmt_info;
1570 bool need_to_vectorize = false;
1571 bool ok;
1572
1573 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1574
1575 auto_vec<stmt_info_for_cost> cost_vec;
1576
1577 for (i = 0; i < nbbs; i++)
1578 {
1579 basic_block bb = bbs[i];
1580
1581 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1582 gsi_next (&si))
1583 {
1584 gphi *phi = si.phi ();
1585 ok = true;
1586
1587 stmt_info = loop_vinfo->lookup_stmt (phi);
1588 if (dump_enabled_p ())
1589 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1590 if (virtual_operand_p (gimple_phi_result (phi)))
1591 continue;
1592
1593 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1594 (i.e., a phi in the tail of the outer-loop). */
1595 if (! is_loop_header_bb_p (bb))
1596 {
1597 /* FORNOW: we currently don't support the case that these phis
1598 are not used in the outer loop (unless it is a double reduction,
1599 i.e., this phi is vect_reduction_def), because that case would
1600 require us to actually do something here. */
1601 if (STMT_VINFO_LIVE_P (stmt_info)
1602 && !vect_active_double_reduction_p (stmt_info))
1603 return opt_result::failure_at (phi,
1604 "Unsupported loop-closed phi"
1605 " in outer-loop.\n");
1606
1607 /* If PHI is used in the outer loop, we check that its operand
1608 is defined in the inner loop. */
1609 if (STMT_VINFO_RELEVANT_P (stmt_info))
1610 {
1611 tree phi_op;
1612
1613 if (gimple_phi_num_args (phi) != 1)
1614 return opt_result::failure_at (phi, "unsupported phi");
1615
1616 phi_op = PHI_ARG_DEF (phi, 0);
1617 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1618 if (!op_def_info)
1619 return opt_result::failure_at (phi, "unsupported phi\n");
1620
1621 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1622 && (STMT_VINFO_RELEVANT (op_def_info)
1623 != vect_used_in_outer_by_reduction))
1624 return opt_result::failure_at (phi, "unsupported phi\n");
1625
1626 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1627 || (STMT_VINFO_DEF_TYPE (stmt_info)
1628 == vect_double_reduction_def))
1629 && !vectorizable_lc_phi (loop_vinfo,
1630 stmt_info, NULL, NULL))
1631 return opt_result::failure_at (phi, "unsupported phi\n");
1632 }
1633
1634 continue;
1635 }
1636
1637 gcc_assert (stmt_info);
1638
1639 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1640 || STMT_VINFO_LIVE_P (stmt_info))
1641 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1642 /* A scalar-dependence cycle that we don't support. */
1643 return opt_result::failure_at (phi,
1644 "not vectorized:"
1645 " scalar dependence cycle.\n");
1646
1647 if (STMT_VINFO_RELEVANT_P (stmt_info))
1648 {
1649 need_to_vectorize = true;
1650 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1651 && ! PURE_SLP_STMT (stmt_info))
1652 ok = vectorizable_induction (loop_vinfo,
1653 stmt_info, NULL, NULL,
1654 &cost_vec);
1655 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1656 || (STMT_VINFO_DEF_TYPE (stmt_info)
1657 == vect_double_reduction_def)
1658 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1659 && ! PURE_SLP_STMT (stmt_info))
1660 ok = vectorizable_reduction (loop_vinfo,
1661 stmt_info, NULL, NULL, &cost_vec);
1662 }
1663
1664 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1665 if (ok
1666 && STMT_VINFO_LIVE_P (stmt_info)
1667 && !PURE_SLP_STMT (stmt_info))
1668 ok = vectorizable_live_operation (loop_vinfo,
1669 stmt_info, NULL, NULL, NULL,
1670 -1, false, &cost_vec);
1671
1672 if (!ok)
1673 return opt_result::failure_at (phi,
1674 "not vectorized: relevant phi not "
1675 "supported: %G",
1676 static_cast <gimple *> (phi));
1677 }
1678
1679 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1680 gsi_next (&si))
1681 {
1682 gimple *stmt = gsi_stmt (si);
1683 if (!gimple_clobber_p (stmt)
1684 && !is_gimple_debug (stmt))
1685 {
1686 opt_result res
1687 = vect_analyze_stmt (loop_vinfo,
1688 loop_vinfo->lookup_stmt (stmt),
1689 &need_to_vectorize,
1690 NULL, NULL, &cost_vec);
1691 if (!res)
1692 return res;
1693 }
1694 }
1695 } /* bbs */
1696
1697 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1698
1699 /* All operations in the loop are either irrelevant (they deal with loop
1700 control, or are dead), or are only used outside the loop and can be moved
1701 out of the loop (e.g. invariants, inductions). The loop can be
1702 optimized away by scalar optimizations. We're better off not
1703 touching this loop. */
1704 if (!need_to_vectorize)
1705 {
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "All the computation can be taken out of the loop.\n");
1709 return opt_result::failure_at
1710 (vect_location,
1711 "not vectorized: redundant loop. no profit to vectorize.\n");
1712 }
1713
1714 return opt_result::success ();
1715 }
1716
1717 /* Return true if we know that the iteration count is smaller than the
1718 vectorization factor. Return false if it isn't, or if we can't be sure
1719 either way. */
1720
1721 static bool
1722 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1723 {
1724 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1725
1726 HOST_WIDE_INT max_niter;
1727 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1728 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1729 else
1730 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1731
1732 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1733 return true;
1734
1735 return false;
1736 }
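/* As an illustration, assuming a target whose vectorization factor for the
   loop below would be 8, the known trip count of

     int a[4], b[4], c[4];
     void f (void) { for (int i = 0; i < 4; i++) a[i] = b[i] + c[i]; }

   gives max_niter == 4 < assumed_vf == 8, so the check above returns true
   and vect_analyze_loop_costing rejects the loop unless it can operate on
   partially-populated vectors.  */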
1737
1738 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1739 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1740 definitely no, or -1 if it's worth retrying. */
1741
1742 static int
1743 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1744 {
1745 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1746 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1747
1748 /* Only loops that can handle partially-populated vectors can have iteration
1749 counts less than the vectorization factor. */
1750 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1751 {
1752 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1753 {
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756 "not vectorized: iteration count smaller than "
1757 "vectorization factor.\n");
1758 return 0;
1759 }
1760 }
1761
1762 int min_profitable_iters, min_profitable_estimate;
1763 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1764 &min_profitable_estimate);
1765
1766 if (min_profitable_iters < 0)
1767 {
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1770 "not vectorized: vectorization not profitable.\n");
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "not vectorized: vector version will never be "
1774 "profitable.\n");
1775 return -1;
1776 }
1777
1778 int min_scalar_loop_bound = (param_min_vect_loop_bound
1779 * assumed_vf);
1780
 1781   /* Use the cost model only if it is more conservative than the user-specified
1782 threshold. */
1783 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1784 min_profitable_iters);
1785
1786 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1787
1788 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1789 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1790 {
1791 if (dump_enabled_p ())
1792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1793 "not vectorized: vectorization not profitable.\n");
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_NOTE, vect_location,
1796 "not vectorized: iteration count smaller than user "
1797 "specified loop bound parameter or minimum profitable "
1798 "iterations (whichever is more conservative).\n");
1799 return 0;
1800 }
1801
 1802   /* The static profitability threshold min_profitable_estimate includes
1803 the cost of having to check at runtime whether the scalar loop
1804 should be used instead. If it turns out that we don't need or want
1805 such a check, the threshold we should use for the static estimate
1806 is simply the point at which the vector loop becomes more profitable
1807 than the scalar loop. */
1808 if (min_profitable_estimate > min_profitable_iters
1809 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1810 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1811 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1812 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1813 {
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1816 " choice between the scalar and vector loops\n");
1817 min_profitable_estimate = min_profitable_iters;
1818 }
1819
1820 HOST_WIDE_INT estimated_niter;
1821
1822 /* If we are vectorizing an epilogue then we know the maximum number of
1823 scalar iterations it will cover is at least one lower than the
1824 vectorization factor of the main loop. */
1825 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1826 estimated_niter
1827 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1828 else
1829 {
1830 estimated_niter = estimated_stmt_executions_int (loop);
1831 if (estimated_niter == -1)
1832 estimated_niter = likely_max_stmt_executions_int (loop);
1833 }
1834 if (estimated_niter != -1
1835 && ((unsigned HOST_WIDE_INT) estimated_niter
1836 < MAX (th, (unsigned) min_profitable_estimate)))
1837 {
1838 if (dump_enabled_p ())
1839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840 "not vectorized: estimated iteration count too "
1841 "small.\n");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "not vectorized: estimated iteration count smaller "
1845 "than specified loop bound parameter or minimum "
1846 "profitable iterations (whichever is more "
1847 "conservative).\n");
1848 return -1;
1849 }
1850
1851 return 1;
1852 }
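/* A worked example of the threshold logic above, with hypothetical numbers:
   if param_min_vect_loop_bound is 0, assumed_vf is 4 and the cost model
   reports min_profitable_iters == 7, then

     th = MAX (0 * 4, 7) == 7

   so a loop with a known trip count of 6 is rejected as not profitable,
   while a loop with an unknown trip count keeps 7 as its runtime
   cost-model threshold in LOOP_VINFO_COST_MODEL_THRESHOLD.  */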
1853
1854 static opt_result
1855 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1856 vec<data_reference_p> *datarefs,
1857 unsigned int *n_stmts)
1858 {
1859 *n_stmts = 0;
1860 for (unsigned i = 0; i < loop->num_nodes; i++)
1861 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1862 !gsi_end_p (gsi); gsi_next (&gsi))
1863 {
1864 gimple *stmt = gsi_stmt (gsi);
1865 if (is_gimple_debug (stmt))
1866 continue;
1867 ++(*n_stmts);
1868 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1869 NULL, 0);
1870 if (!res)
1871 {
1872 if (is_gimple_call (stmt) && loop->safelen)
1873 {
1874 tree fndecl = gimple_call_fndecl (stmt), op;
1875 if (fndecl != NULL_TREE)
1876 {
1877 cgraph_node *node = cgraph_node::get (fndecl);
1878 if (node != NULL && node->simd_clones != NULL)
1879 {
1880 unsigned int j, n = gimple_call_num_args (stmt);
1881 for (j = 0; j < n; j++)
1882 {
1883 op = gimple_call_arg (stmt, j);
1884 if (DECL_P (op)
1885 || (REFERENCE_CLASS_P (op)
1886 && get_base_address (op)))
1887 break;
1888 }
1889 op = gimple_call_lhs (stmt);
1890 /* Ignore #pragma omp declare simd functions
1891 if they don't have data references in the
1892 call stmt itself. */
1893 if (j == n
1894 && !(op
1895 && (DECL_P (op)
1896 || (REFERENCE_CLASS_P (op)
1897 && get_base_address (op)))))
1898 continue;
1899 }
1900 }
1901 }
1902 return res;
1903 }
1904 /* If dependence analysis will give up due to the limit on the
1905 number of datarefs stop here and fail fatally. */
1906 if (datarefs->length ()
1907 > (unsigned)param_loop_max_datarefs_for_datadeps)
1908 return opt_result::failure_at (stmt, "exceeded param "
1909 "loop-max-datarefs-for-datadeps\n");
1910 }
1911 return opt_result::success ();
1912 }
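/* As an example of the simd-clone special case above: in a loop annotated
   with "#pragma omp simd" (which gives it a non-zero safelen), a call such
   as

     #pragma omp declare simd
     extern int f (int);
     ...
     b[i] = f (a[i]);

   carries no data reference in the call statement itself, so if analyzing
   that call fails here the failure is ignored and the call is left for the
   SIMD-clone handling later in the vectorizer.  */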
1913
1914 /* Look for SLP-only access groups and turn each individual access into its own
1915 group. */
1916 static void
1917 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1918 {
1919 unsigned int i;
1920 struct data_reference *dr;
1921
1922 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1923
1924 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1925 FOR_EACH_VEC_ELT (datarefs, i, dr)
1926 {
1927 gcc_assert (DR_REF (dr));
1928 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1929
 1930       /* Check if the access is part of an interleaving chain.  */
1931 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1932 {
1933 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1934 unsigned int group_size = DR_GROUP_SIZE (first_element);
1935
 1936 	  /* Check if this is an SLP-only group.  */
1937 if (!STMT_SLP_TYPE (stmt_info)
1938 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1939 {
1940 /* Dissolve the group. */
1941 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1942
1943 stmt_vec_info vinfo = first_element;
1944 while (vinfo)
1945 {
1946 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1947 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1948 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1949 DR_GROUP_SIZE (vinfo) = 1;
1950 if (STMT_VINFO_STRIDED_P (first_element))
1951 DR_GROUP_GAP (vinfo) = 0;
1952 else
1953 DR_GROUP_GAP (vinfo) = group_size - 1;
1954 vinfo = next;
1955 }
1956 }
1957 }
1958 }
1959 }
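/* For instance, assuming a two-element interleaved load group that is only
   usable under SLP, such as the loads from a[2*i] and a[2*i+1] in

     for (int i = 0; i < n; i++)
       {
	 x[i] = a[2 * i];
	 y[i] = a[2 * i + 1];
       }

   dissolving the group turns each load into its own group of size 1 with
   DR_GROUP_GAP == 1, i.e. a single-element access that skips over the
   element previously covered by the other group member.  */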
1960
1961
1962 /* Decides whether we need to create an epilogue loop to handle
1963 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1964
1965 void
1966 determine_peel_for_niter (loop_vec_info loop_vinfo)
1967 {
1968 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1969
1970 unsigned HOST_WIDE_INT const_vf;
1971 HOST_WIDE_INT max_niter
1972 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1973
1974 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1975 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1976 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1977 (loop_vinfo));
1978
1979 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1980 /* The main loop handles all iterations. */
1981 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1982 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1983 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1984 {
1985 /* Work out the (constant) number of iterations that need to be
1986 peeled for reasons other than niters. */
1987 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1988 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1989 peel_niter += 1;
1990 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1991 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1992 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1993 }
1994 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1995 /* ??? When peeling for gaps but not alignment, we could
1996 try to check whether the (variable) niters is known to be
1997 VF * N + 1. That's something of a niche case though. */
1998 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1999 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2000 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2001 < (unsigned) exact_log2 (const_vf))
2002 /* In case of versioning, check if the maximum number of
2003 iterations is greater than th. If they are identical,
2004 the epilogue is unnecessary. */
2005 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2006 || ((unsigned HOST_WIDE_INT) max_niter
2007 > (th / const_vf) * const_vf))))
2008 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2009 }
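/* A hypothetical example of the decision above: with a constant trip count
   of 100, a vectorization factor of 8, no partial vectors and no peeling
   for alignment or gaps, 100 is not a multiple of 8, so PEELING_FOR_NITER
   is set and an epilogue loop handles the remaining 100 % 8 == 4 scalar
   iterations; with a trip count of 96 no epilogue is needed.  */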
2010
2011
2012 /* Function vect_analyze_loop_2.
2013
2014 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2015 for it. The different analyses will record information in the
2016 loop_vec_info struct. */
2017 static opt_result
2018 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2019 {
2020 opt_result ok = opt_result::success ();
2021 int res;
2022 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2023 poly_uint64 min_vf = 2;
2024 loop_vec_info orig_loop_vinfo = NULL;
2025
2026 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2027 loop_vec_info of the first vectorized loop. */
2028 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2029 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2030 else
2031 orig_loop_vinfo = loop_vinfo;
2032 gcc_assert (orig_loop_vinfo);
2033
2034 /* The first group of checks is independent of the vector size. */
2035 fatal = true;
2036
2037 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2038 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2039 return opt_result::failure_at (vect_location,
2040 "not vectorized: simd if(0)\n");
2041
2042 /* Find all data references in the loop (which correspond to vdefs/vuses)
2043 and analyze their evolution in the loop. */
2044
2045 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2046
2047 /* Gather the data references and count stmts in the loop. */
2048 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2049 {
2050 opt_result res
2051 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2052 &LOOP_VINFO_DATAREFS (loop_vinfo),
2053 n_stmts);
2054 if (!res)
2055 {
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "not vectorized: loop contains function "
2059 "calls or data references that cannot "
2060 "be analyzed\n");
2061 return res;
2062 }
2063 loop_vinfo->shared->save_datarefs ();
2064 }
2065 else
2066 loop_vinfo->shared->check_datarefs ();
2067
2068 /* Analyze the data references and also adjust the minimal
2069 vectorization factor according to the loads and stores. */
2070
2071 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2072 if (!ok)
2073 {
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "bad data references.\n");
2077 return ok;
2078 }
2079
2080 /* Classify all cross-iteration scalar data-flow cycles.
2081 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2082 vect_analyze_scalar_cycles (loop_vinfo);
2083
2084 vect_pattern_recog (loop_vinfo);
2085
2086 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2087
2088 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2089 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2090
2091 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2092 if (!ok)
2093 {
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 "bad data access.\n");
2097 return ok;
2098 }
2099
2100 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2101
2102 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2103 if (!ok)
2104 {
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107 "unexpected pattern.\n");
2108 return ok;
2109 }
2110
 2111   /* The rest of the analysis below depends on the vector size in some way.  */
2112 fatal = false;
2113
2114 /* Analyze data dependences between the data-refs in the loop
2115 and adjust the maximum vectorization factor according to
2116 the dependences.
2117 FORNOW: fail at the first data dependence that we encounter. */
2118
2119 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2120 if (!ok)
2121 {
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124 "bad data dependence.\n");
2125 return ok;
2126 }
2127 if (max_vf != MAX_VECTORIZATION_FACTOR
2128 && maybe_lt (max_vf, min_vf))
2129 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2130 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2131
2132 ok = vect_determine_vectorization_factor (loop_vinfo);
2133 if (!ok)
2134 {
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137 "can't determine vectorization factor.\n");
2138 return ok;
2139 }
2140 if (max_vf != MAX_VECTORIZATION_FACTOR
2141 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2142 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2143
2144 /* Compute the scalar iteration cost. */
2145 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2146
2147 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2148
2149 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2150 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2151 if (!ok)
2152 return ok;
2153
2154 /* If there are any SLP instances mark them as pure_slp. */
2155 bool slp = vect_make_slp_decision (loop_vinfo);
2156 if (slp)
2157 {
2158 /* Find stmts that need to be both vectorized and SLPed. */
2159 vect_detect_hybrid_slp (loop_vinfo);
2160
2161 /* Update the vectorization factor based on the SLP decision. */
2162 vect_update_vf_for_slp (loop_vinfo);
2163
2164 /* Optimize the SLP graph with the vectorization factor fixed. */
2165 vect_optimize_slp (loop_vinfo);
2166 }
2167
2168 bool saved_can_use_partial_vectors_p
2169 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2170
2171 /* We don't expect to have to roll back to anything other than an empty
2172 set of rgroups. */
2173 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2174
2175 /* This is the point where we can re-start analysis with SLP forced off. */
2176 start_over:
2177
2178 /* Now the vectorization factor is final. */
2179 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2180 gcc_assert (known_ne (vectorization_factor, 0U));
2181
2182 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2183 {
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "vectorization_factor = ");
2186 dump_dec (MSG_NOTE, vectorization_factor);
2187 dump_printf (MSG_NOTE, ", niters = %wd\n",
2188 LOOP_VINFO_INT_NITERS (loop_vinfo));
2189 }
2190
2191 /* Analyze the alignment of the data-refs in the loop.
2192 Fail if a data reference is found that cannot be vectorized. */
2193
2194 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2195 if (!ok)
2196 {
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2199 "bad data alignment.\n");
2200 return ok;
2201 }
2202
2203 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2204 It is important to call pruning after vect_analyze_data_ref_accesses,
2205 since we use grouping information gathered by interleaving analysis. */
2206 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2207 if (!ok)
2208 return ok;
2209
2210 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2211 vectorization, since we do not want to add extra peeling or
2212 add versioning for alignment. */
2213 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2214 /* This pass will decide on using loop versioning and/or loop peeling in
2215 order to enhance the alignment of data references in the loop. */
2216 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2217 if (!ok)
2218 return ok;
2219
2220 if (slp)
2221 {
2222 /* Analyze operations in the SLP instances. Note this may
2223 remove unsupported SLP instances which makes the above
2224 SLP kind detection invalid. */
2225 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2226 vect_slp_analyze_operations (loop_vinfo);
2227 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2228 {
2229 ok = opt_result::failure_at (vect_location,
2230 "unsupported SLP instances\n");
2231 goto again;
2232 }
2233 }
2234
2235 /* Dissolve SLP-only groups. */
2236 vect_dissolve_slp_only_groups (loop_vinfo);
2237
2238 /* Scan all the remaining operations in the loop that are not subject
2239 to SLP and make sure they are vectorizable. */
2240 ok = vect_analyze_loop_operations (loop_vinfo);
2241 if (!ok)
2242 {
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 "bad operation or unsupported loop bound.\n");
2246 return ok;
2247 }
2248
 2249   /* For now we don't expect to mix the masking and length approaches for one
 2250      loop, so disable partial vectors if both are recorded.  */
2251 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2252 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2253 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2254 {
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "can't vectorize a loop with partial vectors"
2258 " because we don't expect to mix different"
2259 " approaches with partial vectors for the"
2260 " same loop.\n");
2261 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2262 }
2263
2264 /* Decide whether to vectorize a loop with partial vectors for
2265 this vectorization factor. */
2266 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2267 {
2268 if (param_vect_partial_vector_usage == 0)
2269 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2270 else if (vect_verify_full_masking (loop_vinfo)
2271 || vect_verify_loop_lens (loop_vinfo))
2272 {
 2273 	  /* Epilogues and other cases with niters known to be less than VF
 2274 	     can still make full use of length-based vector accesses.  */
2275 if (param_vect_partial_vector_usage == 1
2276 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2277 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2278 {
2279 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2280 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2281 }
2282 else
2283 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2284 }
2285 else
2286 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2287 }
2288 else
2289 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2290
2291 if (dump_enabled_p ())
2292 {
2293 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "operating on partial vectors.\n");
2296 else
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "operating only on full vectors.\n");
2299 }
2300
2301 /* If epilog loop is required because of data accesses with gaps,
 2302      one additional iteration needs to be peeled.  Check if there are
 2303      enough iterations for vectorization.  */
2304 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2305 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2306 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2307 {
2308 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2309 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2310
2311 if (known_lt (wi::to_widest (scalar_niters), vf))
2312 return opt_result::failure_at (vect_location,
 2313 				   "loop does not have enough iterations to"
2314 " support peeling for gaps.\n");
2315 }
2316
2317 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2318 to be able to handle fewer than VF scalars, or needs to have a lower VF
2319 than the main loop. */
2320 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2321 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2322 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2323 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2324 return opt_result::failure_at (vect_location,
2325 "Vectorization factor too high for"
2326 " epilogue loop.\n");
2327
2328 /* Check the costings of the loop make vectorizing worthwhile. */
2329 res = vect_analyze_loop_costing (loop_vinfo);
2330 if (res < 0)
2331 {
2332 ok = opt_result::failure_at (vect_location,
2333 "Loop costings may not be worthwhile.\n");
2334 goto again;
2335 }
2336 if (!res)
2337 return opt_result::failure_at (vect_location,
2338 "Loop costings not worthwhile.\n");
2339
2340 determine_peel_for_niter (loop_vinfo);
2341 /* If an epilogue loop is required make sure we can create one. */
2342 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2343 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2344 {
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2347 if (!vect_can_advance_ivs_p (loop_vinfo)
2348 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2349 single_exit (LOOP_VINFO_LOOP
2350 (loop_vinfo))))
2351 {
2352 ok = opt_result::failure_at (vect_location,
2353 "not vectorized: can't create required "
2354 "epilog loop\n");
2355 goto again;
2356 }
2357 }
2358
 2359   /* During peeling, we need to check whether the number of loop iterations
 2360      is enough for both the peeled prolog loop and the vector loop.  This
 2361      check can be merged with the threshold check of loop versioning, so
 2362      increase the threshold for this case if necessary.
2363
2364 If we are analyzing an epilogue we still want to check what its
2365 versioning threshold would be. If we decide to vectorize the epilogues we
2366 will want to use the lowest versioning threshold of all epilogues and main
2367 loop. This will enable us to enter a vectorized epilogue even when
2368 versioning the loop. We can't simply check whether the epilogue requires
2369 versioning though since we may have skipped some versioning checks when
2370 analyzing the epilogue. For instance, checks for alias versioning will be
2371 skipped when dealing with epilogues as we assume we already checked them
2372 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2373 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2374 {
2375 poly_uint64 niters_th = 0;
2376 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2377
2378 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2379 {
2380 /* Niters for peeled prolog loop. */
2381 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2382 {
2383 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2384 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2385 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2386 }
2387 else
2388 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2389 }
2390
2391 /* Niters for at least one iteration of vectorized loop. */
2392 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2393 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2394 /* One additional iteration because of peeling for gap. */
2395 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2396 niters_th += 1;
2397
2398 /* Use the same condition as vect_transform_loop to decide when to use
2399 the cost to determine a versioning threshold. */
2400 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2401 && ordered_p (th, niters_th))
2402 niters_th = ordered_max (poly_uint64 (th), niters_th);
2403
2404 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2405 }
2406
2407 gcc_assert (known_eq (vectorization_factor,
2408 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2409
2410 /* Ok to vectorize! */
2411 return opt_result::success ();
2412
2413 again:
2414 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2415 gcc_assert (!ok);
2416
 2417   /* Try again with SLP forced off, but if we didn't do any SLP there is
2418 no point in re-trying. */
2419 if (!slp)
2420 return ok;
2421
2422 /* If there are reduction chains re-trying will fail anyway. */
2423 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2424 return ok;
2425
2426 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2427 via interleaving or lane instructions. */
2428 slp_instance instance;
2429 slp_tree node;
2430 unsigned i, j;
2431 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2432 {
2433 stmt_vec_info vinfo;
2434 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2435 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2436 continue;
2437 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2438 unsigned int size = DR_GROUP_SIZE (vinfo);
2439 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2440 if (! vect_store_lanes_supported (vectype, size, false)
2441 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2442 && ! vect_grouped_store_supported (vectype, size))
2443 return opt_result::failure_at (vinfo->stmt,
2444 "unsupported grouped store\n");
2445 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2446 {
2447 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2448 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2449 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2450 size = DR_GROUP_SIZE (vinfo);
2451 vectype = STMT_VINFO_VECTYPE (vinfo);
2452 if (! vect_load_lanes_supported (vectype, size, false)
2453 && ! vect_grouped_load_supported (vectype, single_element_p,
2454 size))
2455 return opt_result::failure_at (vinfo->stmt,
2456 "unsupported grouped load\n");
2457 }
2458 }
2459
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE, vect_location,
2462 "re-trying with SLP disabled\n");
2463
2464 /* Roll back state appropriately. No SLP this time. */
2465 slp = false;
 2466   /* Restore the vectorization factor as it was without SLP.  */
2467 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2468 /* Free the SLP instances. */
2469 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2470 vect_free_slp_instance (instance, false);
2471 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2472 /* Reset SLP type to loop_vect on all stmts. */
2473 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2474 {
2475 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2476 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2477 !gsi_end_p (si); gsi_next (&si))
2478 {
2479 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2480 STMT_SLP_TYPE (stmt_info) = loop_vect;
2481 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2482 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2483 {
2484 /* vectorizable_reduction adjusts reduction stmt def-types,
2485 restore them to that of the PHI. */
2486 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2487 = STMT_VINFO_DEF_TYPE (stmt_info);
2488 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2489 (STMT_VINFO_REDUC_DEF (stmt_info)))
2490 = STMT_VINFO_DEF_TYPE (stmt_info);
2491 }
2492 }
2493 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2494 !gsi_end_p (si); gsi_next (&si))
2495 {
2496 if (is_gimple_debug (gsi_stmt (si)))
2497 continue;
2498 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2499 STMT_SLP_TYPE (stmt_info) = loop_vect;
2500 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2501 {
2502 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2503 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2504 STMT_SLP_TYPE (stmt_info) = loop_vect;
2505 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2506 !gsi_end_p (pi); gsi_next (&pi))
2507 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2508 = loop_vect;
2509 }
2510 }
2511 }
2512 /* Free optimized alias test DDRS. */
2513 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2514 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2515 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2516 /* Reset target cost data. */
2517 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2518 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2519 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2520 /* Reset accumulated rgroup information. */
2521 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2522 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2523 /* Reset assorted flags. */
2524 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2525 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2526 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2527 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2528 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2529 = saved_can_use_partial_vectors_p;
2530
2531 goto start_over;
2532 }
2533
2534 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2535 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2536 OLD_LOOP_VINFO is better unless something specifically indicates
2537 otherwise.
2538
2539 Note that this deliberately isn't a partial order. */
2540
2541 static bool
2542 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2543 loop_vec_info old_loop_vinfo)
2544 {
2545 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2546 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2547
2548 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2549 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2550
2551 /* Always prefer a VF of loop->simdlen over any other VF. */
2552 if (loop->simdlen)
2553 {
2554 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2555 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2556 if (new_simdlen_p != old_simdlen_p)
2557 return new_simdlen_p;
2558 }
2559
2560 /* Limit the VFs to what is likely to be the maximum number of iterations,
2561 to handle cases in which at least one loop_vinfo is fully-masked. */
2562 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2563 if (estimated_max_niter != -1)
2564 {
2565 if (known_le (estimated_max_niter, new_vf))
2566 new_vf = estimated_max_niter;
2567 if (known_le (estimated_max_niter, old_vf))
2568 old_vf = estimated_max_niter;
2569 }
2570
2571 /* Check whether the (fractional) cost per scalar iteration is lower
2572 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2573 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2574 * poly_widest_int (old_vf));
2575 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2576 * poly_widest_int (new_vf));
2577 if (maybe_lt (rel_old, rel_new))
2578 {
2579 /* When old_loop_vinfo uses a variable vectorization factor,
2580 we know that it has a lower cost for at least one runtime VF.
2581 However, we don't know how likely that VF is.
2582
2583 One option would be to compare the costs for the estimated VFs.
2584 The problem is that that can put too much pressure on the cost
2585 model. E.g. if the estimated VF is also the lowest possible VF,
2586 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2587 for the estimated VF, we'd then choose new_loop_vinfo even
2588 though (a) new_loop_vinfo might not actually be better than
2589 old_loop_vinfo for that VF and (b) it would be significantly
2590 worse at larger VFs.
2591
2592 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2593 no more expensive than old_loop_vinfo even after doubling the
2594 estimated old_loop_vinfo VF. For all but trivial loops, this
2595 ensures that we only pick new_loop_vinfo if it is significantly
2596 better than old_loop_vinfo at the estimated VF. */
2597 if (rel_new.is_constant ())
2598 return false;
2599
2600 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2601 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2602 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2603 * widest_int (old_estimated_vf));
2604 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2605 * widest_int (new_estimated_vf));
2606 return estimated_rel_new * 2 <= estimated_rel_old;
2607 }
2608 if (known_lt (rel_new, rel_old))
2609 return true;
2610
2611 /* If there's nothing to choose between the loop bodies, see whether
2612 there's a difference in the prologue and epilogue costs. */
2613 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2614 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2615
2616 return false;
2617 }
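/* A small numeric sketch of the comparison above, with made-up costs: if
   new_loop_vinfo has vec_inside_cost 20 at VF 8 and old_loop_vinfo has
   vec_inside_cost 12 at VF 4, then

     rel_new = 20 * 4 = 80
     rel_old = 12 * 8 = 96

   so the new loop body is cheaper per scalar iteration and, absent a
   loop->simdlen preference, the new loop_vinfo is preferred.  */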
2618
2619 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2620 true if we should. */
2621
2622 static bool
2623 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2624 loop_vec_info old_loop_vinfo)
2625 {
2626 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2627 return false;
2628
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_NOTE, vect_location,
2631 "***** Preferring vector mode %s to vector mode %s\n",
2632 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2633 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2634 return true;
2635 }
2636
2637 /* Function vect_analyze_loop.
2638
2639 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2640 for it. The different analyses will record information in the
2641 loop_vec_info struct. */
2642 opt_loop_vec_info
2643 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2644 {
2645 auto_vector_modes vector_modes;
2646
2647 /* Autodetect first vector size we try. */
2648 unsigned int autovec_flags
2649 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2650 loop->simdlen != 0);
2651 unsigned int mode_i = 0;
2652
2653 DUMP_VECT_SCOPE ("analyze_loop_nest");
2654
2655 if (loop_outer (loop)
2656 && loop_vec_info_for_loop (loop_outer (loop))
2657 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2658 return opt_loop_vec_info::failure_at (vect_location,
2659 "outer-loop already vectorized.\n");
2660
2661 if (!find_loop_nest (loop, &shared->loop_nest))
2662 return opt_loop_vec_info::failure_at
2663 (vect_location,
2664 "not vectorized: loop nest containing two or more consecutive inner"
2665 " loops cannot be vectorized\n");
2666
2667 unsigned n_stmts = 0;
2668 machine_mode autodetected_vector_mode = VOIDmode;
2669 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2670 machine_mode next_vector_mode = VOIDmode;
2671 poly_uint64 lowest_th = 0;
2672 unsigned vectorized_loops = 0;
2673 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2674 && !unlimited_cost_model (loop));
2675
2676 bool vect_epilogues = false;
2677 opt_result res = opt_result::success ();
2678 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2679 while (1)
2680 {
2681 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2682 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2683 if (!loop_vinfo)
2684 {
2685 if (dump_enabled_p ())
2686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2687 "bad loop form.\n");
2688 gcc_checking_assert (first_loop_vinfo == NULL);
2689 return loop_vinfo;
2690 }
2691 loop_vinfo->vector_mode = next_vector_mode;
2692
2693 bool fatal = false;
2694
2695 /* When pick_lowest_cost_p is true, we should in principle iterate
2696 over all the loop_vec_infos that LOOP_VINFO could replace and
2697 try to vectorize LOOP_VINFO under the same conditions.
2698 E.g. when trying to replace an epilogue loop, we should vectorize
2699 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2700 to replace the main loop, we should vectorize LOOP_VINFO as a main
2701 loop too.
2702
2703 However, autovectorize_vector_modes is usually sorted as follows:
2704
2705 - Modes that naturally produce lower VFs usually follow modes that
2706 naturally produce higher VFs.
2707
2708 - When modes naturally produce the same VF, maskable modes
2709 usually follow unmaskable ones, so that the maskable mode
2710 can be used to vectorize the epilogue of the unmaskable mode.
2711
2712 This order is preferred because it leads to the maximum
2713 epilogue vectorization opportunities. Targets should only use
2714 a different order if they want to make wide modes available while
2715 disparaging them relative to earlier, smaller modes. The assumption
2716 in that case is that the wider modes are more expensive in some
2717 way that isn't reflected directly in the costs.
2718
2719 There should therefore be few interesting cases in which
2720 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2721 treated as a standalone loop, and ends up being genuinely cheaper
2722 than FIRST_LOOP_VINFO. */
2723 if (vect_epilogues)
2724 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2725
2726 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2727 if (mode_i == 0)
2728 autodetected_vector_mode = loop_vinfo->vector_mode;
2729 if (dump_enabled_p ())
2730 {
2731 if (res)
2732 dump_printf_loc (MSG_NOTE, vect_location,
2733 "***** Analysis succeeded with vector mode %s\n",
2734 GET_MODE_NAME (loop_vinfo->vector_mode));
2735 else
2736 dump_printf_loc (MSG_NOTE, vect_location,
2737 "***** Analysis failed with vector mode %s\n",
2738 GET_MODE_NAME (loop_vinfo->vector_mode));
2739 }
2740
2741 loop->aux = NULL;
2742
2743 if (!fatal)
2744 while (mode_i < vector_modes.length ()
2745 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2746 {
2747 if (dump_enabled_p ())
2748 dump_printf_loc (MSG_NOTE, vect_location,
2749 "***** The result for vector mode %s would"
2750 " be the same\n",
2751 GET_MODE_NAME (vector_modes[mode_i]));
2752 mode_i += 1;
2753 }
2754
2755 if (res)
2756 {
2757 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2758 vectorized_loops++;
2759
2760 /* Once we hit the desired simdlen for the first time,
2761 discard any previous attempts. */
2762 if (simdlen
2763 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2764 {
2765 delete first_loop_vinfo;
2766 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2767 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2768 simdlen = 0;
2769 }
2770 else if (pick_lowest_cost_p && first_loop_vinfo)
2771 {
2772 /* Keep trying to roll back vectorization attempts while the
2773 loop_vec_infos they produced were worse than this one. */
2774 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2775 while (!vinfos.is_empty ()
2776 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2777 {
2778 gcc_assert (vect_epilogues);
2779 delete vinfos.pop ();
2780 }
2781 if (vinfos.is_empty ()
2782 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2783 {
2784 delete first_loop_vinfo;
2785 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2786 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2787 }
2788 }
2789
2790 if (first_loop_vinfo == NULL)
2791 {
2792 first_loop_vinfo = loop_vinfo;
2793 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2794 }
2795 else if (vect_epilogues
2796 /* For now only allow one epilogue loop. */
2797 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2798 {
2799 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2800 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2801 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2802 || maybe_ne (lowest_th, 0U));
2803 /* Keep track of the known smallest versioning
2804 threshold. */
2805 if (ordered_p (lowest_th, th))
2806 lowest_th = ordered_min (lowest_th, th);
2807 }
2808 else
2809 {
2810 delete loop_vinfo;
2811 loop_vinfo = opt_loop_vec_info::success (NULL);
2812 }
2813
2814 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2815 enabled, SIMDUID is not set, it is the innermost loop and we have
2816 either already found the loop's SIMDLEN or there was no SIMDLEN to
2817 begin with.
2818 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2819 vect_epilogues = (!simdlen
2820 && loop->inner == NULL
2821 && param_vect_epilogues_nomask
2822 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2823 && !loop->simduid
2824 /* For now only allow one epilogue loop, but allow
2825 pick_lowest_cost_p to replace it. */
2826 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2827 || pick_lowest_cost_p));
2828
2829 /* Commit to first_loop_vinfo if we have no reason to try
2830 alternatives. */
2831 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2832 break;
2833 }
2834 else
2835 {
2836 delete loop_vinfo;
2837 loop_vinfo = opt_loop_vec_info::success (NULL);
2838 if (fatal)
2839 {
2840 gcc_checking_assert (first_loop_vinfo == NULL);
2841 break;
2842 }
2843 }
2844
 2845       /* Handle the case where the original loop can use partial
 2846 	 vectorization, but we only want to adopt it for the epilogue.
 2847 	 The retry should use the same vector mode as the original.  */
2848 if (vect_epilogues
2849 && loop_vinfo
2850 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2851 {
2852 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2853 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_NOTE, vect_location,
2856 "***** Re-trying analysis with same vector mode"
2857 " %s for epilogue with partial vectors.\n",
2858 GET_MODE_NAME (loop_vinfo->vector_mode));
2859 continue;
2860 }
2861
2862 if (mode_i < vector_modes.length ()
2863 && VECTOR_MODE_P (autodetected_vector_mode)
2864 && (related_vector_mode (vector_modes[mode_i],
2865 GET_MODE_INNER (autodetected_vector_mode))
2866 == autodetected_vector_mode)
2867 && (related_vector_mode (autodetected_vector_mode,
2868 GET_MODE_INNER (vector_modes[mode_i]))
2869 == vector_modes[mode_i]))
2870 {
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "***** Skipping vector mode %s, which would"
2874 " repeat the analysis for %s\n",
2875 GET_MODE_NAME (vector_modes[mode_i]),
2876 GET_MODE_NAME (autodetected_vector_mode));
2877 mode_i += 1;
2878 }
2879
2880 if (mode_i == vector_modes.length ()
2881 || autodetected_vector_mode == VOIDmode)
2882 break;
2883
2884 /* Try the next biggest vector size. */
2885 next_vector_mode = vector_modes[mode_i++];
2886 if (dump_enabled_p ())
2887 dump_printf_loc (MSG_NOTE, vect_location,
2888 "***** Re-trying analysis with vector mode %s\n",
2889 GET_MODE_NAME (next_vector_mode));
2890 }
2891
2892 if (first_loop_vinfo)
2893 {
2894 loop->aux = (loop_vec_info) first_loop_vinfo;
2895 if (dump_enabled_p ())
2896 dump_printf_loc (MSG_NOTE, vect_location,
2897 "***** Choosing vector mode %s\n",
2898 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2899 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2900 return first_loop_vinfo;
2901 }
2902
2903 return opt_loop_vec_info::propagate_failure (res);
2904 }
2905
2906 /* Return true if there is an in-order reduction function for CODE, storing
2907 it in *REDUC_FN if so. */
2908
2909 static bool
2910 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2911 {
2912 switch (code)
2913 {
2914 case PLUS_EXPR:
2915 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2916 return true;
2917
2918 default:
2919 return false;
2920 }
2921 }
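/* IFN_FOLD_LEFT_PLUS adds the vector elements into the scalar accumulator
   strictly from left to right, so for a 4-element vector v it behaves like
   the scalar sequence

     res = (((acc + v[0]) + v[1]) + v[2]) + v[3];

   which is what makes it usable for reductions that must stay in order;
   the other tree codes have no such in-order internal function here, so
   the function above returns false for them.  */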
2922
2923 /* Function reduction_fn_for_scalar_code
2924
2925 Input:
 2926    CODE - tree_code of a reduction operation.
2927
2928 Output:
2929 REDUC_FN - the corresponding internal function to be used to reduce the
2930 vector of partial results into a single scalar result, or IFN_LAST
2931 if the operation is a supported reduction operation, but does not have
2932 such an internal function.
2933
 2934    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
2935
2936 static bool
2937 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2938 {
2939 switch (code)
2940 {
2941 case MAX_EXPR:
2942 *reduc_fn = IFN_REDUC_MAX;
2943 return true;
2944
2945 case MIN_EXPR:
2946 *reduc_fn = IFN_REDUC_MIN;
2947 return true;
2948
2949 case PLUS_EXPR:
2950 *reduc_fn = IFN_REDUC_PLUS;
2951 return true;
2952
2953 case BIT_AND_EXPR:
2954 *reduc_fn = IFN_REDUC_AND;
2955 return true;
2956
2957 case BIT_IOR_EXPR:
2958 *reduc_fn = IFN_REDUC_IOR;
2959 return true;
2960
2961 case BIT_XOR_EXPR:
2962 *reduc_fn = IFN_REDUC_XOR;
2963 return true;
2964
2965 case MULT_EXPR:
2966 case MINUS_EXPR:
2967 *reduc_fn = IFN_LAST;
2968 return true;
2969
2970 default:
2971 return false;
2972 }
2973 }
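/* As an illustration, an integer sum reduction keeps a vector of partial
   sums in the vectorized loop and IFN_REDUC_PLUS collapses it to a scalar
   in the epilogue; conceptually, for a 4-element vector v,

     result = v[0] + v[1] + v[2] + v[3];

   For MULT_EXPR and MINUS_EXPR the mapping above yields IFN_LAST: the
   reduction is still supported, but the final collapse has to be
   open-coded (for example with element shifts and extracts) instead of
   using a single internal function.  */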
2974
2975 /* If there is a neutral value X such that SLP reduction NODE would not
2976 be affected by the introduction of additional X elements, return that X,
2977 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2978 is the vector type that would hold element X. REDUC_CHAIN is true if
2979 the SLP statements perform a single reduction, false if each statement
2980 performs an independent reduction. */
2981
2982 static tree
2983 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2984 tree_code code, bool reduc_chain)
2985 {
2986 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2987 stmt_vec_info stmt_vinfo = stmts[0];
2988 tree scalar_type = TREE_TYPE (vector_type);
2989 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2990 gcc_assert (loop);
2991
2992 switch (code)
2993 {
2994 case WIDEN_SUM_EXPR:
2995 case DOT_PROD_EXPR:
2996 case SAD_EXPR:
2997 case PLUS_EXPR:
2998 case MINUS_EXPR:
2999 case BIT_IOR_EXPR:
3000 case BIT_XOR_EXPR:
3001 return build_zero_cst (scalar_type);
3002
3003 case MULT_EXPR:
3004 return build_one_cst (scalar_type);
3005
3006 case BIT_AND_EXPR:
3007 return build_all_ones_cst (scalar_type);
3008
3009 case MAX_EXPR:
3010 case MIN_EXPR:
3011 /* For MIN/MAX the initial values are neutral. A reduction chain
3012 has only a single initial value, so that value is neutral for
3013 all statements. */
3014 if (reduc_chain)
3015 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3016 loop_preheader_edge (loop));
3017 return NULL_TREE;
3018
3019 default:
3020 return NULL_TREE;
3021 }
3022 }
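/* The neutral value is whatever extra vector elements can be filled with
   without changing the reduction result.  A hypothetical example for
   PLUS_EXPR, padding a two-element SLP sum out to four lanes:

     s = (a[0] + a[1]) + (0 + 0);   // same value as a[0] + a[1]

   Likewise 1 is neutral for MULT_EXPR and an all-ones value for
   BIT_AND_EXPR.  MIN and MAX have no such constant, which is why only a
   reduction chain (reusing its single initial value) gets a neutral
   operand for them.  */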
3023
3024 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3025 STMT is printed with a message MSG. */
3026
3027 static void
3028 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3029 {
3030 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3031 }
3032
3033 /* Return true if we need an in-order reduction for operation CODE
 3034    on type TYPE.  */
3036
3037 bool
3038 needs_fold_left_reduction_p (tree type, tree_code code)
3039 {
3040 /* CHECKME: check for !flag_finite_math_only too? */
3041 if (SCALAR_FLOAT_TYPE_P (type))
3042 switch (code)
3043 {
3044 case MIN_EXPR:
3045 case MAX_EXPR:
3046 return false;
3047
3048 default:
3049 return !flag_associative_math;
3050 }
3051
3052 if (INTEGRAL_TYPE_P (type))
3053 {
3054 if (!operation_no_trapping_overflow (type, code))
3055 return true;
3056 return false;
3057 }
3058
3059 if (SAT_FIXED_POINT_TYPE_P (type))
3060 return true;
3061
3062 return false;
3063 }
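/* For instance, with the default flags a float accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   needs an in-order (fold-left) reduction, because reassociating the
   additions can change the rounded result; the same loop compiled with
   -fassociative-math, or an equivalent loop over a plain int accumulator,
   can be reduced in any order.  */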
3064
 3065 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
 3066    has a handled computation expression.  Store the main reduction
3067 operation in *CODE. */
3068
3069 static bool
3070 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3071 tree loop_arg, enum tree_code *code,
3072 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3073 {
3074 auto_bitmap visited;
3075 tree lookfor = PHI_RESULT (phi);
3076 ssa_op_iter curri;
3077 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3078 while (USE_FROM_PTR (curr) != loop_arg)
3079 curr = op_iter_next_use (&curri);
3080 curri.i = curri.numops;
3081 do
3082 {
3083 path.safe_push (std::make_pair (curri, curr));
3084 tree use = USE_FROM_PTR (curr);
3085 if (use == lookfor)
3086 break;
3087 gimple *def = SSA_NAME_DEF_STMT (use);
3088 if (gimple_nop_p (def)
3089 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3090 {
3091 pop:
3092 do
3093 {
3094 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3095 curri = x.first;
3096 curr = x.second;
3097 do
3098 curr = op_iter_next_use (&curri);
3099 /* Skip already visited or non-SSA operands (from iterating
3100 over PHI args). */
3101 while (curr != NULL_USE_OPERAND_P
3102 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3103 || ! bitmap_set_bit (visited,
3104 SSA_NAME_VERSION
3105 (USE_FROM_PTR (curr)))));
3106 }
3107 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3108 if (curr == NULL_USE_OPERAND_P)
3109 break;
3110 }
3111 else
3112 {
3113 if (gimple_code (def) == GIMPLE_PHI)
3114 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3115 else
3116 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3117 while (curr != NULL_USE_OPERAND_P
3118 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3119 || ! bitmap_set_bit (visited,
3120 SSA_NAME_VERSION
3121 (USE_FROM_PTR (curr)))))
3122 curr = op_iter_next_use (&curri);
3123 if (curr == NULL_USE_OPERAND_P)
3124 goto pop;
3125 }
3126 }
3127 while (1);
3128 if (dump_file && (dump_flags & TDF_DETAILS))
3129 {
3130 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3131 unsigned i;
3132 std::pair<ssa_op_iter, use_operand_p> *x;
3133 FOR_EACH_VEC_ELT (path, i, x)
3134 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3135 dump_printf (MSG_NOTE, "\n");
3136 }
3137
3138 /* Check whether the reduction path detected is valid. */
3139 bool fail = path.length () == 0;
3140 bool neg = false;
3141 int sign = -1;
3142 *code = ERROR_MARK;
3143 for (unsigned i = 1; i < path.length (); ++i)
3144 {
3145 gimple *use_stmt = USE_STMT (path[i].second);
3146 tree op = USE_FROM_PTR (path[i].second);
3147 if (! is_gimple_assign (use_stmt)
 3148 	  /* The following makes sure we can compute the operand index
 3149 	     easily, plus it mostly disallows chaining via COND_EXPR condition
3150 operands. */
3151 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3152 && (gimple_num_ops (use_stmt) <= 2
3153 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3154 && (gimple_num_ops (use_stmt) <= 3
3155 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3156 {
3157 fail = true;
3158 break;
3159 }
 3160       /* Check that the op is used in only a single stmt inside
 3161          the loop.  */
3162 imm_use_iterator imm_iter;
3163 gimple *op_use_stmt;
3164 unsigned cnt = 0;
3165 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3166 if (!is_gimple_debug (op_use_stmt)
3167 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3168 {
3169 /* We want to allow x + x but not x < 1 ? x : 2. */
3170 if (is_gimple_assign (op_use_stmt)
3171 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3172 {
3173 use_operand_p use_p;
3174 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3175 cnt++;
3176 }
3177 else
3178 cnt++;
3179 }
3180 if (cnt != 1)
3181 {
3182 fail = true;
3183 break;
3184 }
3185 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3186 if (use_code == MINUS_EXPR)
3187 {
3188 use_code = PLUS_EXPR;
3189 /* Track whether we negate the reduction value each iteration. */
3190 if (gimple_assign_rhs2 (use_stmt) == op)
3191 neg = ! neg;
3192 }
3193 if (CONVERT_EXPR_CODE_P (use_code)
3194 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3195 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3196 ;
3197 else if (*code == ERROR_MARK)
3198 {
3199 *code = use_code;
3200 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3201 }
3202 else if (use_code != *code)
3203 {
3204 fail = true;
3205 break;
3206 }
3207 else if ((use_code == MIN_EXPR
3208 || use_code == MAX_EXPR)
3209 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3210 {
3211 fail = true;
3212 break;
3213 }
3214 }
3215 return ! fail && ! neg && *code != ERROR_MARK;
3216 }
3217
3218 bool
3219 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3220 tree loop_arg, enum tree_code code)
3221 {
3222 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3223 enum tree_code code_;
3224 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3225 && code_ == code);
3226 }
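/* A sketch of the kind of path the walk above produces, using illustrative
   SSA names for a simple sum reduction:

     s_1 = PHI <s_0(preheader), s_3(latch)>
     t_2 = s_1 + a_val;
     s_3 = t_2 + b_val;

   The path from the latch definition s_3 back to the PHI result s_1 is
   "s_3 t_2 s_1"; every statement on it uses PLUS_EXPR and each
   intermediate value has exactly one use inside the loop, so *code is set
   to PLUS_EXPR.  A path mixing PLUS_EXPR with MULT_EXPR, or one where t_2
   had a second in-loop use, is rejected.  */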
3227
3228
3229
3230 /* Function vect_is_simple_reduction
3231
3232 (1) Detect a cross-iteration def-use cycle that represents a simple
3233 reduction computation. We look for the following pattern:
3234
3235 loop_header:
3236 a1 = phi < a0, a2 >
3237 a3 = ...
3238 a2 = operation (a3, a1)
3239
3240 or
3241
3242 a3 = ...
3243 loop_header:
3244 a1 = phi < a0, a2 >
3245 a2 = operation (a3, a1)
3246
3247 such that:
3248 1. operation is commutative and associative and it is safe to
3249 change the order of the computation
3250 2. no uses for a2 in the loop (a2 is used out of the loop)
3251 3. no uses of a1 in the loop besides the reduction operation
3252 4. no uses of a1 outside the loop.
3253
3254 Conditions 1,4 are tested here.
3255 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3256
3257 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3258 nested cycles.
3259
3260 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3261 reductions:
3262
3263 a1 = phi < a0, a2 >
3264 inner loop (def of a3)
3265 a2 = phi < a3 >
3266
 3267    (4) Detect condition expressions, i.e.:
3268 for (int i = 0; i < N; i++)
3269 if (a[i] < val)
3270 ret_val = a[i];
3271
3272 */
3273
3274 static stmt_vec_info
3275 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3276 bool *double_reduc, bool *reduc_chain_p)
3277 {
3278 gphi *phi = as_a <gphi *> (phi_info->stmt);
3279 gimple *phi_use_stmt = NULL;
3280 imm_use_iterator imm_iter;
3281 use_operand_p use_p;
3282
3283 *double_reduc = false;
3284 *reduc_chain_p = false;
3285 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3286
3287 tree phi_name = PHI_RESULT (phi);
3288 /* ??? If there are no uses of the PHI result the inner loop reduction
3289 won't be detected as possibly double-reduction by vectorizable_reduction
3290 because that tries to walk the PHI arg from the preheader edge which
3291 can be constant. See PR60382. */
3292 if (has_zero_uses (phi_name))
3293 return NULL;
3294 class loop *loop = (gimple_bb (phi))->loop_father;
3295 unsigned nphi_def_loop_uses = 0;
3296 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3297 {
3298 gimple *use_stmt = USE_STMT (use_p);
3299 if (is_gimple_debug (use_stmt))
3300 continue;
3301
3302 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3303 {
3304 if (dump_enabled_p ())
3305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3306 "intermediate value used outside loop.\n");
3307
3308 return NULL;
3309 }
3310
3311 nphi_def_loop_uses++;
3312 phi_use_stmt = use_stmt;
3313 }
3314
3315 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3316 if (TREE_CODE (latch_def) != SSA_NAME)
3317 {
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "reduction: not ssa_name: %T\n", latch_def);
3321 return NULL;
3322 }
3323
3324 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3325 if (!def_stmt_info
3326 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3327 return NULL;
3328
3329 bool nested_in_vect_loop
3330 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3331 unsigned nlatch_def_loop_uses = 0;
3332 auto_vec<gphi *, 3> lcphis;
3333 bool inner_loop_of_double_reduc = false;
3334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3335 {
3336 gimple *use_stmt = USE_STMT (use_p);
3337 if (is_gimple_debug (use_stmt))
3338 continue;
3339 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3340 nlatch_def_loop_uses++;
3341 else
3342 {
3343 /* We can have more than one loop-closed PHI. */
3344 lcphis.safe_push (as_a <gphi *> (use_stmt));
3345 if (nested_in_vect_loop
3346 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3347 == vect_double_reduction_def))
3348 inner_loop_of_double_reduc = true;
3349 }
3350 }
3351
3352 /* If we are vectorizing an inner reduction we execute it in the
3353 original order only when we are not dealing with a double
3354 reduction. */
3355 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3356 {
3357 if (dump_enabled_p ())
3358 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3359 "detected nested cycle: ");
3360 return def_stmt_info;
3361 }
3362
3363 /* If this isn't a nested cycle or if the nested cycle reduction value
3364 is used outside of the inner loop we cannot handle uses of the reduction
3365 value. */
3366 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3367 {
3368 if (dump_enabled_p ())
3369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3370 "reduction used in loop.\n");
3371 return NULL;
3372 }
3373
3374 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3375 defined in the inner loop. */
3376 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3377 {
3378 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3379 if (gimple_phi_num_args (def_stmt) != 1
3380 || TREE_CODE (op1) != SSA_NAME)
3381 {
3382 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3384 "unsupported phi node definition.\n");
3385
3386 return NULL;
3387 }
3388
3389 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3390 if (gimple_bb (def1)
3391 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3392 && loop->inner
3393 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3394 && is_gimple_assign (def1)
3395 && is_a <gphi *> (phi_use_stmt)
3396 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3397 {
3398 if (dump_enabled_p ())
3399 report_vect_op (MSG_NOTE, def_stmt,
3400 "detected double reduction: ");
3401
3402 *double_reduc = true;
3403 return def_stmt_info;
3404 }
3405
3406 return NULL;
3407 }
3408
3409 /* Look for the expression computing latch_def from the loop PHI result. */
3410 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3411 enum tree_code code;
3412 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3413 path))
3414 {
3415 STMT_VINFO_REDUC_CODE (phi_info) = code;
3416 if (code == COND_EXPR && !nested_in_vect_loop)
3417 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3418
3419 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3420 reduction chain for which the additional restriction is that
3421 all operations in the chain are the same. */
3422 auto_vec<stmt_vec_info, 8> reduc_chain;
3423 unsigned i;
3424 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3425 for (i = path.length () - 1; i >= 1; --i)
3426 {
3427 gimple *stmt = USE_STMT (path[i].second);
3428 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3429 STMT_VINFO_REDUC_IDX (stmt_info)
3430 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3431 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3432 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3433 && (i == 1 || i == path.length () - 1));
3434 if ((stmt_code != code && !leading_conversion)
3435 /* We can only handle the final value in epilogue
3436 generation for reduction chains. */
3437 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3438 is_slp_reduc = false;
3439 /* For reduction chains we support trailing/leading
3440 conversions. We do not store those in the actual chain. */
3441 if (leading_conversion)
3442 continue;
3443 reduc_chain.safe_push (stmt_info);
3444 }
3445 if (is_slp_reduc && reduc_chain.length () > 1)
3446 {
3447 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3448 {
3449 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3450 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3451 }
3452 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3453 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3454
3455 /* Save the chain for further analysis in SLP detection. */
3456 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3457 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3458
3459 *reduc_chain_p = true;
3460 if (dump_enabled_p ())
3461 dump_printf_loc (MSG_NOTE, vect_location,
3462 "reduction: detected reduction chain\n");
3463 }
3464 else if (dump_enabled_p ())
3465 dump_printf_loc (MSG_NOTE, vect_location,
3466 "reduction: detected reduction\n");
3467
3468 return def_stmt_info;
3469 }
3470
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "reduction: unknown pattern\n");
3474
3475 return NULL;
3476 }
3477
3478 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3479 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3480 or -1 if not known. */
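/* For instance (numbers chosen only for illustration), with a costing
   vectorization factor of 8, known niters = 103 and
   peel_iters_prologue = 3 this yields (103 - 3) % 8 = 4 epilogue
   iterations, and a full 8 when peeling for gaps is required but that
   remainder would otherwise be zero.  */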
3481
3482 static int
3483 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3484 {
3485 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3486 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3487 {
3488 if (dump_enabled_p ())
3489 dump_printf_loc (MSG_NOTE, vect_location,
3490 "cost model: epilogue peel iters set to vf/2 "
3491 "because loop iterations are unknown .\n");
3492 return assumed_vf / 2;
3493 }
3494 else
3495 {
3496 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3497 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3498 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3499 /* If we need to peel for gaps, but no epilogue peeling would otherwise
3500 be required, we have to peel VF iterations. */
3501 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3502 peel_iters_epilogue = assumed_vf;
3503 return peel_iters_epilogue;
3504 }
3505 }
3506
3507 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3508 int
3509 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3510 int *peel_iters_epilogue,
3511 stmt_vector_for_cost *scalar_cost_vec,
3512 stmt_vector_for_cost *prologue_cost_vec,
3513 stmt_vector_for_cost *epilogue_cost_vec)
3514 {
3515 int retval = 0;
3516
3517 *peel_iters_epilogue
3518 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3519
3520 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3521 {
3522 /* If peeled iterations are known but the number of scalar loop
3523 iterations is unknown, count a taken branch per peeled loop. */
3524 if (peel_iters_prologue > 0)
3525 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3526 NULL, NULL_TREE, 0, vect_prologue);
3527 if (*peel_iters_epilogue > 0)
3528 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3529 NULL, NULL_TREE, 0, vect_epilogue);
3530 }
3531
3532 stmt_info_for_cost *si;
3533 int j;
3534 if (peel_iters_prologue)
3535 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3536 retval += record_stmt_cost (prologue_cost_vec,
3537 si->count * peel_iters_prologue,
3538 si->kind, si->stmt_info, si->misalign,
3539 vect_prologue);
3540 if (*peel_iters_epilogue)
3541 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3542 retval += record_stmt_cost (epilogue_cost_vec,
3543 si->count * *peel_iters_epilogue,
3544 si->kind, si->stmt_info, si->misalign,
3545 vect_epilogue);
3546
3547 return retval;
3548 }
3549
3550 /* Function vect_estimate_min_profitable_iters
3551
3552 Return the number of iterations required for the vector version of the
3553 loop to be profitable relative to the cost of the scalar version of the
3554 loop.
3555
3556 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3557 of iterations for vectorization. -1 value means loop vectorization
3558 is not profitable. This returned value may be used for dynamic
3559 profitability check.
3560
3561 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3562 for static check against estimated number of iterations. */
3563
3564 static void
3565 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3566 int *ret_min_profitable_niters,
3567 int *ret_min_profitable_estimate)
3568 {
3569 int min_profitable_iters;
3570 int min_profitable_estimate;
3571 int peel_iters_prologue;
3572 int peel_iters_epilogue;
3573 unsigned vec_inside_cost = 0;
3574 int vec_outside_cost = 0;
3575 unsigned vec_prologue_cost = 0;
3576 unsigned vec_epilogue_cost = 0;
3577 int scalar_single_iter_cost = 0;
3578 int scalar_outside_cost = 0;
3579 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3580 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3581 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3582
3583 /* Cost model disabled. */
3584 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3585 {
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3588 *ret_min_profitable_niters = 0;
3589 *ret_min_profitable_estimate = 0;
3590 return;
3591 }
3592
3593 /* Requires loop versioning tests to handle misalignment. */
3594 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3595 {
3596 /* FIXME: Make cost depend on complexity of individual check. */
3597 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3598 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3599 NULL, NULL_TREE, 0, vect_prologue);
3600 if (dump_enabled_p ())
3601 dump_printf (MSG_NOTE,
3602 "cost model: Adding cost of checks for loop "
3603 "versioning to treat misalignment.\n");
3604 }
3605
3606 /* Requires loop versioning with alias checks. */
3607 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3608 {
3609 /* FIXME: Make cost depend on complexity of individual check. */
3610 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3611 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3612 NULL, NULL_TREE, 0, vect_prologue);
3613 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3614 if (len)
3615 /* Count LEN - 1 ANDs and LEN comparisons. */
3616 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3617 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3618 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3619 if (len)
3620 {
3621 /* Count LEN - 1 ANDs and LEN comparisons. */
3622 unsigned int nstmts = len * 2 - 1;
3623 /* +1 for each bias that needs adding. */
3624 for (unsigned int i = 0; i < len; ++i)
3625 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3626 nstmts += 1;
3627 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3628 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3629 }
3630 if (dump_enabled_p ())
3631 dump_printf (MSG_NOTE,
3632 "cost model: Adding cost of checks for loop "
3633 "versioning aliasing.\n");
3634 }
3635
3636 /* Requires loop versioning with niter checks. */
3637 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3638 {
3639 /* FIXME: Make cost depend on complexity of individual check. */
3640 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3641 NULL, NULL_TREE, 0, vect_prologue);
3642 if (dump_enabled_p ())
3643 dump_printf (MSG_NOTE,
3644 "cost model: Adding cost of checks for loop "
3645 "versioning niters.\n");
3646 }
3647
3648 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3649 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3650 NULL, NULL_TREE, 0, vect_prologue);
3651
3652 /* Count statements in scalar loop. Using this as scalar cost for a single
3653 iteration for now.
3654
3655 TODO: Add outer loop support.
3656
3657 TODO: Consider assigning different costs to different scalar
3658 statements. */
3659
3660 scalar_single_iter_cost
3661 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3662
3663 /* Add additional cost for the peeled instructions in prologue and epilogue
3664 loop. (For fully-masked loops there will be no peeling.)
3665
3666 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3667 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3668
3669 TODO: Build an expression that represents peel_iters for prologue and
3670 epilogue to be used in a run-time test. */
3671
3672 bool prologue_need_br_taken_cost = false;
3673 bool prologue_need_br_not_taken_cost = false;
3674
3675 /* Calculate peel_iters_prologue. */
3676 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3677 peel_iters_prologue = 0;
3678 else if (npeel < 0)
3679 {
3680 peel_iters_prologue = assumed_vf / 2;
3681 if (dump_enabled_p ())
3682 dump_printf (MSG_NOTE, "cost model: "
3683 "prologue peel iters set to vf/2.\n");
3684
3685 /* If peeled iterations are unknown, count a taken branch and a not taken
3686 branch per peeled loop. Even if scalar loop iterations are known,
3687 vector iterations are not known since peeled prologue iterations are
3688 not known. Hence guards remain the same. */
3689 prologue_need_br_taken_cost = true;
3690 prologue_need_br_not_taken_cost = true;
3691 }
3692 else
3693 {
3694 peel_iters_prologue = npeel;
3695 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3696 /* If peeled iterations are known but the number of scalar loop
3697 iterations is unknown, count a taken branch per peeled loop. */
3698 prologue_need_br_taken_cost = true;
3699 }
3700
3701 bool epilogue_need_br_taken_cost = false;
3702 bool epilogue_need_br_not_taken_cost = false;
3703
3704 /* Calculate peel_iters_epilogue. */
3705 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3706 /* We need to peel exactly one iteration for gaps. */
3707 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3708 else if (npeel < 0)
3709 {
3710 /* If peeling for alignment is unknown, the loop bound of the main
3711 loop becomes unknown. */
3712 peel_iters_epilogue = assumed_vf / 2;
3713 if (dump_enabled_p ())
3714 dump_printf (MSG_NOTE, "cost model: "
3715 "epilogue peel iters set to vf/2 because "
3716 "peeling for alignment is unknown.\n");
3717
3718 /* See the same reason above in peel_iters_prologue calculation. */
3719 epilogue_need_br_taken_cost = true;
3720 epilogue_need_br_not_taken_cost = true;
3721 }
3722 else
3723 {
3724 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3725 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3726 /* If peeled iterations are known but the number of scalar loop
3727 iterations is unknown, count a taken branch per peeled loop. */
3728 epilogue_need_br_taken_cost = true;
3729 }
3730
3731 stmt_info_for_cost *si;
3732 int j;
3733 /* Add costs associated with peel_iters_prologue. */
3734 if (peel_iters_prologue)
3735 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3736 {
3737 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3738 si->count * peel_iters_prologue, si->kind,
3739 si->stmt_info, si->vectype, si->misalign,
3740 vect_prologue);
3741 }
3742
3743 /* Add costs associated with peel_iters_epilogue. */
3744 if (peel_iters_epilogue)
3745 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3746 {
3747 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3748 si->count * peel_iters_epilogue, si->kind,
3749 si->stmt_info, si->vectype, si->misalign,
3750 vect_epilogue);
3751 }
3752
3753 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3754
3755 if (prologue_need_br_taken_cost)
3756 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3757 NULL, NULL_TREE, 0, vect_prologue);
3758
3759 if (prologue_need_br_not_taken_cost)
3760 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3761 cond_branch_not_taken, NULL, NULL_TREE, 0,
3762 vect_prologue);
3763
3764 if (epilogue_need_br_taken_cost)
3765 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3766 NULL, NULL_TREE, 0, vect_epilogue);
3767
3768 if (epilogue_need_br_not_taken_cost)
3769 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3770 cond_branch_not_taken, NULL, NULL_TREE, 0,
3771 vect_epilogue);
3772
3773 /* Take care of special costs for rgroup controls of partial vectors. */
3774 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3775 {
3776 /* Calculate how many masks we need to generate. */
3777 unsigned int num_masks = 0;
3778 rgroup_controls *rgm;
3779 unsigned int num_vectors_m1;
3780 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3781 if (rgm->type)
3782 num_masks += num_vectors_m1 + 1;
3783 gcc_assert (num_masks > 0);
3784
3785 /* In the worst case, we need to generate each mask in the prologue
3786 and in the loop body. One of the loop body mask instructions
3787 replaces the comparison in the scalar loop, and since we don't
3788 count the scalar comparison against the scalar body, we shouldn't
3789 count that vector instruction against the vector body either.
3790
3791 Sometimes we can use unpacks instead of generating prologue
3792 masks and sometimes the prologue mask will fold to a constant,
3793 so the actual prologue cost might be smaller. However, it's
3794 simpler and safer to use the worst-case cost; if this ends up
3795 being the tie-breaker between vectorizing or not, then it's
3796 probably better not to vectorize. */
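/* For example (hypothetical rgroups), one rgroup needing a single mask
   vector and another needing two gives num_masks = 3, so three mask
   statements are costed against the prologue and two against the loop
   body below.  */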
3797 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3798 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3799 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3800 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3801 }
3802 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3803 {
3804 /* Referring to the functions vect_set_loop_condition_partial_vectors
3805 and vect_set_loop_controls_directly, we need to generate each
3806 length in the prologue and in the loop body if required. Although
3807 there are some possible optimizations, we consider the worst case
3808 here. */
3809
3810 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3811 bool need_iterate_p
3812 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3813 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3814
3815 /* Calculate how many statements to be added. */
3816 unsigned int prologue_stmts = 0;
3817 unsigned int body_stmts = 0;
3818
3819 rgroup_controls *rgc;
3820 unsigned int num_vectors_m1;
3821 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3822 if (rgc->type)
3823 {
3824 /* May need one SHIFT for nitems_total computation. */
3825 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3826 if (nitems != 1 && !niters_known_p)
3827 prologue_stmts += 1;
3828
3829 /* May need one MAX and one MINUS for wrap around. */
3830 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3831 prologue_stmts += 2;
3832
3833 /* Need one MAX and one MINUS for each batch limit except for
3834 the first one. */
3835 prologue_stmts += num_vectors_m1 * 2;
3836
3837 unsigned int num_vectors = num_vectors_m1 + 1;
3838
3839 /* Need to set up lengths in prologue, only one MIN required
3840 for each since start index is zero. */
3841 prologue_stmts += num_vectors;
3842
3843 /* Each may need two MINs and one MINUS to update lengths in body
3844 for next iteration. */
3845 if (need_iterate_p)
3846 body_stmts += 3 * num_vectors;
3847 }
3848
3849 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3850 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3851 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3852 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
3853 }
3854
3855 /* FORNOW: The scalar outside cost is incremented in one of the
3856 following ways:
3857
3858 1. The vectorizer checks for alignment and aliasing and generates
3859 a condition that allows dynamic vectorization. A cost model
3860 check is ANDed with the versioning condition. Hence the scalar code
3861 path now has the added cost of the versioning check.
3862
3863 if (cost > th & versioning_check)
3864 jmp to vector code
3865
3866 Hence run-time scalar is incremented by not-taken branch cost.
3867
3868 2. The vectorizer then checks if a prologue is required. If the
3869 cost model check was not done before during versioning, it has to
3870 be done before the prologue check.
3871
3872 if (cost <= th)
3873 prologue = scalar_iters
3874 if (prologue == 0)
3875 jmp to vector code
3876 else
3877 execute prologue
3878 if (prologue == num_iters)
3879 go to exit
3880
3881 Hence the run-time scalar cost is incremented by a taken branch,
3882 plus a not-taken branch, plus a taken branch cost.
3883
3884 3. The vectorizer then checks if an epilogue is required. If the
3885 cost model check was not done before during prologue check, it
3886 has to be done with the epilogue check.
3887
3888 if (prologue == 0)
3889 jmp to vector code
3890 else
3891 execute prologue
3892 if (prologue == num_iters)
3893 go to exit
3894 vector code:
3895 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3896 jmp to epilogue
3897
3898 Hence the run-time scalar cost should be incremented by 2 taken
3899 branches.
3900
3901 TODO: The back end may reorder the BBs differently and reverse
3902 conditions/branch directions. Change the estimates below to
3903 something more reasonable. */
3904
3905 /* If the number of iterations is known and we do not do versioning, we can
3906 decide whether to vectorize at compile time. Hence the scalar version
3907 does not carry cost model guard costs. */
3908 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3909 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3910 {
3911 /* Cost model check occurs at versioning. */
3912 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3913 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3914 else
3915 {
3916 /* Cost model check occurs at prologue generation. */
3917 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3918 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3919 + vect_get_stmt_cost (cond_branch_not_taken);
3920 /* Cost model check occurs at epilogue generation. */
3921 else
3922 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3923 }
3924 }
3925
3926 /* Complete the target-specific cost calculations. */
3927 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3928 &vec_inside_cost, &vec_epilogue_cost);
3929
3930 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3931
3932 /* Stash the costs so that we can compare two loop_vec_infos. */
3933 loop_vinfo->vec_inside_cost = vec_inside_cost;
3934 loop_vinfo->vec_outside_cost = vec_outside_cost;
3935
3936 if (dump_enabled_p ())
3937 {
3938 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3939 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3940 vec_inside_cost);
3941 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3942 vec_prologue_cost);
3943 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3944 vec_epilogue_cost);
3945 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3946 scalar_single_iter_cost);
3947 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3948 scalar_outside_cost);
3949 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3950 vec_outside_cost);
3951 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3952 peel_iters_prologue);
3953 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3954 peel_iters_epilogue);
3955 }
3956
3957 /* Calculate number of iterations required to make the vector version
3958 profitable, relative to the loop bodies only. The following condition
3959 must hold true:
3960 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3961 where
3962 SIC = scalar iteration cost, VIC = vector iteration cost,
3963 VOC = vector outside cost, VF = vectorization factor,
3964 NPEEL = prologue iterations + epilogue iterations,
3965 SOC = scalar outside cost for run time cost model check. */
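/* As an illustrative instance of the inequality above (all costs are
   made up): with SIC = 4, VIC = 12, VOC = 20, SOC = 6, VF = 4 and
   NPEEL = 2, and ignoring the flooring of the division, it becomes

     4 * niters + 6 > 3 * (niters - 2) + 20

   which first holds for niters = 9; the code below derives the exact
   threshold using integer arithmetic and the peeling split computed
   above.  */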
3966
3967 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3968 - vec_inside_cost);
3969 if (saving_per_viter <= 0)
3970 {
3971 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3972 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3973 "vectorization did not happen for a simd loop");
3974
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 "cost model: the vector iteration cost = %d "
3978 "divided by the scalar iteration cost = %d "
3979 "is greater or equal to the vectorization factor = %d"
3980 ".\n",
3981 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3982 *ret_min_profitable_niters = -1;
3983 *ret_min_profitable_estimate = -1;
3984 return;
3985 }
3986
3987 /* ??? The "if" arm is written to handle all cases; see below for what
3988 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
3989 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3990 {
3991 /* Rewriting the condition above in terms of the number of
3992 vector iterations (vniters) rather than the number of
3993 scalar iterations (niters) gives:
3994
3995 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3996
3997 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3998
3999 For integer N, X and Y when X > 0:
4000
4001 N * X > Y <==> N >= (Y /[floor] X) + 1. */
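/* E.g. with X = 5 and Y = 12 (arbitrary values): floor (12 / 5) + 1 = 3,
   and indeed 3 * 5 > 12 while 2 * 5 is not.  */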
4002 int outside_overhead = (vec_outside_cost
4003 - scalar_single_iter_cost * peel_iters_prologue
4004 - scalar_single_iter_cost * peel_iters_epilogue
4005 - scalar_outside_cost);
4006 /* We're only interested in cases that require at least one
4007 vector iteration. */
4008 int min_vec_niters = 1;
4009 if (outside_overhead > 0)
4010 min_vec_niters = outside_overhead / saving_per_viter + 1;
4011
4012 if (dump_enabled_p ())
4013 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4014 min_vec_niters);
4015
4016 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4017 {
4018 /* Now that we know the minimum number of vector iterations,
4019 find the minimum niters for which the scalar cost is larger:
4020
4021 SIC * niters > VIC * vniters + VOC - SOC
4022
4023 We know that the minimum niters is no more than
4024 vniters * VF + NPEEL, but it might be (and often is) less
4025 than that if a partial vector iteration is cheaper than the
4026 equivalent scalar code. */
4027 int threshold = (vec_inside_cost * min_vec_niters
4028 + vec_outside_cost
4029 - scalar_outside_cost);
4030 if (threshold <= 0)
4031 min_profitable_iters = 1;
4032 else
4033 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4034 }
4035 else
4036 /* Convert the number of vector iterations into a number of
4037 scalar iterations. */
4038 min_profitable_iters = (min_vec_niters * assumed_vf
4039 + peel_iters_prologue
4040 + peel_iters_epilogue);
4041 }
4042 else
4043 {
4044 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4045 * assumed_vf
4046 - vec_inside_cost * peel_iters_prologue
4047 - vec_inside_cost * peel_iters_epilogue);
4048 if (min_profitable_iters <= 0)
4049 min_profitable_iters = 0;
4050 else
4051 {
4052 min_profitable_iters /= saving_per_viter;
4053
4054 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4055 <= (((int) vec_inside_cost * min_profitable_iters)
4056 + (((int) vec_outside_cost - scalar_outside_cost)
4057 * assumed_vf)))
4058 min_profitable_iters++;
4059 }
4060 }
4061
4062 if (dump_enabled_p ())
4063 dump_printf (MSG_NOTE,
4064 " Calculated minimum iters for profitability: %d\n",
4065 min_profitable_iters);
4066
4067 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4068 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4069 /* We want the vectorized loop to execute at least once. */
4070 min_profitable_iters = assumed_vf + peel_iters_prologue;
4071 else if (min_profitable_iters < peel_iters_prologue)
4072 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4073 vectorized loop executes at least once. */
4074 min_profitable_iters = peel_iters_prologue;
4075
4076 if (dump_enabled_p ())
4077 dump_printf_loc (MSG_NOTE, vect_location,
4078 " Runtime profitability threshold = %d\n",
4079 min_profitable_iters);
4080
4081 *ret_min_profitable_niters = min_profitable_iters;
4082
4083 /* Calculate number of iterations required to make the vector version
4084 profitable, relative to the loop bodies only.
4085
4086 The non-vectorized variant is SIC * niters and it must win over the
4087 vector variant on the expected loop trip count. The following condition must hold true:
4088 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4089
4090 if (vec_outside_cost <= 0)
4091 min_profitable_estimate = 0;
4092 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4093 {
4094 /* This is a repeat of the code above, but with + SOC rather
4095 than - SOC. */
4096 int outside_overhead = (vec_outside_cost
4097 - scalar_single_iter_cost * peel_iters_prologue
4098 - scalar_single_iter_cost * peel_iters_epilogue
4099 + scalar_outside_cost);
4100 int min_vec_niters = 1;
4101 if (outside_overhead > 0)
4102 min_vec_niters = outside_overhead / saving_per_viter + 1;
4103
4104 int threshold = (vec_inside_cost * min_vec_niters
4105 + vec_outside_cost
4106 + scalar_outside_cost);
4107 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4108 }
4109 else
4110 {
4111 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4112 * assumed_vf
4113 - vec_inside_cost * peel_iters_prologue
4114 - vec_inside_cost * peel_iters_epilogue)
4115 / ((scalar_single_iter_cost * assumed_vf)
4116 - vec_inside_cost);
4117 }
4118 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_NOTE, vect_location,
4121 " Static estimate profitability threshold = %d\n",
4122 min_profitable_estimate);
4123
4124 *ret_min_profitable_estimate = min_profitable_estimate;
4125 }
4126
4127 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4128 vector elements (not bits) for a vector with NELT elements. */
4129 static void
4130 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4131 vec_perm_builder *sel)
4132 {
4133 /* The encoding is a single stepped pattern. Any wrap-around is handled
4134 by vec_perm_indices. */
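/* For instance (purely illustrative), OFFSET = 2 and NELT = 8 pushes
   the stepped selector {2, 3, 4}, which vec_perm_indices expands to
   {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the second
   input vector of the permutation, so the result is the original
   vector shifted down by two elements.  */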
4135 sel->new_vector (nelt, 1, 3);
4136 for (unsigned int i = 0; i < 3; i++)
4137 sel->quick_push (i + offset);
4138 }
4139
4140 /* Checks whether the target supports whole-vector shifts for vectors of mode
4141 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4142 it supports vec_perm_const with masks for all necessary shift amounts. */
4143 static bool
4144 have_whole_vector_shift (machine_mode mode)
4145 {
4146 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4147 return true;
4148
4149 /* Variable-length vectors should be handled via the optab. */
4150 unsigned int nelt;
4151 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4152 return false;
4153
4154 vec_perm_builder sel;
4155 vec_perm_indices indices;
4156 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4157 {
4158 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4159 indices.new_vector (sel, 2, nelt);
4160 if (!can_vec_perm_const_p (mode, indices, false))
4161 return false;
4162 }
4163 return true;
4164 }
4165
4166 /* TODO: There is a close dependency between the vect_model_*_cost and
4167 vectorizable_* functions. Design this better to avoid maintenance issues. */
4168
4169 /* Function vect_model_reduction_cost.
4170
4171 Models cost for a reduction operation, including the vector ops
4172 generated within the strip-mine loop, the initial definition before
4173 the loop, and the epilogue code that must be generated. */
4174
4175 static void
4176 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4177 stmt_vec_info stmt_info, internal_fn reduc_fn,
4178 vect_reduction_type reduction_type,
4179 int ncopies, stmt_vector_for_cost *cost_vec)
4180 {
4181 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4182 enum tree_code code;
4183 optab optab;
4184 tree vectype;
4185 machine_mode mode;
4186 class loop *loop = NULL;
4187
4188 if (loop_vinfo)
4189 loop = LOOP_VINFO_LOOP (loop_vinfo);
4190
4191 /* Condition reductions generate two reductions in the loop. */
4192 if (reduction_type == COND_REDUCTION)
4193 ncopies *= 2;
4194
4195 vectype = STMT_VINFO_VECTYPE (stmt_info);
4196 mode = TYPE_MODE (vectype);
4197 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4198
4199 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4200
4201 if (reduction_type == EXTRACT_LAST_REDUCTION)
4202 /* No extra instructions are needed in the prologue. The loop body
4203 operations are costed in vectorizable_condition. */
4204 inside_cost = 0;
4205 else if (reduction_type == FOLD_LEFT_REDUCTION)
4206 {
4207 /* No extra instructions needed in the prologue. */
4208 prologue_cost = 0;
4209
4210 if (reduc_fn != IFN_LAST)
4211 /* Count one reduction-like operation per vector. */
4212 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4213 stmt_info, 0, vect_body);
4214 else
4215 {
4216 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4217 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4218 inside_cost = record_stmt_cost (cost_vec, nelements,
4219 vec_to_scalar, stmt_info, 0,
4220 vect_body);
4221 inside_cost += record_stmt_cost (cost_vec, nelements,
4222 scalar_stmt, stmt_info, 0,
4223 vect_body);
4224 }
4225 }
4226 else
4227 {
4228 /* Add in cost for initial definition.
4229 For cond reduction we have four vectors: initial index, step,
4230 initial result of the data reduction, initial value of the index
4231 reduction. */
4232 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4233 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4234 scalar_to_vec, stmt_info, 0,
4235 vect_prologue);
4236
4237 /* Cost of reduction op inside loop. */
4238 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4239 stmt_info, 0, vect_body);
4240 }
4241
4242 /* Determine cost of epilogue code.
4243
4244 We have a reduction operator that will reduce the vector in one statement.
4245 Also requires scalar extract. */
4246
4247 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4248 {
4249 if (reduc_fn != IFN_LAST)
4250 {
4251 if (reduction_type == COND_REDUCTION)
4252 {
4253 /* An EQ stmt and a COND_EXPR stmt. */
4254 epilogue_cost += record_stmt_cost (cost_vec, 2,
4255 vector_stmt, stmt_info, 0,
4256 vect_epilogue);
4257 /* Reduction of the max index and a reduction of the found
4258 values. */
4259 epilogue_cost += record_stmt_cost (cost_vec, 2,
4260 vec_to_scalar, stmt_info, 0,
4261 vect_epilogue);
4262 /* A broadcast of the max value. */
4263 epilogue_cost += record_stmt_cost (cost_vec, 1,
4264 scalar_to_vec, stmt_info, 0,
4265 vect_epilogue);
4266 }
4267 else
4268 {
4269 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4270 stmt_info, 0, vect_epilogue);
4271 epilogue_cost += record_stmt_cost (cost_vec, 1,
4272 vec_to_scalar, stmt_info, 0,
4273 vect_epilogue);
4274 }
4275 }
4276 else if (reduction_type == COND_REDUCTION)
4277 {
4278 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4279 /* Extraction of scalar elements. */
4280 epilogue_cost += record_stmt_cost (cost_vec,
4281 2 * estimated_nunits,
4282 vec_to_scalar, stmt_info, 0,
4283 vect_epilogue);
4284 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4285 epilogue_cost += record_stmt_cost (cost_vec,
4286 2 * estimated_nunits - 3,
4287 scalar_stmt, stmt_info, 0,
4288 vect_epilogue);
4289 }
4290 else if (reduction_type == EXTRACT_LAST_REDUCTION
4291 || reduction_type == FOLD_LEFT_REDUCTION)
4292 /* No extra instructions needed in the epilogue. */
4293 ;
4294 else
4295 {
4296 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4297 tree bitsize =
4298 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4299 int element_bitsize = tree_to_uhwi (bitsize);
4300 int nelements = vec_size_in_bits / element_bitsize;
4301
4302 if (code == COND_EXPR)
4303 code = MAX_EXPR;
4304
4305 optab = optab_for_tree_code (code, vectype, optab_default);
4306
4307 /* We have a whole vector shift available. */
4308 if (optab != unknown_optab
4309 && VECTOR_MODE_P (mode)
4310 && optab_handler (optab, mode) != CODE_FOR_nothing
4311 && have_whole_vector_shift (mode))
4312 {
4313 /* Final reduction via vector shifts and the reduction operator.
4314 Also requires scalar extract. */
4315 epilogue_cost += record_stmt_cost (cost_vec,
4316 exact_log2 (nelements) * 2,
4317 vector_stmt, stmt_info, 0,
4318 vect_epilogue);
4319 epilogue_cost += record_stmt_cost (cost_vec, 1,
4320 vec_to_scalar, stmt_info, 0,
4321 vect_epilogue);
4322 }
4323 else
4324 /* Use extracts and reduction op for final reduction. For N
4325 elements, we have N extracts and N-1 reduction ops. */
4326 epilogue_cost += record_stmt_cost (cost_vec,
4327 nelements + nelements - 1,
4328 vector_stmt, stmt_info, 0,
4329 vect_epilogue);
4330 }
4331 }
4332
4333 if (dump_enabled_p ())
4334 dump_printf (MSG_NOTE,
4335 "vect_model_reduction_cost: inside_cost = %d, "
4336 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4337 prologue_cost, epilogue_cost);
4338 }
4339
4340
4341 /* Function vect_model_induction_cost.
4342
4343 Models cost for induction operations. */
4344
4345 static void
4346 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4347 stmt_vector_for_cost *cost_vec)
4348 {
4349 unsigned inside_cost, prologue_cost;
4350
4351 if (PURE_SLP_STMT (stmt_info))
4352 return;
4353
4354 /* loop cost for vec_loop. */
4355 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4356 stmt_info, 0, vect_body);
4357
4358 /* prologue cost for vec_init and vec_step. */
4359 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4360 stmt_info, 0, vect_prologue);
4361
4362 if (dump_enabled_p ())
4363 dump_printf_loc (MSG_NOTE, vect_location,
4364 "vect_model_induction_cost: inside_cost = %d, "
4365 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4366 }
4367
4368
4369
4370 /* Function get_initial_def_for_reduction
4371
4372 Input:
4373 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4374 INIT_VAL - the initial value of the reduction variable
4375
4376 Output:
4377 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4378 of the reduction (used for adjusting the epilog - see below).
4379 Return a vector variable, initialized according to the operation that
4380 STMT_VINFO performs. This vector will be used as the initial value
4381 of the vector of partial results.
4382
4383 Option1 (adjust in epilog): Initialize the vector as follows:
4384 add/bit or/xor: [0,0,...,0,0]
4385 mult/bit and: [1,1,...,1,1]
4386 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4387 and when necessary (e.g. add/mult case) let the caller know
4388 that it needs to adjust the result by init_val.
4389
4390 Option2: Initialize the vector as follows:
4391 add/bit or/xor: [init_val,0,0,...,0]
4392 mult/bit and: [init_val,1,1,...,1]
4393 min/max/cond_expr: [init_val,init_val,...,init_val]
4394 and no adjustments are needed.
4395
4396 For example, for the following code:
4397
4398 s = init_val;
4399 for (i=0;i<n;i++)
4400 s = s + a[i];
4401
4402 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4403 For a vector of 4 units, we want to return either [0,0,0,init_val],
4404 or [0,0,0,0] and let the caller know that it needs to adjust
4405 the result at the end by 'init_val'.
4406
4407 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4408 is not NULL, because this way the initialization vector is simpler (the
4409 same element in all entries), and Option2 otherwise.
4410
4411 A cost model should help decide between these two schemes. */
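/* A further made-up example for a multiplication reduction:

     s = init_val;
     for (i = 0; i < n; i++)
       s = s * a[i];

   With a vector of 4 units, Option1 returns [1,1,1,1] and reports
   init_val through ADJUSTMENT_DEF so the caller multiplies the final
   result by it, whereas Option2 returns [init_val,1,1,1] and needs no
   adjustment.  */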
4412
4413 static tree
4414 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4415 stmt_vec_info stmt_vinfo,
4416 enum tree_code code, tree init_val,
4417 tree *adjustment_def)
4418 {
4419 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4420 tree scalar_type = TREE_TYPE (init_val);
4421 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4422 tree def_for_init;
4423 tree init_def;
4424 REAL_VALUE_TYPE real_init_val = dconst0;
4425 int int_init_val = 0;
4426 gimple_seq stmts = NULL;
4427
4428 gcc_assert (vectype);
4429
4430 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4431 || SCALAR_FLOAT_TYPE_P (scalar_type));
4432
4433 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4434 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4435
4436 /* ADJUSTMENT_DEF is NULL when called from
4437 vect_create_epilog_for_reduction to vectorize a double reduction. */
4438 if (adjustment_def)
4439 *adjustment_def = NULL;
4440
4441 switch (code)
4442 {
4443 case WIDEN_SUM_EXPR:
4444 case DOT_PROD_EXPR:
4445 case SAD_EXPR:
4446 case PLUS_EXPR:
4447 case MINUS_EXPR:
4448 case BIT_IOR_EXPR:
4449 case BIT_XOR_EXPR:
4450 case MULT_EXPR:
4451 case BIT_AND_EXPR:
4452 {
4453 if (code == MULT_EXPR)
4454 {
4455 real_init_val = dconst1;
4456 int_init_val = 1;
4457 }
4458
4459 if (code == BIT_AND_EXPR)
4460 int_init_val = -1;
4461
4462 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4463 def_for_init = build_real (scalar_type, real_init_val);
4464 else
4465 def_for_init = build_int_cst (scalar_type, int_init_val);
4466
4467 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4468 {
4469 /* Option1: the first element is '0' or '1' as well. */
4470 if (!operand_equal_p (def_for_init, init_val, 0))
4471 *adjustment_def = init_val;
4472 init_def = gimple_build_vector_from_val (&stmts, vectype,
4473 def_for_init);
4474 }
4475 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4476 {
4477 /* Option2 (variable length): the first element is INIT_VAL. */
4478 init_def = gimple_build_vector_from_val (&stmts, vectype,
4479 def_for_init);
4480 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4481 vectype, init_def, init_val);
4482 }
4483 else
4484 {
4485 /* Option2: the first element is INIT_VAL. */
4486 tree_vector_builder elts (vectype, 1, 2);
4487 elts.quick_push (init_val);
4488 elts.quick_push (def_for_init);
4489 init_def = gimple_build_vector (&stmts, &elts);
4490 }
4491 }
4492 break;
4493
4494 case MIN_EXPR:
4495 case MAX_EXPR:
4496 case COND_EXPR:
4497 {
4498 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4499 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4500 }
4501 break;
4502
4503 default:
4504 gcc_unreachable ();
4505 }
4506
4507 if (stmts)
4508 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4509 return init_def;
4510 }
4511
4512 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4513 NUMBER_OF_VECTORS is the number of vector defs to create.
4514 If NEUTRAL_OP is nonnull, introducing extra elements of that
4515 value will not change the result. */
4516
4517 static void
4518 get_initial_defs_for_reduction (vec_info *vinfo,
4519 slp_tree slp_node,
4520 vec<tree> *vec_oprnds,
4521 unsigned int number_of_vectors,
4522 bool reduc_chain, tree neutral_op)
4523 {
4524 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4525 stmt_vec_info stmt_vinfo = stmts[0];
4526 unsigned HOST_WIDE_INT nunits;
4527 unsigned j, number_of_places_left_in_vector;
4528 tree vector_type;
4529 unsigned int group_size = stmts.length ();
4530 unsigned int i;
4531 class loop *loop;
4532
4533 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4534
4535 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4536
4537 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4538 gcc_assert (loop);
4539 edge pe = loop_preheader_edge (loop);
4540
4541 gcc_assert (!reduc_chain || neutral_op);
4542
4543 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4544 created vectors. It is greater than 1 if unrolling is performed.
4545
4546 For example, we have two scalar operands, s1 and s2 (e.g., group of
4547 strided accesses of size two), while NUNITS is four (i.e., four scalars
4548 of this type can be packed in a vector). The output vector will contain
4549 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4550 will be 2).
4551
4552 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4553 vectors containing the operands.
4554
4555 For example, NUNITS is four as before, and the group size is 8
4556 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4557 {s5, s6, s7, s8}. */
4558
4559 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4560 nunits = group_size;
4561
4562 number_of_places_left_in_vector = nunits;
4563 bool constant_p = true;
4564 tree_vector_builder elts (vector_type, nunits, 1);
4565 elts.quick_grow (nunits);
4566 gimple_seq ctor_seq = NULL;
4567 for (j = 0; j < nunits * number_of_vectors; ++j)
4568 {
4569 tree op;
4570 i = j % group_size;
4571 stmt_vinfo = stmts[i];
4572
4573 /* Get the def before the loop. In a reduction chain we have only
4574 one initial value; otherwise we have as many as there are PHIs in the group. */
4575 if (reduc_chain)
4576 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4577 else if (((vec_oprnds->length () + 1) * nunits
4578 - number_of_places_left_in_vector >= group_size)
4579 && neutral_op)
4580 op = neutral_op;
4581 else
4582 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4583
4584 /* Create 'vect_ = {op0,op1,...,opn}'. */
4585 number_of_places_left_in_vector--;
4586 elts[nunits - number_of_places_left_in_vector - 1] = op;
4587 if (!CONSTANT_CLASS_P (op))
4588 constant_p = false;
4589
4590 if (number_of_places_left_in_vector == 0)
4591 {
4592 tree init;
4593 if (constant_p && !neutral_op
4594 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4595 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4596 /* Build the vector directly from ELTS. */
4597 init = gimple_build_vector (&ctor_seq, &elts);
4598 else if (neutral_op)
4599 {
4600 /* Build a vector of the neutral value and shift the
4601 other elements into place. */
4602 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4603 neutral_op);
4604 int k = nunits;
4605 while (k > 0 && elts[k - 1] == neutral_op)
4606 k -= 1;
4607 while (k > 0)
4608 {
4609 k -= 1;
4610 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4611 vector_type, init, elts[k]);
4612 }
4613 }
4614 else
4615 {
4616 /* First time round, duplicate ELTS to fill the
4617 required number of vectors. */
4618 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4619 number_of_vectors, *vec_oprnds);
4620 break;
4621 }
4622 vec_oprnds->quick_push (init);
4623
4624 number_of_places_left_in_vector = nunits;
4625 elts.new_vector (vector_type, nunits, 1);
4626 elts.quick_grow (nunits);
4627 constant_p = true;
4628 }
4629 }
4630 if (ctor_seq != NULL)
4631 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4632 }
4633
4634 /* For a statement STMT_INFO taking part in a reduction operation return
4635 the stmt_vec_info the meta information is stored on. */
4636
4637 stmt_vec_info
4638 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4639 {
4640 stmt_info = vect_orig_stmt (stmt_info);
4641 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4642 if (!is_a <gphi *> (stmt_info->stmt)
4643 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4644 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4645 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4646 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4647 {
4648 if (gimple_phi_num_args (phi) == 1)
4649 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4650 }
4651 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4652 {
4653 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4654 stmt_vec_info info
4655 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4656 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4657 stmt_info = info;
4658 }
4659 return stmt_info;
4660 }
4661
4662 /* Function vect_create_epilog_for_reduction
4663
4664 Create code at the loop-epilog to finalize the result of a reduction
4665 computation.
4666
4667 STMT_INFO is the scalar reduction stmt that is being vectorized.
4668 SLP_NODE is an SLP node containing a group of reduction statements. The
4669 first one in this group is STMT_INFO.
4670 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4671 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4672 (counting from 0)
4673
4674 This function:
4675 1. Completes the reduction def-use cycles.
4676 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4677 by calling the function specified by REDUC_FN if available, or by
4678 other means (whole-vector shifts or a scalar loop).
4679 The function also creates a new phi node at the loop exit to preserve
4680 loop-closed form, as illustrated below.
4681
4682 The flow at the entry to this function:
4683
4684 loop:
4685 vec_def = phi <vec_init, null> # REDUCTION_PHI
4686 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4687 s_loop = scalar_stmt # (scalar) STMT_INFO
4688 loop_exit:
4689 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4690 use <s_out0>
4691 use <s_out0>
4692
4693 The above is transformed by this function into:
4694
4695 loop:
4696 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4697 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4698 s_loop = scalar_stmt # (scalar) STMT_INFO
4699 loop_exit:
4700 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4701 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4702 v_out2 = reduce <v_out1>
4703 s_out3 = extract_field <v_out2, 0>
4704 s_out4 = adjust_result <s_out3>
4705 use <s_out4>
4706 use <s_out4>
4707 */
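/* As a concrete (made-up) instance for a plain sum reduction with
   VF = 4, the transformed epilog above amounts to:

     v_out2 = reduce <v_out1>            # adds the four partial sums
     s_out3 = extract_field <v_out2, 0>  # the scalar total
     s_out4 = s_out3 + init_val          # only if an adjustment was recorded

   with the reduce step implemented via REDUC_FN when available,
   otherwise via log2(VF) whole-vector shifts and adds, or a scalar
   loop over the lanes.  */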
4708
4709 static void
4710 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4711 stmt_vec_info stmt_info,
4712 slp_tree slp_node,
4713 slp_instance slp_node_instance)
4714 {
4715 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4716 gcc_assert (reduc_info->is_reduc_info);
4717 /* For double reductions we need to get at the inner loop reduction
4718 stmt which has the meta info attached. Our stmt_info is that of the
4719 loop-closed PHI of the inner loop which we remember as
4720 def for the reduction PHI generation. */
4721 bool double_reduc = false;
4722 stmt_vec_info rdef_info = stmt_info;
4723 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4724 {
4725 gcc_assert (!slp_node);
4726 double_reduc = true;
4727 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4728 (stmt_info->stmt, 0));
4729 stmt_info = vect_stmt_to_vectorize (stmt_info);
4730 }
4731 gphi *reduc_def_stmt
4732 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4733 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4734 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4735 tree vectype;
4736 machine_mode mode;
4737 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4738 basic_block exit_bb;
4739 tree scalar_dest;
4740 tree scalar_type;
4741 gimple *new_phi = NULL, *phi;
4742 gimple_stmt_iterator exit_gsi;
4743 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4744 gimple *epilog_stmt = NULL;
4745 gimple *exit_phi;
4746 tree bitsize;
4747 tree def;
4748 tree orig_name, scalar_result;
4749 imm_use_iterator imm_iter, phi_imm_iter;
4750 use_operand_p use_p, phi_use_p;
4751 gimple *use_stmt;
4752 bool nested_in_vect_loop = false;
4753 auto_vec<gimple *> new_phis;
4754 int j, i;
4755 auto_vec<tree> scalar_results;
4756 unsigned int group_size = 1, k;
4757 auto_vec<gimple *> phis;
4758 bool slp_reduc = false;
4759 bool direct_slp_reduc;
4760 tree new_phi_result;
4761 tree induction_index = NULL_TREE;
4762
4763 if (slp_node)
4764 group_size = SLP_TREE_LANES (slp_node);
4765
4766 if (nested_in_vect_loop_p (loop, stmt_info))
4767 {
4768 outer_loop = loop;
4769 loop = loop->inner;
4770 nested_in_vect_loop = true;
4771 gcc_assert (!slp_node);
4772 }
4773 gcc_assert (!nested_in_vect_loop || double_reduc);
4774
4775 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4776 gcc_assert (vectype);
4777 mode = TYPE_MODE (vectype);
4778
4779 tree initial_def = NULL;
4780 tree induc_val = NULL_TREE;
4781 tree adjustment_def = NULL;
4782 if (slp_node)
4783 ;
4784 else
4785 {
4786 /* Get at the scalar def before the loop, that defines the initial value
4787 of the reduction variable. */
4788 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4789 loop_preheader_edge (loop));
4790 /* Optimize: for induction condition reduction, if we can't use zero
4791 for induc_val, use initial_def. */
4792 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4793 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4794 else if (double_reduc)
4795 ;
4796 else if (nested_in_vect_loop)
4797 ;
4798 else
4799 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4800 }
4801
4802 unsigned vec_num;
4803 int ncopies;
4804 if (slp_node)
4805 {
4806 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4807 ncopies = 1;
4808 }
4809 else
4810 {
4811 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4812 vec_num = 1;
4813 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4814 }
4815
4816 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4817 which is updated with the current index of the loop for every match of
4818 the original loop's cond_expr (VEC_STMT). This results in a vector
4819 containing the last time the condition passed for that vector lane.
4820 The first match will be a 1 to allow 0 to be used for non-matching
4821 indexes. If there are no matches at all then the vector will be all
4822 zeroes.
4823
4824 PR92772: This algorithm is broken for architectures that support
4825 masked vectors, but do not provide fold_extract_last. */
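/* A made-up example with VF = 4: if the scalar condition held only in
   iterations 2 and 5 (counting from 0), the induction values seen are
   {1,2,3,4} in the first vector iteration and {5,6,7,8} in the second,
   so the index vector ends up as {0,6,3,0}; the epilog later reduces it
   with a maximum (picking 6) to identify the lane holding the final
   value.  */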
4826 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4827 {
4828 auto_vec<std::pair<tree, bool>, 2> ccompares;
4829 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4830 cond_info = vect_stmt_to_vectorize (cond_info);
4831 while (cond_info != reduc_info)
4832 {
4833 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4834 {
4835 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4836 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4837 ccompares.safe_push
4838 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4839 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4840 }
4841 cond_info
4842 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4843 1 + STMT_VINFO_REDUC_IDX
4844 (cond_info)));
4845 cond_info = vect_stmt_to_vectorize (cond_info);
4846 }
4847 gcc_assert (ccompares.length () != 0);
4848
4849 tree indx_before_incr, indx_after_incr;
4850 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4851 int scalar_precision
4852 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4853 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4854 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4855 (TYPE_MODE (vectype), cr_index_scalar_type,
4856 TYPE_VECTOR_SUBPARTS (vectype));
4857
4858 /* First we create a simple vector induction variable which starts
4859 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4860 vector size (STEP). */
4861
4862 /* Create a {1,2,3,...} vector. */
4863 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4864
4865 /* Create a vector of the step value. */
4866 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4867 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4868
4869 /* Create an induction variable. */
4870 gimple_stmt_iterator incr_gsi;
4871 bool insert_after;
4872 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4873 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4874 insert_after, &indx_before_incr, &indx_after_incr);
4875
4876 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4877 filled with zeros (VEC_ZERO). */
4878
4879 /* Create a vector of 0s. */
4880 tree zero = build_zero_cst (cr_index_scalar_type);
4881 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4882
4883 /* Create a vector phi node. */
4884 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4885 new_phi = create_phi_node (new_phi_tree, loop->header);
4886 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4887 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4888
4889 /* Now take the condition from the loop's original cond_exprs
4890 and produce a new cond_expr (INDEX_COND_EXPR) which for
4891 every match uses values from the induction variable
4892 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
4893 (NEW_PHI_TREE).
4894 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4895 the new cond_expr (INDEX_COND_EXPR). */
4896 gimple_seq stmts = NULL;
4897 for (int i = ccompares.length () - 1; i != -1; --i)
4898 {
4899 tree ccompare = ccompares[i].first;
4900 if (ccompares[i].second)
4901 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4902 cr_index_vector_type,
4903 ccompare,
4904 indx_before_incr, new_phi_tree);
4905 else
4906 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4907 cr_index_vector_type,
4908 ccompare,
4909 new_phi_tree, indx_before_incr);
4910 }
4911 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4912
4913 /* Update the phi with the vec cond. */
4914 induction_index = new_phi_tree;
4915 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4916 loop_latch_edge (loop), UNKNOWN_LOCATION);
4917 }
4918
4919 /* 2. Create epilog code.
4920 The reduction epilog code operates across the elements of the vector
4921 of partial results computed by the vectorized loop.
4922 The reduction epilog code consists of:
4923
4924 step 1: compute the scalar result in a vector (v_out2)
4925 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4926 step 3: adjust the scalar result (s_out3) if needed.
4927
4928 Step 1 can be accomplished using one of the following three schemes:
4929 (scheme 1) using reduc_fn, if available.
4930 (scheme 2) using whole-vector shifts, if available.
4931 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4932 combined.
4933
4934 The overall epilog code looks like this:
4935
4936 s_out0 = phi <s_loop> # original EXIT_PHI
4937 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4938 v_out2 = reduce <v_out1> # step 1
4939 s_out3 = extract_field <v_out2, 0> # step 2
4940 s_out4 = adjust_result <s_out3> # step 3
4941
4942 (step 3 is optional, and steps 1 and 2 may be combined).
4943 Lastly, the uses of s_out0 are replaced by s_out4. */
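  /* For instance (an illustrative sketch), a plain PLUS reduction on a
     target providing .REDUC_PLUS ends up roughly as:

     s_out0 = phi <s_loop>            # original EXIT_PHI
     v_out1 = phi <VECT_DEF>          # NEW_EXIT_PHI
     s_out3 = .REDUC_PLUS (v_out1)    # steps 1 and 2 combined
     s_out4 = s_out3 + adjustment_def # step 3, only when needed  */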
4944
4945
4946 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4947 v_out1 = phi <VECT_DEF>
4948 Store them in NEW_PHIS. */
4949 if (double_reduc)
4950 loop = outer_loop;
4951 exit_bb = single_exit (loop)->dest;
4952 new_phis.create (slp_node ? vec_num : ncopies);
4953 for (unsigned i = 0; i < vec_num; i++)
4954 {
4955 if (slp_node)
4956 def = vect_get_slp_vect_def (slp_node, i);
4957 else
4958 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4959 for (j = 0; j < ncopies; j++)
4960 {
4961 tree new_def = copy_ssa_name (def);
4962 phi = create_phi_node (new_def, exit_bb);
4963 if (j == 0)
4964 new_phis.quick_push (phi);
4965 else
4966 {
4967 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4968 new_phis.quick_push (phi);
4969 }
4970
4971 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4972 }
4973 }
4974
4975 exit_gsi = gsi_after_labels (exit_bb);
4976
4977 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4978 (i.e. when reduc_fn is not available) and in the final adjustment
4979 code (if needed). Also get the original scalar reduction variable as
4980 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4981 represents a reduction pattern), the tree-code and scalar-def are
4982 taken from the original stmt that the pattern-stmt (STMT) replaces.
4983 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4984 are taken from STMT. */
4985
4986 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4987 if (orig_stmt_info != stmt_info)
4988 {
4989 /* Reduction pattern */
4990 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4991 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4992 }
4993
4994 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4995 scalar_type = TREE_TYPE (scalar_dest);
4996 scalar_results.create (group_size);
4997 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4998 bitsize = TYPE_SIZE (scalar_type);
4999
5000 /* SLP reduction without reduction chain, e.g.,
5001 # a1 = phi <a2, a0>
5002 # b1 = phi <b2, b0>
5003 a2 = operation (a1)
5004 b2 = operation (b1) */
5005 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5006
5007 /* True if we should implement SLP_REDUC using native reduction operations
5008 instead of scalar operations. */
5009 direct_slp_reduc = (reduc_fn != IFN_LAST
5010 && slp_reduc
5011 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5012
5013 /* In case of reduction chain, e.g.,
5014 # a1 = phi <a3, a0>
5015 a2 = operation (a1)
5016 a3 = operation (a2),
5017
5018 we may end up with more than one vector result. Here we reduce them to
5019 one vector. */
5020 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5021 {
5022 gimple_seq stmts = NULL;
5023 tree first_vect = PHI_RESULT (new_phis[0]);
5024 first_vect = gimple_convert (&stmts, vectype, first_vect);
5025 for (k = 1; k < new_phis.length (); k++)
5026 {
5027 gimple *next_phi = new_phis[k];
5028 tree second_vect = PHI_RESULT (next_phi);
5029 second_vect = gimple_convert (&stmts, vectype, second_vect);
5030 first_vect = gimple_build (&stmts, code, vectype,
5031 first_vect, second_vect);
5032 }
5033 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5034
5035 new_phi_result = first_vect;
5036 new_phis.truncate (0);
5037 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5038 }
5039 /* Likewise if we couldn't use a single defuse cycle. */
5040 else if (ncopies > 1)
5041 {
5042 gimple_seq stmts = NULL;
5043 tree first_vect = PHI_RESULT (new_phis[0]);
5044 first_vect = gimple_convert (&stmts, vectype, first_vect);
5045 for (int k = 1; k < ncopies; ++k)
5046 {
5047 tree second_vect = PHI_RESULT (new_phis[k]);
5048 second_vect = gimple_convert (&stmts, vectype, second_vect);
5049 first_vect = gimple_build (&stmts, code, vectype,
5050 first_vect, second_vect);
5051 }
5052 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5053 new_phi_result = first_vect;
5054 new_phis.truncate (0);
5055 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5056 }
5057 else
5058 new_phi_result = PHI_RESULT (new_phis[0]);
5059
5060 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5061 && reduc_fn != IFN_LAST)
5062 {
5063 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5064 various data values where the condition matched and another vector
5065 (INDUCTION_INDEX) containing all the indexes of those matches. We
5066 need to extract the last matching index (which will be the index with
5067 highest value) and use this to index into the data vector.
5068 For the case where there were no matches, the data vector will contain
5069 all default values and the index vector will be all zeros. */
5070
5071 /* Get various versions of the type of the vector of indexes. */
5072 tree index_vec_type = TREE_TYPE (induction_index);
5073 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5074 tree index_scalar_type = TREE_TYPE (index_vec_type);
5075 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5076
5077 /* Get an unsigned integer version of the type of the data vector. */
5078 int scalar_precision
5079 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5080 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5081 tree vectype_unsigned = build_vector_type
5082 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5083
5084 /* First we need to create a vector (ZERO_VEC) of zeros and another
5085 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5086 can create using a MAX reduction and then expanding.
5087 In the case where the loop never made any matches, the max index will
5088 be zero. */
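      /* Continuing the earlier sketch with INDUCTION_INDEX {0,2,7,0}
	 (illustrative only): IFN_REDUC_MAX yields max_index = 7,
	 MAX_INDEX_VEC becomes {7,7,7,7}, the EQ_EXPR compare selects only
	 lane 2 of NEW_PHI_RESULT, and the final unsigned MAX reduction
	 extracts that lane's data value.  */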
5089
5090 /* Vector of {0, 0, 0,...}. */
5091 tree zero_vec = build_zero_cst (vectype);
5092
5093 gimple_seq stmts = NULL;
5094 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5095 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5096
5097 /* Find maximum value from the vector of found indexes. */
5098 tree max_index = make_ssa_name (index_scalar_type);
5099 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5100 1, induction_index);
5101 gimple_call_set_lhs (max_index_stmt, max_index);
5102 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5103
5104 /* Vector of {max_index, max_index, max_index,...}. */
5105 tree max_index_vec = make_ssa_name (index_vec_type);
5106 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5107 max_index);
5108 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5109 max_index_vec_rhs);
5110 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5111
5112 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5113 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5114 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5115 otherwise. Only one value should match, resulting in a vector
5116 (VEC_COND) with one data value and the rest zeros.
5117 In the case where the loop never made any matches, every index will
5118 match, resulting in a vector with all data values (which will all be
5119 the default value). */
5120
5121 /* Compare the max index vector to the vector of found indexes to find
5122 the position of the max value. */
5123 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5124 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5125 induction_index,
5126 max_index_vec);
5127 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5128
5129 /* Use the compare to choose either values from the data vector or
5130 zero. */
5131 tree vec_cond = make_ssa_name (vectype);
5132 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5133 vec_compare, new_phi_result,
5134 zero_vec);
5135 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5136
5137 /* Finally we need to extract the data value from the vector (VEC_COND)
5138 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5139 reduction, but because this doesn't exist, we can use a MAX reduction
5140 instead. The data value might be signed or a float so we need to cast
5141 it first.
5142 In the case where the loop never made any matches, the data values are
5143 all identical, and so will reduce down correctly. */
5144
5145 /* Make the matched data values unsigned. */
5146 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5147 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5148 vec_cond);
5149 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5150 VIEW_CONVERT_EXPR,
5151 vec_cond_cast_rhs);
5152 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5153
5154 /* Reduce down to a scalar value. */
5155 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5156 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5157 1, vec_cond_cast);
5158 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5159 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5160
5161 /* Convert the reduced value back to the result type and set as the
5162 result. */
5163 stmts = NULL;
5164 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5165 data_reduc);
5166 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5167 scalar_results.safe_push (new_temp);
5168 }
5169 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5170 && reduc_fn == IFN_LAST)
5171 {
5172 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5173 idx = 0;
5174 idx_val = induction_index[0];
5175 val = data_reduc[0];
5176 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5177 if (induction_index[i] > idx_val)
5178 val = data_reduc[i], idx_val = induction_index[i];
5179 return val; */
5180
5181 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5182 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5183 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5184 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5185 /* Enforced by vectorizable_reduction, which ensures we have target
5186 support before allowing a conditional reduction on variable-length
5187 vectors. */
5188 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5189 tree idx_val = NULL_TREE, val = NULL_TREE;
5190 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5191 {
5192 tree old_idx_val = idx_val;
5193 tree old_val = val;
5194 idx_val = make_ssa_name (idx_eltype);
5195 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5196 build3 (BIT_FIELD_REF, idx_eltype,
5197 induction_index,
5198 bitsize_int (el_size),
5199 bitsize_int (off)));
5200 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5201 val = make_ssa_name (data_eltype);
5202 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5203 build3 (BIT_FIELD_REF,
5204 data_eltype,
5205 new_phi_result,
5206 bitsize_int (el_size),
5207 bitsize_int (off)));
5208 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5209 if (off != 0)
5210 {
5211 tree new_idx_val = idx_val;
5212 if (off != v_size - el_size)
5213 {
5214 new_idx_val = make_ssa_name (idx_eltype);
5215 epilog_stmt = gimple_build_assign (new_idx_val,
5216 MAX_EXPR, idx_val,
5217 old_idx_val);
5218 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5219 }
5220 tree new_val = make_ssa_name (data_eltype);
5221 epilog_stmt = gimple_build_assign (new_val,
5222 COND_EXPR,
5223 build2 (GT_EXPR,
5224 boolean_type_node,
5225 idx_val,
5226 old_idx_val),
5227 val, old_val);
5228 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5229 idx_val = new_idx_val;
5230 val = new_val;
5231 }
5232 }
5233 /* Convert the reduced value back to the result type and set as the
5234 result. */
5235 gimple_seq stmts = NULL;
5236 val = gimple_convert (&stmts, scalar_type, val);
5237 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5238 scalar_results.safe_push (val);
5239 }
5240
5241 /* 2.3 Create the reduction code, using one of the three schemes described
5242 above. In SLP we simply need to extract all the elements from the
5243 vector (without reducing them), so we use scalar shifts. */
5244 else if (reduc_fn != IFN_LAST && !slp_reduc)
5245 {
5246 tree tmp;
5247 tree vec_elem_type;
5248
5249 /* Case 1: Create:
5250 v_out2 = reduc_expr <v_out1> */
5251
5252 if (dump_enabled_p ())
5253 dump_printf_loc (MSG_NOTE, vect_location,
5254 "Reduce using direct vector reduction.\n");
5255
5256 gimple_seq stmts = NULL;
5257 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5258 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5259 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5260 vec_elem_type, new_phi_result);
5261 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5262 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5263
5264 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5265 && induc_val)
5266 {
5267 /* Earlier we set the initial value to be a vector of induc_val
5268 values. Check the result and if it is induc_val then replace it
5269 with the original initial value, unless induc_val is
5270 the same as initial_def already. */
5271 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5272 induc_val);
5273
5274 tmp = make_ssa_name (new_scalar_dest);
5275 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5276 initial_def, new_temp);
5277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278 new_temp = tmp;
5279 }
5280
5281 scalar_results.safe_push (new_temp);
5282 }
5283 else if (direct_slp_reduc)
5284 {
5285 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5286 with the elements for other SLP statements replaced with the
5287 neutral value. We can then do a normal reduction on each vector. */
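      /* For instance (sketch), for two interleaved PLUS reductions with
	 V4SI partial results {a0, b0, a1, b1} this builds
	   {a0, 0, a1, 0} and {0, b0, 0, b1}
	 using the neutral value 0 and reduces each with .REDUC_PLUS,
	 giving a0 + a1 and b0 + b1.  */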
5288
5289 /* Enforced by vectorizable_reduction. */
5290 gcc_assert (new_phis.length () == 1);
5291 gcc_assert (pow2p_hwi (group_size));
5292
5293 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5294 vec<stmt_vec_info> orig_phis
5295 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5296 gimple_seq seq = NULL;
5297
5298 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5299 and the same element size as VECTYPE. */
5300 tree index = build_index_vector (vectype, 0, 1);
5301 tree index_type = TREE_TYPE (index);
5302 tree index_elt_type = TREE_TYPE (index_type);
5303 tree mask_type = truth_type_for (index_type);
5304
5305 /* Create a vector that, for each element, identifies which of
5306 the REDUC_GROUP_SIZE results should use it. */
5307 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5308 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5309 build_vector_from_val (index_type, index_mask));
5310
5311 /* Get a neutral vector value. This is simply a splat of the neutral
5312 scalar value if we have one, otherwise the initial scalar value
5313 is itself a neutral value. */
5314 tree vector_identity = NULL_TREE;
5315 tree neutral_op = NULL_TREE;
5316 if (slp_node)
5317 {
5318 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5319 neutral_op
5320 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5321 vectype, code, first != NULL);
5322 }
5323 if (neutral_op)
5324 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5325 neutral_op);
5326 for (unsigned int i = 0; i < group_size; ++i)
5327 {
5328 /* If there's no universal neutral value, we can use the
5329 initial scalar value from the original PHI. This is used
5330 for MIN and MAX reduction, for example. */
5331 if (!neutral_op)
5332 {
5333 tree scalar_value
5334 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5335 loop_preheader_edge (loop));
5336 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5337 scalar_value);
5338 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5339 scalar_value);
5340 }
5341
5342 /* Calculate the equivalent of:
5343
5344 sel[j] = (index[j] == i);
5345
5346 which selects the elements of NEW_PHI_RESULT that should
5347 be included in the result. */
5348 tree compare_val = build_int_cst (index_elt_type, i);
5349 compare_val = build_vector_from_val (index_type, compare_val);
5350 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5351 index, compare_val);
5352
5353 /* Calculate the equivalent of:
5354
5355 vec = sel ? new_phi_result : vector_identity;
5356
5357 VEC is now suitable for a full vector reduction. */
5358 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5359 sel, new_phi_result, vector_identity);
5360
5361 /* Do the reduction and convert it to the appropriate type. */
5362 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5363 TREE_TYPE (vectype), vec);
5364 scalar = gimple_convert (&seq, scalar_type, scalar);
5365 scalar_results.safe_push (scalar);
5366 }
5367 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5368 }
5369 else
5370 {
5371 bool reduce_with_shift;
5372 tree vec_temp;
5373
5374 gcc_assert (slp_reduc || new_phis.length () == 1);
5375
5376 /* See if the target wants to do the final (shift) reduction
5377 in a vector mode of smaller size and first reduce upper/lower
5378 halves against each other. */
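      /* E.g., a target may prefer to reduce a V8SI accumulator by
	 extracting its two V4SI halves, adding them, and finishing the
	 reduction in V4SI (illustrative; the choice is made by
	 targetm.vectorize.split_reduction below).  */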
5379 enum machine_mode mode1 = mode;
5380 tree stype = TREE_TYPE (vectype);
5381 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5382 unsigned nunits1 = nunits;
5383 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5384 && new_phis.length () == 1)
5385 {
5386 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5387 /* For SLP reductions we have to make sure lanes match up, but
5388 since we're doing individual-element final reduction, reducing
5389 the vector width here is even more important.
5390 ??? We can also separate lanes with permutes, for the common
5391 case of power-of-two group-size, odd/even extracts would work. */
5392 if (slp_reduc && nunits != nunits1)
5393 {
5394 nunits1 = least_common_multiple (nunits1, group_size);
5395 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5396 }
5397 }
5398 if (!slp_reduc
5399 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5400 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5401
5402 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5403 stype, nunits1);
5404 reduce_with_shift = have_whole_vector_shift (mode1);
5405 if (!VECTOR_MODE_P (mode1))
5406 reduce_with_shift = false;
5407 else
5408 {
5409 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5410 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5411 reduce_with_shift = false;
5412 }
5413
5414 /* First reduce the vector to the desired vector size we should
5415 do shift reduction on by combining upper and lower halves. */
5416 new_temp = new_phi_result;
5417 while (nunits > nunits1)
5418 {
5419 nunits /= 2;
5420 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5421 stype, nunits);
5422 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5423
5424 /* The target has to make sure we support lowpart/highpart
5425 extraction, either via direct vector extract or through
5426 punning to an integer mode. */
5427 tree dst1, dst2;
5428 if (convert_optab_handler (vec_extract_optab,
5429 TYPE_MODE (TREE_TYPE (new_temp)),
5430 TYPE_MODE (vectype1))
5431 != CODE_FOR_nothing)
5432 {
5433 /* Extract sub-vectors directly once vec_extract becomes
5434 a conversion optab. */
5435 dst1 = make_ssa_name (vectype1);
5436 epilog_stmt
5437 = gimple_build_assign (dst1, BIT_FIELD_REF,
5438 build3 (BIT_FIELD_REF, vectype1,
5439 new_temp, TYPE_SIZE (vectype1),
5440 bitsize_int (0)));
5441 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5442 dst2 = make_ssa_name (vectype1);
5443 epilog_stmt
5444 = gimple_build_assign (dst2, BIT_FIELD_REF,
5445 build3 (BIT_FIELD_REF, vectype1,
5446 new_temp, TYPE_SIZE (vectype1),
5447 bitsize_int (bitsize)));
5448 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5449 }
5450 else
5451 {
5452 /* Extract via punning to appropriately sized integer mode
5453 vector. */
5454 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5455 tree etype = build_vector_type (eltype, 2);
5456 gcc_assert (convert_optab_handler (vec_extract_optab,
5457 TYPE_MODE (etype),
5458 TYPE_MODE (eltype))
5459 != CODE_FOR_nothing);
5460 tree tem = make_ssa_name (etype);
5461 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5462 build1 (VIEW_CONVERT_EXPR,
5463 etype, new_temp));
5464 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5465 new_temp = tem;
5466 tem = make_ssa_name (eltype);
5467 epilog_stmt
5468 = gimple_build_assign (tem, BIT_FIELD_REF,
5469 build3 (BIT_FIELD_REF, eltype,
5470 new_temp, TYPE_SIZE (eltype),
5471 bitsize_int (0)));
5472 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5473 dst1 = make_ssa_name (vectype1);
5474 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5475 build1 (VIEW_CONVERT_EXPR,
5476 vectype1, tem));
5477 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5478 tem = make_ssa_name (eltype);
5479 epilog_stmt
5480 = gimple_build_assign (tem, BIT_FIELD_REF,
5481 build3 (BIT_FIELD_REF, eltype,
5482 new_temp, TYPE_SIZE (eltype),
5483 bitsize_int (bitsize)));
5484 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5485 dst2 = make_ssa_name (vectype1);
5486 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5487 build1 (VIEW_CONVERT_EXPR,
5488 vectype1, tem));
5489 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5490 }
5491
5492 new_temp = make_ssa_name (vectype1);
5493 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5494 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5495 new_phis[0] = epilog_stmt;
5496 }
5497
5498 if (reduce_with_shift && !slp_reduc)
5499 {
5500 int element_bitsize = tree_to_uhwi (bitsize);
5501 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5502 for variable-length vectors and also requires direct target support
5503 for loop reductions. */
5504 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5505 int nelements = vec_size_in_bits / element_bitsize;
5506 vec_perm_builder sel;
5507 vec_perm_indices indices;
5508
5509 int elt_offset;
5510
5511 tree zero_vec = build_zero_cst (vectype1);
5512 /* Case 2: Create:
5513 for (offset = nelements/2; offset >= 1; offset/=2)
5514 {
5515 Create: va' = vec_shift <va, offset>
5516 Create: va = vop <va, va'>
5517 } */
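	  /* E.g., for a V4SI PLUS reduction of {a,b,c,d} this computes
	     (illustrative sketch)
	       {c,d,0,0} + {a,b,c,d} = {a+c, b+d, c, d}
	       {b+d,c,d,0} + {a+c,b+d,c,d} = {a+b+c+d, ...}
	     so lane 0 then holds the complete sum for the extraction
	     below.  */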
5518
5519 tree rhs;
5520
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_NOTE, vect_location,
5523 "Reduce using vector shifts\n");
5524
5525 gimple_seq stmts = NULL;
5526 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5527 for (elt_offset = nelements / 2;
5528 elt_offset >= 1;
5529 elt_offset /= 2)
5530 {
5531 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5532 indices.new_vector (sel, 2, nelements);
5533 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5534 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5535 new_temp, zero_vec, mask);
5536 new_temp = gimple_build (&stmts, code,
5537 vectype1, new_name, new_temp);
5538 }
5539 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5540
5541 /* 2.4 Extract the final scalar result. Create:
5542 s_out3 = extract_field <v_out2, bitpos> */
5543
5544 if (dump_enabled_p ())
5545 dump_printf_loc (MSG_NOTE, vect_location,
5546 "extract scalar result\n");
5547
5548 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5549 bitsize, bitsize_zero_node);
5550 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5551 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5552 gimple_assign_set_lhs (epilog_stmt, new_temp);
5553 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5554 scalar_results.safe_push (new_temp);
5555 }
5556 else
5557 {
5558 /* Case 3: Create:
5559 s = extract_field <v_out2, 0>
5560 for (offset = element_size;
5561 offset < vector_size;
5562 offset += element_size;)
5563 {
5564 Create: s' = extract_field <v_out2, offset>
5565 Create: s = op <s, s'> // For non SLP cases
5566 } */
5567
5568 if (dump_enabled_p ())
5569 dump_printf_loc (MSG_NOTE, vect_location,
5570 "Reduce using scalar code.\n");
5571
5572 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5573 int element_bitsize = tree_to_uhwi (bitsize);
5574 tree compute_type = TREE_TYPE (vectype);
5575 gimple_seq stmts = NULL;
5576 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5577 {
5578 int bit_offset;
5579 if (gimple_code (new_phi) == GIMPLE_PHI)
5580 vec_temp = PHI_RESULT (new_phi);
5581 else
5582 vec_temp = gimple_assign_lhs (new_phi);
5583 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5584 vec_temp, bitsize, bitsize_zero_node);
5585
5586 /* In SLP we don't need to apply the reduction operation, so we just
5587 collect s' values in SCALAR_RESULTS. */
5588 if (slp_reduc)
5589 scalar_results.safe_push (new_temp);
5590
5591 for (bit_offset = element_bitsize;
5592 bit_offset < vec_size_in_bits;
5593 bit_offset += element_bitsize)
5594 {
5595 tree bitpos = bitsize_int (bit_offset);
5596 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5597 compute_type, vec_temp,
5598 bitsize, bitpos);
5599 if (slp_reduc)
5600 {
5601 /* In SLP we don't need to apply the reduction operation, so
5602 we just collect s' values in SCALAR_RESULTS. */
5603 new_temp = new_name;
5604 scalar_results.safe_push (new_name);
5605 }
5606 else
5607 new_temp = gimple_build (&stmts, code, compute_type,
5608 new_name, new_temp);
5609 }
5610 }
5611
5612 /* The only case where we need to reduce scalar results in SLP is
5613 unrolling. If the size of SCALAR_RESULTS is greater than
5614 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5615 REDUC_GROUP_SIZE. */
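	  /* E.g., with REDUC_GROUP_SIZE 2 and SCALAR_RESULTS
	     {a0, b0, a1, b1} from two unrolled copies, a1 is combined
	     into slot 0 and b1 into slot 1, leaving
	     {a0 op a1, b0 op b1} (illustrative sketch).  */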
5616 if (slp_reduc)
5617 {
5618 tree res, first_res, new_res;
5619
5620 /* Reduce multiple scalar results in case of SLP unrolling. */
5621 for (j = group_size; scalar_results.iterate (j, &res);
5622 j++)
5623 {
5624 first_res = scalar_results[j % group_size];
5625 new_res = gimple_build (&stmts, code, compute_type,
5626 first_res, res);
5627 scalar_results[j % group_size] = new_res;
5628 }
5629 for (k = 0; k < group_size; k++)
5630 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5631 scalar_results[k]);
5632 }
5633 else
5634 {
5635 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5636 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5637 scalar_results.safe_push (new_temp);
5638 }
5639
5640 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5641 }
5642
5643 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5644 && induc_val)
5645 {
5646 /* Earlier we set the initial value to be a vector of induc_val
5647 values. Check the result and if it is induc_val then replace it
5648 with the original initial value, unless induc_val is
5649 the same as initial_def already. */
5650 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5651 induc_val);
5652
5653 tree tmp = make_ssa_name (new_scalar_dest);
5654 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5655 initial_def, new_temp);
5656 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5657 scalar_results[0] = tmp;
5658 }
5659 }
5660
5661 /* 2.5 Adjust the final result by the initial value of the reduction
5662 variable. (When such adjustment is not needed, then
5663 'adjustment_def' is zero). For example, if code is PLUS we create:
5664 new_temp = loop_exit_def + adjustment_def */
5665
5666 if (adjustment_def)
5667 {
5668 gcc_assert (!slp_reduc);
5669 gimple_seq stmts = NULL;
5670 if (nested_in_vect_loop)
5671 {
5672 new_phi = new_phis[0];
5673 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5674 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5675 new_temp = gimple_build (&stmts, code, vectype,
5676 PHI_RESULT (new_phi), adjustment_def);
5677 }
5678 else
5679 {
5680 new_temp = scalar_results[0];
5681 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5682 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5683 new_temp = gimple_build (&stmts, code, scalar_type,
5684 new_temp, adjustment_def);
5685 }
5686
5687 epilog_stmt = gimple_seq_last_stmt (stmts);
5688 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5689 if (nested_in_vect_loop)
5690 {
5691 if (!double_reduc)
5692 scalar_results.quick_push (new_temp);
5693 else
5694 scalar_results[0] = new_temp;
5695 }
5696 else
5697 scalar_results[0] = new_temp;
5698
5699 new_phis[0] = epilog_stmt;
5700 }
5701
5702 if (double_reduc)
5703 loop = loop->inner;
5704
5705 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5706 phis with new adjusted scalar results, i.e., replace use <s_out0>
5707 with use <s_out4>.
5708
5709 Transform:
5710 loop_exit:
5711 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5712 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5713 v_out2 = reduce <v_out1>
5714 s_out3 = extract_field <v_out2, 0>
5715 s_out4 = adjust_result <s_out3>
5716 use <s_out0>
5717 use <s_out0>
5718
5719 into:
5720
5721 loop_exit:
5722 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5723 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5724 v_out2 = reduce <v_out1>
5725 s_out3 = extract_field <v_out2, 0>
5726 s_out4 = adjust_result <s_out3>
5727 use <s_out4>
5728 use <s_out4> */
5729
5730
5731 /* In SLP reduction chain we reduce vector results into one vector if
5732 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5733 LHS of the last stmt in the reduction chain, since we are looking for
5734 the loop exit phi node. */
5735 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5736 {
5737 stmt_vec_info dest_stmt_info
5738 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5739 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5740 group_size = 1;
5741 }
5742
5743 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5744 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5745 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5746 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5747 correspond to the first vector stmt, etc.
5748 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5749 if (group_size > new_phis.length ())
5750 gcc_assert (!(group_size % new_phis.length ()));
5751
5752 for (k = 0; k < group_size; k++)
5753 {
5754 if (slp_reduc)
5755 {
5756 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5757
5758 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5759 /* SLP statements can't participate in patterns. */
5760 gcc_assert (!orig_stmt_info);
5761 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5762 }
5763
5764 if (nested_in_vect_loop)
5765 {
5766 if (double_reduc)
5767 loop = outer_loop;
5768 else
5769 gcc_unreachable ();
5770 }
5771
5772 phis.create (3);
5773 /* Find the loop-closed-use at the loop exit of the original scalar
5774 result. (The reduction result is expected to have two immediate uses,
5775 one at the latch block, and one at the loop exit). For double
5776 reductions we are looking for exit phis of the outer loop. */
5777 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5778 {
5779 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5780 {
5781 if (!is_gimple_debug (USE_STMT (use_p)))
5782 phis.safe_push (USE_STMT (use_p));
5783 }
5784 else
5785 {
5786 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5787 {
5788 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5789
5790 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5791 {
5792 if (!flow_bb_inside_loop_p (loop,
5793 gimple_bb (USE_STMT (phi_use_p)))
5794 && !is_gimple_debug (USE_STMT (phi_use_p)))
5795 phis.safe_push (USE_STMT (phi_use_p));
5796 }
5797 }
5798 }
5799 }
5800
5801 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5802 {
5803 /* Replace the uses: */
5804 orig_name = PHI_RESULT (exit_phi);
5805 scalar_result = scalar_results[k];
5806 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5807 {
5808 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5809 SET_USE (use_p, scalar_result);
5810 update_stmt (use_stmt);
5811 }
5812 }
5813
5814 phis.release ();
5815 }
5816 }
5817
5818 /* Return a vector of type VECTYPE that is equal to the vector select
5819 operation "MASK ? VEC : IDENTITY". Insert the select statements
5820 before GSI. */
5821
5822 static tree
5823 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5824 tree vec, tree identity)
5825 {
5826 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5827 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5828 mask, vec, identity);
5829 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5830 return cond;
5831 }
5832
5833 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5834 order, starting with LHS. Insert the extraction statements before GSI and
5835 associate the new scalar SSA names with variable SCALAR_DEST.
5836 Return the SSA name for the result. */
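/* For example (an illustrative sketch), for CODE PLUS_EXPR, LHS res_1 and
   a V4SF VECTOR_RHS {v0, v1, v2, v3} this emits scalar code equivalent to
     (((res_1 + v0) + v1) + v2) + v3
   preserving the left-to-right association required for in-order
   reductions.  */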
5837
5838 static tree
5839 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5840 tree_code code, tree lhs, tree vector_rhs)
5841 {
5842 tree vectype = TREE_TYPE (vector_rhs);
5843 tree scalar_type = TREE_TYPE (vectype);
5844 tree bitsize = TYPE_SIZE (scalar_type);
5845 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5846 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5847
5848 for (unsigned HOST_WIDE_INT bit_offset = 0;
5849 bit_offset < vec_size_in_bits;
5850 bit_offset += element_bitsize)
5851 {
5852 tree bitpos = bitsize_int (bit_offset);
5853 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5854 bitsize, bitpos);
5855
5856 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5857 rhs = make_ssa_name (scalar_dest, stmt);
5858 gimple_assign_set_lhs (stmt, rhs);
5859 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5860
5861 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5862 tree new_name = make_ssa_name (scalar_dest, stmt);
5863 gimple_assign_set_lhs (stmt, new_name);
5864 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5865 lhs = new_name;
5866 }
5867 return lhs;
5868 }
5869
5870 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5871 type of the vector input. */
5872
5873 static internal_fn
5874 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5875 {
5876 internal_fn mask_reduc_fn;
5877
5878 switch (reduc_fn)
5879 {
5880 case IFN_FOLD_LEFT_PLUS:
5881 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5882 break;
5883
5884 default:
5885 return IFN_LAST;
5886 }
5887
5888 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5889 OPTIMIZE_FOR_SPEED))
5890 return mask_reduc_fn;
5891 return IFN_LAST;
5892 }
5893
5894 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5895 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5896 statement. CODE is the operation performed by STMT_INFO and OPS are
5897 its scalar operands. REDUC_INDEX is the index of the operand in
5898 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5899 implements in-order reduction, or IFN_LAST if we should open-code it.
5900 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5901 that should be used to control the operation in a fully-masked loop. */
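/* For instance (illustrative only), a strictly in-order float accumulation
     for (i = 0; i < n; ++i) res += a[i];
   compiled without reassociation is handled here by emitting one
     res = .FOLD_LEFT_PLUS (res, vect_a);
   (or its open-coded equivalent) per vector statement, preserving the
   original evaluation order.  */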
5902
5903 static bool
5904 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5905 stmt_vec_info stmt_info,
5906 gimple_stmt_iterator *gsi,
5907 gimple **vec_stmt, slp_tree slp_node,
5908 gimple *reduc_def_stmt,
5909 tree_code code, internal_fn reduc_fn,
5910 tree ops[3], tree vectype_in,
5911 int reduc_index, vec_loop_masks *masks)
5912 {
5913 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5914 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5915 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5916
5917 int ncopies;
5918 if (slp_node)
5919 ncopies = 1;
5920 else
5921 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5922
5923 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5924 gcc_assert (ncopies == 1);
5925 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5926
5927 if (slp_node)
5928 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5929 TYPE_VECTOR_SUBPARTS (vectype_in)));
5930
5931 tree op0 = ops[1 - reduc_index];
5932
5933 int group_size = 1;
5934 stmt_vec_info scalar_dest_def_info;
5935 auto_vec<tree> vec_oprnds0;
5936 if (slp_node)
5937 {
5938 auto_vec<vec<tree> > vec_defs (2);
5939 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5940 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5941 vec_defs[0].release ();
5942 vec_defs[1].release ();
5943 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5944 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5945 }
5946 else
5947 {
5948 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5949 op0, &vec_oprnds0);
5950 scalar_dest_def_info = stmt_info;
5951 }
5952
5953 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5954 tree scalar_type = TREE_TYPE (scalar_dest);
5955 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5956
5957 int vec_num = vec_oprnds0.length ();
5958 gcc_assert (vec_num == 1 || slp_node);
5959 tree vec_elem_type = TREE_TYPE (vectype_out);
5960 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5961
5962 tree vector_identity = NULL_TREE;
5963 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5964 vector_identity = build_zero_cst (vectype_out);
5965
5966 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5967 int i;
5968 tree def0;
5969 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5970 {
5971 gimple *new_stmt;
5972 tree mask = NULL_TREE;
5973 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5974 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5975
5976 /* Handle MINUS by adding the negative. */
5977 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5978 {
5979 tree negated = make_ssa_name (vectype_out);
5980 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5981 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5982 def0 = negated;
5983 }
5984
5985 if (mask && mask_reduc_fn == IFN_LAST)
5986 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5987 vector_identity);
5988
5989 /* On the first iteration the input is simply the scalar phi
5990 result, and for subsequent iterations it is the output of
5991 the preceding operation. */
5992 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5993 {
5994 if (mask && mask_reduc_fn != IFN_LAST)
5995 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5996 def0, mask);
5997 else
5998 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5999 def0);
6000 /* For chained SLP reductions the output of the previous reduction
6001 operation serves as the input of the next. For the final statement
6002 the output cannot be a temporary - we reuse the original
6003 scalar destination of the last statement. */
6004 if (i != vec_num - 1)
6005 {
6006 gimple_set_lhs (new_stmt, scalar_dest_var);
6007 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6008 gimple_set_lhs (new_stmt, reduc_var);
6009 }
6010 }
6011 else
6012 {
6013 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6014 reduc_var, def0);
6015 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6016 /* Remove the statement, so that we can use the same code paths
6017 as for statements that we've just created. */
6018 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6019 gsi_remove (&tmp_gsi, true);
6020 }
6021
6022 if (i == vec_num - 1)
6023 {
6024 gimple_set_lhs (new_stmt, scalar_dest);
6025 vect_finish_replace_stmt (loop_vinfo,
6026 scalar_dest_def_info,
6027 new_stmt);
6028 }
6029 else
6030 vect_finish_stmt_generation (loop_vinfo,
6031 scalar_dest_def_info,
6032 new_stmt, gsi);
6033
6034 if (slp_node)
6035 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6036 else
6037 {
6038 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6039 *vec_stmt = new_stmt;
6040 }
6041 }
6042
6043 return true;
6044 }
6045
6046 /* Function is_nonwrapping_integer_induction.
6047
6048 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6049 does not cause overflow. */
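/* For example (illustrative): for an unsigned char IV with base 250 and
   step 2, ten iterations would reach 270, which needs 9 bits, so the
   function returns false; when overflow of the IV type is undefined it
   returns true without bounding the iteration count.  */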
6050
6051 static bool
6052 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6053 {
6054 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6055 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6056 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6057 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6058 widest_int ni, max_loop_value, lhs_max;
6059 wi::overflow_type overflow = wi::OVF_NONE;
6060
6061 /* Make sure the loop is integer based. */
6062 if (TREE_CODE (base) != INTEGER_CST
6063 || TREE_CODE (step) != INTEGER_CST)
6064 return false;
6065
6066 /* Check that the max size of the loop will not wrap. */
6067
6068 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6069 return true;
6070
6071 if (! max_stmt_executions (loop, &ni))
6072 return false;
6073
6074 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6075 &overflow);
6076 if (overflow)
6077 return false;
6078
6079 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6080 TYPE_SIGN (lhs_type), &overflow);
6081 if (overflow)
6082 return false;
6083
6084 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6085 <= TYPE_PRECISION (lhs_type));
6086 }
6087
6088 /* Check if masking can be supported by inserting a conditional expression.
6089 CODE is the code for the operation. COND_FN is the conditional internal
6090 function, if it exists. VECTYPE_IN is the type of the vector input. */
6091 static bool
6092 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6093 tree vectype_in)
6094 {
6095 if (cond_fn != IFN_LAST
6096 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6097 OPTIMIZE_FOR_SPEED))
6098 return false;
6099
6100 switch (code)
6101 {
6102 case DOT_PROD_EXPR:
6103 case SAD_EXPR:
6104 return true;
6105
6106 default:
6107 return false;
6108 }
6109 }
6110
6111 /* Insert a conditional expression to enable masked vectorization. CODE is the
6112 code for the operation. VOP is the array of operands. MASK is the loop
6113 mask. GSI is a statement iterator used to place the new conditional
6114 expression. */
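/* E.g. (a sketch of the intent): for DOT_PROD_EXPR the masked-off lanes of
   operand 1 are replaced by zero so they contribute nothing to the dot
   product, while for SAD_EXPR they are replaced by operand 0 so the
   absolute difference for those lanes is zero.  */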
6115 static void
6116 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6117 gimple_stmt_iterator *gsi)
6118 {
6119 switch (code)
6120 {
6121 case DOT_PROD_EXPR:
6122 {
6123 tree vectype = TREE_TYPE (vop[1]);
6124 tree zero = build_zero_cst (vectype);
6125 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6126 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6127 mask, vop[1], zero);
6128 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6129 vop[1] = masked_op1;
6130 break;
6131 }
6132
6133 case SAD_EXPR:
6134 {
6135 tree vectype = TREE_TYPE (vop[1]);
6136 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6137 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6138 mask, vop[1], vop[0]);
6139 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6140 vop[1] = masked_op1;
6141 break;
6142 }
6143
6144 default:
6145 gcc_unreachable ();
6146 }
6147 }
6148
6149 /* Function vectorizable_reduction.
6150
6151 Check if STMT_INFO performs a reduction operation that can be vectorized.
6152 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6153 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6154 Return true if STMT_INFO is vectorizable in this way.
6155
6156 This function also handles reduction idioms (patterns) that have been
6157 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6158 may be of this form:
6159 X = pattern_expr (arg0, arg1, ..., X)
6160 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6161 sequence that had been detected and replaced by the pattern-stmt
6162 (STMT_INFO).
6163
6164 This function also handles reduction of condition expressions, for example:
6165 for (int i = 0; i < N; i++)
6166 if (a[i] < value)
6167 last = a[i];
6168 This is handled by vectorising the loop and creating an additional vector
6169 containing the loop indexes for which "a[i] < value" was true. In the
6170 function epilogue this is reduced to a single max value and then used to
6171 index into the vector of results.
6172
6173 In some cases of reduction patterns, the type of the reduction variable X is
6174 different than the type of the other arguments of STMT_INFO.
6175 In such cases, the vectype that is used when transforming STMT_INFO into
6176 a vector stmt is different than the vectype that is used to determine the
6177 vectorization factor, because it consists of a different number of elements
6178 than the actual number of elements that are being operated upon in parallel.
6179
6180 For example, consider an accumulation of shorts into an int accumulator.
6181 On some targets it's possible to vectorize this pattern operating on 8
6182 shorts at a time (hence, the vectype for purposes of determining the
6183 vectorization factor should be V8HI); on the other hand, the vectype that
6184 is used to create the vector form is actually V4SI (the type of the result).
6185
6186 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6187 indicates what is the actual level of parallelism (V8HI in the example), so
6188 that the right vectorization factor would be derived. This vectype
6189 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6190 be used to create the vectorized stmt. The right vectype for the vectorized
6191 stmt is obtained from the type of the result X:
6192 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6193
6194 This means that, contrary to "regular" reductions (or "regular" stmts in
6195 general), the following equation:
6196 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6197 does *NOT* necessarily hold for reduction patterns. */
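/* For instance (sketch), the short-into-int accumulation mentioned above,
     int sum = 0;
     for (i = 0; i < n; ++i) sum += (int) s[i];
   is recognized as a WIDEN_SUM_EXPR pattern: the vectype recorded on the
   pattern stmt for computing the vectorization factor is V8HI, while the
   vectorized stmt itself is created with the V4SI type of the result.  */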
6198
6199 bool
6200 vectorizable_reduction (loop_vec_info loop_vinfo,
6201 stmt_vec_info stmt_info, slp_tree slp_node,
6202 slp_instance slp_node_instance,
6203 stmt_vector_for_cost *cost_vec)
6204 {
6205 tree scalar_dest;
6206 tree vectype_in = NULL_TREE;
6207 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6208 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6209 stmt_vec_info cond_stmt_vinfo = NULL;
6210 tree scalar_type;
6211 int i;
6212 int ncopies;
6213 bool single_defuse_cycle = false;
6214 bool nested_cycle = false;
6215 bool double_reduc = false;
6216 int vec_num;
6217 tree tem;
6218 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6219 tree cond_reduc_val = NULL_TREE;
6220
6221 /* Make sure it was already recognized as a reduction computation. */
6222 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6223 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6224 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6225 return false;
6226
6227 /* The stmt we store reduction analysis meta on. */
6228 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6229 reduc_info->is_reduc_info = true;
6230
6231 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6232 {
6233 if (is_a <gphi *> (stmt_info->stmt))
6234 /* Analysis for double-reduction is done on the outer
6235 loop PHI, nested cycles have no further restrictions. */
6236 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6237 else
6238 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6239 return true;
6240 }
6241
6242 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6243 stmt_vec_info phi_info = stmt_info;
6244 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6245 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6246 {
6247 if (!is_a <gphi *> (stmt_info->stmt))
6248 {
6249 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6250 return true;
6251 }
6252 if (slp_node)
6253 {
6254 slp_node_instance->reduc_phis = slp_node;
6255 /* ??? We're leaving slp_node to point to the PHIs, we only
6256 need it to get at the number of vector stmts which wasn't
6257 yet initialized for the instance root. */
6258 }
6259 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6260 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6261 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6262 {
6263 use_operand_p use_p;
6264 gimple *use_stmt;
6265 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6266 &use_p, &use_stmt);
6267 gcc_assert (res);
6268 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6269 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6270 }
6271 }
6272
6273 /* PHIs should not participate in patterns. */
6274 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6275 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6276
6277 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6278 and compute the reduction chain length. */
6279 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6280 loop_latch_edge (loop));
6281 unsigned reduc_chain_length = 0;
6282 bool only_slp_reduc_chain = true;
6283 stmt_info = NULL;
6284 while (reduc_def != PHI_RESULT (reduc_def_phi))
6285 {
6286 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6287 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6288 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6289 {
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6292 "reduction chain broken by patterns.\n");
6293 return false;
6294 }
6295 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6296 only_slp_reduc_chain = false;
6297 /* ??? For epilogue generation live members of the chain need
6298 to point back to the PHI via their original stmt for
6299 info_for_reduction to work. */
6300 if (STMT_VINFO_LIVE_P (vdef))
6301 STMT_VINFO_REDUC_DEF (def) = phi_info;
6302 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6303 if (!assign)
6304 {
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6307 "reduction chain includes calls.\n");
6308 return false;
6309 }
6310 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6311 {
6312 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6313 TREE_TYPE (gimple_assign_rhs1 (assign))))
6314 {
6315 if (dump_enabled_p ())
6316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6317 "conversion in the reduction chain.\n");
6318 return false;
6319 }
6320 }
6321 else if (!stmt_info)
6322 /* First non-conversion stmt. */
6323 stmt_info = vdef;
6324 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6325 reduc_chain_length++;
6326 }
6327 /* PHIs should not participate in patterns. */
6328 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6329
6330 if (nested_in_vect_loop_p (loop, stmt_info))
6331 {
6332 loop = loop->inner;
6333 nested_cycle = true;
6334 }
6335
6336 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6337 element. */
6338 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6339 {
6340 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6341 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6342 }
6343 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6344 gcc_assert (slp_node
6345 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6346
6347 /* 1. Is vectorizable reduction? */
6348 /* Not supportable if the reduction variable is used in the loop, unless
6349 it's a reduction chain. */
6350 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6351 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6352 return false;
6353
6354 /* Reductions that are not used even in an enclosing outer-loop
6355 are expected to be "live" (used out of the loop). */
6356 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6357 && !STMT_VINFO_LIVE_P (stmt_info))
6358 return false;
6359
6360 /* 2. Has this been recognized as a reduction pattern?
6361
6362 Check if STMT represents a pattern that has been recognized
6363 in earlier analysis stages. For stmts that represent a pattern,
6364 the STMT_VINFO_RELATED_STMT field records the last stmt in
6365 the original sequence that constitutes the pattern. */
6366
6367 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6368 if (orig_stmt_info)
6369 {
6370 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6371 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6372 }
6373
6374 /* 3. Check the operands of the operation. The first operands are defined
6375 inside the loop body. The last operand is the reduction variable,
6376 which is defined by the loop-header-phi. */
6377
6378 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6379 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6380 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6381 enum tree_code code = gimple_assign_rhs_code (stmt);
6382 bool lane_reduc_code_p
6383 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6384 int op_type = TREE_CODE_LENGTH (code);
6385
6386 scalar_dest = gimple_assign_lhs (stmt);
6387 scalar_type = TREE_TYPE (scalar_dest);
6388 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6389 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6390 return false;
6391
6392 /* Do not try to vectorize bit-precision reductions. */
6393 if (!type_has_mode_precision_p (scalar_type))
6394 return false;
6395
6396 /* For lane-reducing ops we're reducing the number of reduction PHIs
6397 which means the only use of that may be in the lane-reducing operation. */
6398 if (lane_reduc_code_p
6399 && reduc_chain_length != 1
6400 && !only_slp_reduc_chain)
6401 {
6402 if (dump_enabled_p ())
6403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6404 "lane-reducing reduction with extra stmts.\n");
6405 return false;
6406 }
6407
6408 /* All uses but the last are expected to be defined in the loop.
6409 The last use is the reduction variable. In case of nested cycle this
6410 assumption is not true: we use reduc_index to record the index of the
6411 reduction variable. */
6412 /* ??? To get at invariant/constant uses on the SLP node we have to
6413 get to it here, slp_node is still the reduction PHI. */
6414 slp_tree slp_for_stmt_info = NULL;
6415 if (slp_node)
6416 {
6417 slp_for_stmt_info = slp_node_instance->root;
6418 /* And then there's reduction chain with a conversion ... */
6419 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6420 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6421 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6422 }
6423 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6424 /* We need to skip an extra operand for COND_EXPRs with embedded
6425 comparison. */
6426 unsigned opno_adjust = 0;
6427 if (code == COND_EXPR
6428 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6429 opno_adjust = 1;
6430 for (i = 0; i < op_type; i++)
6431 {
6432 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6433 if (i == 0 && code == COND_EXPR)
6434 continue;
6435
6436 stmt_vec_info def_stmt_info;
6437 enum vect_def_type dt;
6438 tree op;
6439 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6440 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6441 &def_stmt_info))
6442 {
6443 if (dump_enabled_p ())
6444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6445 "use not simple.\n");
6446 return false;
6447 }
6448 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6449 continue;
6450
6451 /* There should be only one cycle def in the stmt, the one
6452 leading to reduc_def. */
6453 if (VECTORIZABLE_CYCLE_DEF (dt))
6454 return false;
6455
6456 /* To properly compute ncopies we are interested in the widest
6457 non-reduction input type in case we're looking at a widening
6458 accumulation that we later handle in vect_transform_reduction. */
6459 if (lane_reduc_code_p
6460 && tem
6461 && (!vectype_in
6462 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6463 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6464 vectype_in = tem;
6465
6466 if (code == COND_EXPR)
6467 {
6468 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6469 if (dt == vect_constant_def)
6470 {
6471 cond_reduc_dt = dt;
6472 cond_reduc_val = op;
6473 }
6474 if (dt == vect_induction_def
6475 && def_stmt_info
6476 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6477 {
6478 cond_reduc_dt = dt;
6479 cond_stmt_vinfo = def_stmt_info;
6480 }
6481 }
6482 }
6483 if (!vectype_in)
6484 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6485 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6486
6487 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6488 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6489 /* If we have a condition reduction, see if we can simplify it further. */
6490 if (v_reduc_type == COND_REDUCTION)
6491 {
6492 if (slp_node)
6493 return false;
6494
6495 	     /* If the condition itself uses the reduction value, fail.  */
6496 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6497 {
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6500 "condition depends on previous iteration\n");
6501 return false;
6502 }
6503
6504 if (reduc_chain_length == 1
6505 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6506 vectype_in, OPTIMIZE_FOR_SPEED))
6507 {
6508 if (dump_enabled_p ())
6509 	 	     dump_printf_loc (MSG_NOTE, vect_location,
6510 "optimizing condition reduction with"
6511 " FOLD_EXTRACT_LAST.\n");
6512 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6513 }
6514 else if (cond_reduc_dt == vect_induction_def)
6515 {
6516 tree base
6517 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6518 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6519
6520 gcc_assert (TREE_CODE (base) == INTEGER_CST
6521 && TREE_CODE (step) == INTEGER_CST);
6522 cond_reduc_val = NULL_TREE;
6523 enum tree_code cond_reduc_op_code = ERROR_MARK;
6524 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6525 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6526 ;
6527 	 	  /* Find a suitable value: below base for MAX_EXPR, above base
6528 	 	     for MIN_EXPR; for now punt if base is the minimum value of
6529 	 	     the type for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
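	 	  /* E.g. for a decreasing induction with base 5 we use MIN_EXPR
	 	     with sentinel value 6; a final reduction result of 6 then
	 	     means no element matched the condition.  */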
6530 else if (tree_int_cst_sgn (step) == -1)
6531 {
6532 cond_reduc_op_code = MIN_EXPR;
6533 if (tree_int_cst_sgn (base) == -1)
6534 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6535 else if (tree_int_cst_lt (base,
6536 TYPE_MAX_VALUE (TREE_TYPE (base))))
6537 cond_reduc_val
6538 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6539 }
6540 else
6541 {
6542 cond_reduc_op_code = MAX_EXPR;
6543 if (tree_int_cst_sgn (base) == 1)
6544 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6545 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6546 base))
6547 cond_reduc_val
6548 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6549 }
6550 if (cond_reduc_val)
6551 {
6552 if (dump_enabled_p ())
6553 dump_printf_loc (MSG_NOTE, vect_location,
6554 "condition expression based on "
6555 "integer induction.\n");
6556 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6557 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6558 = cond_reduc_val;
6559 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6560 }
6561 }
6562 else if (cond_reduc_dt == vect_constant_def)
6563 {
6564 enum vect_def_type cond_initial_dt;
6565 tree cond_initial_val
6566 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6567
6568 gcc_assert (cond_reduc_val != NULL_TREE);
6569 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6570 if (cond_initial_dt == vect_constant_def
6571 && types_compatible_p (TREE_TYPE (cond_initial_val),
6572 TREE_TYPE (cond_reduc_val)))
6573 {
6574 tree e = fold_binary (LE_EXPR, boolean_type_node,
6575 cond_initial_val, cond_reduc_val);
6576 if (e && (integer_onep (e) || integer_zerop (e)))
6577 {
6578 if (dump_enabled_p ())
6579 dump_printf_loc (MSG_NOTE, vect_location,
6580 "condition expression based on "
6581 "compile time constant.\n");
6582 /* Record reduction code at analysis stage. */
6583 STMT_VINFO_REDUC_CODE (reduc_info)
6584 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6585 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6586 }
6587 }
6588 }
6589 }
6590
6591 if (STMT_VINFO_LIVE_P (phi_info))
6592 return false;
6593
6594 if (slp_node)
6595 ncopies = 1;
6596 else
6597 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6598
6599 gcc_assert (ncopies >= 1);
6600
6601 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6602
6603 if (nested_cycle)
6604 {
6605 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6606 == vect_double_reduction_def);
6607 double_reduc = true;
6608 }
6609
6610 /* 4.2. Check support for the epilog operation.
6611
6612 If STMT represents a reduction pattern, then the type of the
6613 	     reduction variable may be different from the type of the rest
6614 	     of the arguments.  For example, consider the case of accumulation
6615 	     of shorts into an int accumulator.  The original code:
6616 S1: int_a = (int) short_a;
6617 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6618
6619 was replaced with:
6620 STMT: int_acc = widen_sum <short_a, int_acc>
6621
6622 This means that:
6623 1. The tree-code that is used to create the vector operation in the
6624 epilog code (that reduces the partial results) is not the
6625 tree-code of STMT, but is rather the tree-code of the original
6626 stmt from the pattern that STMT is replacing. I.e, in the example
6627 above we want to use 'widen_sum' in the loop, but 'plus' in the
6628 epilog.
6629 2. The type (mode) we use to check available target support
6630 for the vector operation to be created in the *epilog*, is
6631 determined by the type of the reduction variable (in the example
6632 	           above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6633 However the type (mode) we use to check available target support
6634 for the vector operation to be created *inside the loop*, is
6635 determined by the type of the other arguments to STMT (in the
6636 example we'd check this: optab_handler (widen_sum_optab,
6637 vect_short_mode)).
6638
6639 This is contrary to "regular" reductions, in which the types of all
6640 the arguments are the same as the type of the reduction variable.
6641 For "regular" reductions we can therefore use the same vector type
6642 (and also the same tree-code) when generating the epilog code and
6643 when generating the code inside the loop. */
6644
6645 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6646 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6647
6648 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6649 if (reduction_type == TREE_CODE_REDUCTION)
6650 {
6651 /* Check whether it's ok to change the order of the computation.
6652 Generally, when vectorizing a reduction we change the order of the
6653 computation. This may change the behavior of the program in some
6654 cases, so we need to check that this is ok. One exception is when
6655 vectorizing an outer-loop: the inner-loop is executed sequentially,
6656 and therefore vectorizing reductions in the inner-loop during
6657 outer-loop vectorization is safe. */
6658 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6659 {
6660 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6661 	 	     is not directly used in stmt.  */
6662 if (!only_slp_reduc_chain
6663 && reduc_chain_length != 1)
6664 {
6665 if (dump_enabled_p ())
6666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6667 "in-order reduction chain without SLP.\n");
6668 return false;
6669 }
6670 STMT_VINFO_REDUC_TYPE (reduc_info)
6671 = reduction_type = FOLD_LEFT_REDUCTION;
6672 }
6673 else if (!commutative_tree_code (orig_code)
6674 || !associative_tree_code (orig_code))
6675 {
6676 if (dump_enabled_p ())
6677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6678 "reduction: not commutative/associative");
6679 return false;
6680 }
6681 }
6682
6683 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6684 && ncopies > 1)
6685 {
6686 if (dump_enabled_p ())
6687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6688 "multiple types in double reduction or condition "
6689 "reduction or fold-left reduction.\n");
6690 return false;
6691 }
6692
6693 internal_fn reduc_fn = IFN_LAST;
6694 if (reduction_type == TREE_CODE_REDUCTION
6695 || reduction_type == FOLD_LEFT_REDUCTION
6696 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6697 || reduction_type == CONST_COND_REDUCTION)
6698 {
6699 if (reduction_type == FOLD_LEFT_REDUCTION
6700 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6701 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6702 {
6703 if (reduc_fn != IFN_LAST
6704 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6705 OPTIMIZE_FOR_SPEED))
6706 {
6707 if (dump_enabled_p ())
6708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6709 "reduc op not supported by target.\n");
6710
6711 reduc_fn = IFN_LAST;
6712 }
6713 }
6714 else
6715 {
6716 if (!nested_cycle || double_reduc)
6717 {
6718 if (dump_enabled_p ())
6719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6720 "no reduc code for scalar code.\n");
6721
6722 return false;
6723 }
6724 }
6725 }
6726 else if (reduction_type == COND_REDUCTION)
6727 {
6728 int scalar_precision
6729 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6730 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6731 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6732 nunits_out);
6733
6734 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6735 OPTIMIZE_FOR_SPEED))
6736 reduc_fn = IFN_REDUC_MAX;
6737 }
6738 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6739
6740 if (reduction_type != EXTRACT_LAST_REDUCTION
6741 && (!nested_cycle || double_reduc)
6742 && reduc_fn == IFN_LAST
6743 && !nunits_out.is_constant ())
6744 {
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6747 "missing target support for reduction on"
6748 " variable-length vectors.\n");
6749 return false;
6750 }
6751
6752 /* For SLP reductions, see if there is a neutral value we can use. */
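	 /* E.g. 0 for PLUS_EXPR or BIT_IOR_EXPR, 1 for MULT_EXPR and an
	    all-ones constant for BIT_AND_EXPR.  */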
6753 tree neutral_op = NULL_TREE;
6754 if (slp_node)
6755 neutral_op = neutral_op_for_slp_reduction
6756 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6757 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6758
6759 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6760 {
6761 /* We can't support in-order reductions of code such as this:
6762
6763 for (int i = 0; i < n1; ++i)
6764 for (int j = 0; j < n2; ++j)
6765 l += a[j];
6766
6767 since GCC effectively transforms the loop when vectorizing:
6768
6769 for (int i = 0; i < n1 / VF; ++i)
6770 for (int j = 0; j < n2; ++j)
6771 for (int k = 0; k < VF; ++k)
6772 l += a[j];
6773
6774 which is a reassociation of the original operation. */
6775 if (dump_enabled_p ())
6776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6777 "in-order double reduction not supported.\n");
6778
6779 return false;
6780 }
6781
6782 if (reduction_type == FOLD_LEFT_REDUCTION
6783 && slp_node
6784 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6785 {
6786 /* We cannot use in-order reductions in this case because there is
6787 an implicit reassociation of the operations involved. */
6788 if (dump_enabled_p ())
6789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6790 "in-order unchained SLP reductions not supported.\n");
6791 return false;
6792 }
6793
6794 /* For double reductions, and for SLP reductions with a neutral value,
6795 we construct a variable-length initial vector by loading a vector
6796 full of the neutral value and then shift-and-inserting the start
6797 values into the low-numbered elements. */
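	 /* E.g. for a PLUS reduction with start value INIT this builds
	    { INIT, 0, 0, ... } by VEC_SHL_INSERT-ing INIT into a vector of
	    the neutral value 0.  */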
6798 if ((double_reduc || neutral_op)
6799 && !nunits_out.is_constant ()
6800 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6801 vectype_out, OPTIMIZE_FOR_SPEED))
6802 {
6803 if (dump_enabled_p ())
6804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6805 "reduction on variable-length vectors requires"
6806 " target support for a vector-shift-and-insert"
6807 " operation.\n");
6808 return false;
6809 }
6810
6811 /* Check extra constraints for variable-length unchained SLP reductions. */
6812 if (STMT_SLP_TYPE (stmt_info)
6813 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6814 && !nunits_out.is_constant ())
6815 {
6816 /* We checked above that we could build the initial vector when
6817 there's a neutral element value. Check here for the case in
6818 which each SLP statement has its own initial value and in which
6819 that value needs to be repeated for every instance of the
6820 statement within the initial vector. */
6821 unsigned int group_size = SLP_TREE_LANES (slp_node);
6822 if (!neutral_op
6823 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6824 TREE_TYPE (vectype_out)))
6825 {
6826 if (dump_enabled_p ())
6827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6828 "unsupported form of SLP reduction for"
6829 " variable-length vectors: cannot build"
6830 " initial vector.\n");
6831 return false;
6832 }
6833 /* The epilogue code relies on the number of elements being a multiple
6834 of the group size. The duplicate-and-interleave approach to setting
6835 up the initial vector does too. */
6836 if (!multiple_p (nunits_out, group_size))
6837 {
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6840 "unsupported form of SLP reduction for"
6841 " variable-length vectors: the vector size"
6842 " is not a multiple of the number of results.\n");
6843 return false;
6844 }
6845 }
6846
6847 if (reduction_type == COND_REDUCTION)
6848 {
6849 widest_int ni;
6850
6851 if (! max_loop_iterations (loop, &ni))
6852 {
6853 if (dump_enabled_p ())
6854 	 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6855 "loop count not known, cannot create cond "
6856 "reduction.\n");
6857 return false;
6858 }
6859 /* Convert backedges to iterations. */
6860 ni += 1;
6861
6862 	       /* The additional index will be the same type as the condition.  Check
6863 	          that the iteration count fits into this type less one (because
6864 	          we use up the zero slot for when there are no matches).  */
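	       /* E.g. with an unsigned short index type at most 65534 iterations
	          can be handled, since index 0 is reserved for the no-match case.  */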
6865 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6866 if (wi::geu_p (ni, wi::to_widest (max_index)))
6867 {
6868 if (dump_enabled_p ())
6869 	 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6870 "loop size is greater than data size.\n");
6871 return false;
6872 }
6873 }
6874
6875 /* In case the vectorization factor (VF) is bigger than the number
6876 of elements that we can fit in a vectype (nunits), we have to generate
6877 more than one vector stmt - i.e - we need to "unroll" the
6878 vector stmt by a factor VF/nunits. For more details see documentation
6879 in vectorizable_operation. */
6880
6881 /* If the reduction is used in an outer loop we need to generate
6882 VF intermediate results, like so (e.g. for ncopies=2):
6883 r0 = phi (init, r0)
6884 r1 = phi (init, r1)
6885 r0 = x0 + r0;
6886 r1 = x1 + r1;
6887 (i.e. we generate VF results in 2 registers).
6888 In this case we have a separate def-use cycle for each copy, and therefore
6889 for each copy we get the vector def for the reduction variable from the
6890 respective phi node created for this copy.
6891
6892 Otherwise (the reduction is unused in the loop nest), we can combine
6893 together intermediate results, like so (e.g. for ncopies=2):
6894 r = phi (init, r)
6895 r = x0 + r;
6896 r = x1 + r;
6897 (i.e. we generate VF/2 results in a single register).
6898 In this case for each copy we get the vector def for the reduction variable
6899 from the vectorized reduction operation generated in the previous iteration.
6900
6901 This only works when we see both the reduction PHI and its only consumer
6902 in vectorizable_reduction and there are no intermediate stmts
6903 participating. */
6904 if (ncopies > 1
6905 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6906 && reduc_chain_length == 1)
6907 single_defuse_cycle = true;
6908
6909 if (single_defuse_cycle || lane_reduc_code_p)
6910 {
6911 gcc_assert (code != COND_EXPR);
6912
6913 /* 4. Supportable by target? */
6914 bool ok = true;
6915
6916 /* 4.1. check support for the operation in the loop */
6917 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6918 if (!optab)
6919 {
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "no optab.\n");
6923 ok = false;
6924 }
6925
6926 machine_mode vec_mode = TYPE_MODE (vectype_in);
6927 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6928 {
6929 if (dump_enabled_p ())
6930 dump_printf (MSG_NOTE, "op not supported by target.\n");
6931 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6932 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6933 ok = false;
6934 else
6935 if (dump_enabled_p ())
6936 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6937 }
6938
6939 /* Worthwhile without SIMD support? */
6940 if (ok
6941 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6942 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6943 {
6944 if (dump_enabled_p ())
6945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6946 "not worthwhile without SIMD support.\n");
6947 ok = false;
6948 }
6949
6950 	       /* Lane-reducing operations have to go through vect_transform_reduction.
6951 	          For the other cases try without the single cycle optimization.  */
6952 if (!ok)
6953 {
6954 if (lane_reduc_code_p)
6955 return false;
6956 else
6957 single_defuse_cycle = false;
6958 }
6959 }
6960 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6961
6962 /* If the reduction stmt is one of the patterns that have lane
6963 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6964 if ((ncopies > 1 && ! single_defuse_cycle)
6965 && lane_reduc_code_p)
6966 {
6967 if (dump_enabled_p ())
6968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6969 "multi def-use cycle not possible for lane-reducing "
6970 "reduction operation\n");
6971 return false;
6972 }
6973
6974 if (slp_node
6975 && !(!single_defuse_cycle
6976 && code != DOT_PROD_EXPR
6977 && code != WIDEN_SUM_EXPR
6978 && code != SAD_EXPR
6979 && reduction_type != FOLD_LEFT_REDUCTION))
6980 for (i = 0; i < op_type; i++)
6981 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6982 {
6983 if (dump_enabled_p ())
6984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6985 "incompatible vector types for invariants\n");
6986 return false;
6987 }
6988
6989 if (slp_node)
6990 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6991 else
6992 vec_num = 1;
6993
6994 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
6995 reduction_type, ncopies, cost_vec);
6996 if (dump_enabled_p ()
6997 && reduction_type == FOLD_LEFT_REDUCTION)
6998 dump_printf_loc (MSG_NOTE, vect_location,
6999 "using an in-order (fold-left) reduction.\n");
7000 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7001 	   /* All reductions except single-defuse-cycle optimized, lane-reducing
7002 	      and fold-left ones go through their own vectorizable_* routines.  */
7003 if (!single_defuse_cycle
7004 && code != DOT_PROD_EXPR
7005 && code != WIDEN_SUM_EXPR
7006 && code != SAD_EXPR
7007 && reduction_type != FOLD_LEFT_REDUCTION)
7008 {
7009 stmt_vec_info tem
7010 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7011 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7012 {
7013 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7014 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7015 }
7016 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7017 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7018 }
7019 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7020 {
7021 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7022 internal_fn cond_fn = get_conditional_internal_fn (code);
7023
7024 if (reduction_type != FOLD_LEFT_REDUCTION
7025 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7026 && (cond_fn == IFN_LAST
7027 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7028 OPTIMIZE_FOR_SPEED)))
7029 {
7030 if (dump_enabled_p ())
7031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7032 "can't operate on partial vectors because"
7033 " no conditional operation is available.\n");
7034 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7035 }
7036 else if (reduction_type == FOLD_LEFT_REDUCTION
7037 && reduc_fn == IFN_LAST
7038 && !expand_vec_cond_expr_p (vectype_in,
7039 truth_type_for (vectype_in),
7040 SSA_NAME))
7041 {
7042 if (dump_enabled_p ())
7043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7044 "can't operate on partial vectors because"
7045 " no conditional operation is available.\n");
7046 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7047 }
7048 else
7049 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7050 vectype_in, NULL);
7051 }
7052 return true;
7053 }
7054
7055 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7056 value. */
7057
7058 bool
7059 vect_transform_reduction (loop_vec_info loop_vinfo,
7060 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7061 gimple **vec_stmt, slp_tree slp_node)
7062 {
7063 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7064 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7065 int i;
7066 int ncopies;
7067 int vec_num;
7068
7069 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7070 gcc_assert (reduc_info->is_reduc_info);
7071
7072 if (nested_in_vect_loop_p (loop, stmt_info))
7073 {
7074 loop = loop->inner;
7075 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7076 }
7077
7078 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7079 enum tree_code code = gimple_assign_rhs_code (stmt);
7080 int op_type = TREE_CODE_LENGTH (code);
7081
7082 /* Flatten RHS. */
7083 tree ops[3];
7084 switch (get_gimple_rhs_class (code))
7085 {
7086 case GIMPLE_TERNARY_RHS:
7087 ops[2] = gimple_assign_rhs3 (stmt);
7088 /* Fall thru. */
7089 case GIMPLE_BINARY_RHS:
7090 ops[0] = gimple_assign_rhs1 (stmt);
7091 ops[1] = gimple_assign_rhs2 (stmt);
7092 break;
7093 default:
7094 gcc_unreachable ();
7095 }
7096
7097 /* All uses but the last are expected to be defined in the loop.
7098 The last use is the reduction variable. In case of nested cycle this
7099 assumption is not true: we use reduc_index to record the index of the
7100 reduction variable. */
7101 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7102 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7103 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7104 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7105
7106 if (slp_node)
7107 {
7108 ncopies = 1;
7109 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7110 }
7111 else
7112 {
7113 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7114 vec_num = 1;
7115 }
7116
7117 internal_fn cond_fn = get_conditional_internal_fn (code);
7118 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7119 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7120
7121 /* Transform. */
7122 tree new_temp = NULL_TREE;
7123 auto_vec<tree> vec_oprnds0;
7124 auto_vec<tree> vec_oprnds1;
7125 auto_vec<tree> vec_oprnds2;
7126 tree def0;
7127
7128 if (dump_enabled_p ())
7129 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7130
7131 /* FORNOW: Multiple types are not supported for condition. */
7132 if (code == COND_EXPR)
7133 gcc_assert (ncopies == 1);
7134
7135 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7136
7137 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7138 if (reduction_type == FOLD_LEFT_REDUCTION)
7139 {
7140 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7141 return vectorize_fold_left_reduction
7142 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7143 reduc_fn, ops, vectype_in, reduc_index, masks);
7144 }
7145
7146 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7147 gcc_assert (single_defuse_cycle
7148 || code == DOT_PROD_EXPR
7149 || code == WIDEN_SUM_EXPR
7150 || code == SAD_EXPR);
7151
7152 /* Create the destination vector */
7153 tree scalar_dest = gimple_assign_lhs (stmt);
7154 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7155
7156 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7157 single_defuse_cycle && reduc_index == 0
7158 ? NULL_TREE : ops[0], &vec_oprnds0,
7159 single_defuse_cycle && reduc_index == 1
7160 ? NULL_TREE : ops[1], &vec_oprnds1,
7161 op_type == ternary_op
7162 && !(single_defuse_cycle && reduc_index == 2)
7163 ? ops[2] : NULL_TREE, &vec_oprnds2);
7164 if (single_defuse_cycle)
7165 {
7166 gcc_assert (!slp_node);
7167 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7168 ops[reduc_index],
7169 reduc_index == 0 ? &vec_oprnds0
7170 : (reduc_index == 1 ? &vec_oprnds1
7171 : &vec_oprnds2));
7172 }
7173
7174 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7175 {
7176 gimple *new_stmt;
7177 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7178 if (masked_loop_p && !mask_by_cond_expr)
7179 {
7180 /* Make sure that the reduction accumulator is vop[0]. */
7181 if (reduc_index == 1)
7182 {
7183 gcc_assert (commutative_tree_code (code));
7184 std::swap (vop[0], vop[1]);
7185 }
7186 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7187 vectype_in, i);
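	  /* In lanes where the mask is set the result is vop[0] CODE vop[1];
	     inactive lanes keep the accumulator value vop[0] unchanged.  */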
7188 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7189 vop[0], vop[1], vop[0]);
7190 new_temp = make_ssa_name (vec_dest, call);
7191 gimple_call_set_lhs (call, new_temp);
7192 gimple_call_set_nothrow (call, true);
7193 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7194 new_stmt = call;
7195 }
7196 else
7197 {
7198 if (op_type == ternary_op)
7199 vop[2] = vec_oprnds2[i];
7200
7201 if (masked_loop_p && mask_by_cond_expr)
7202 {
7203 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7204 vectype_in, i);
7205 build_vect_cond_expr (code, vop, mask, gsi);
7206 }
7207
7208 new_stmt = gimple_build_assign (vec_dest, code,
7209 vop[0], vop[1], vop[2]);
7210 new_temp = make_ssa_name (vec_dest, new_stmt);
7211 gimple_assign_set_lhs (new_stmt, new_temp);
7212 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7213 }
7214
7215 if (slp_node)
7216 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7217 else if (single_defuse_cycle
7218 && i < ncopies - 1)
7219 {
7220 if (reduc_index == 0)
7221 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7222 else if (reduc_index == 1)
7223 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7224 else if (reduc_index == 2)
7225 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7226 }
7227 else
7228 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7229 }
7230
7231 if (!slp_node)
7232 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7233
7234 return true;
7235 }
7236
7237 /* Transform phase of a cycle PHI. */
7238
7239 bool
7240 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7241 stmt_vec_info stmt_info, gimple **vec_stmt,
7242 slp_tree slp_node, slp_instance slp_node_instance)
7243 {
7244 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7245 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7246 int i;
7247 int ncopies;
7248 int j;
7249 bool nested_cycle = false;
7250 int vec_num;
7251
7252 if (nested_in_vect_loop_p (loop, stmt_info))
7253 {
7254 loop = loop->inner;
7255 nested_cycle = true;
7256 }
7257
7258 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7259 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7260 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7261 gcc_assert (reduc_info->is_reduc_info);
7262
7263 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7264 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7265 /* Leave the scalar phi in place. */
7266 return true;
7267
7268 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7269 /* For a nested cycle we do not fill the above. */
7270 if (!vectype_in)
7271 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7272 gcc_assert (vectype_in);
7273
7274 if (slp_node)
7275 {
7276 /* The size vect_schedule_slp_instance computes is off for us. */
7277 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7278 * SLP_TREE_LANES (slp_node), vectype_in);
7279 ncopies = 1;
7280 }
7281 else
7282 {
7283 vec_num = 1;
7284 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7285 }
7286
7287 /* Check whether we should use a single PHI node and accumulate
7288 vectors to one before the backedge. */
7289 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7290 ncopies = 1;
7291
7292 /* Create the destination vector */
7293 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7294 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7295 vectype_out);
7296
7297 /* Get the loop-entry arguments. */
7298 tree vec_initial_def;
7299 auto_vec<tree> vec_initial_defs;
7300 if (slp_node)
7301 {
7302 vec_initial_defs.reserve (vec_num);
7303 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7304 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7305 tree neutral_op
7306 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7307 STMT_VINFO_REDUC_CODE (reduc_info),
7308 first != NULL);
7309 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7310 &vec_initial_defs, vec_num,
7311 first != NULL, neutral_op);
7312 }
7313 else
7314 {
7315 /* Get at the scalar def before the loop, that defines the initial
7316 value of the reduction variable. */
7317 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7318 loop_preheader_edge (loop));
7319 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7320 and we can't use zero for induc_val, use initial_def. Similarly
7321 for REDUC_MIN and initial_def larger than the base. */
7322 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7323 {
7324 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7325 if (TREE_CODE (initial_def) == INTEGER_CST
7326 && !integer_zerop (induc_val)
7327 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7328 && tree_int_cst_lt (initial_def, induc_val))
7329 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7330 && tree_int_cst_lt (induc_val, initial_def))))
7331 {
7332 induc_val = initial_def;
7333 	 	      /* Communicate to epilogue generation that we used the
7334 	 	         initial_def.  */
7335 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7336 }
7337 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7338 vec_initial_defs.create (ncopies);
7339 for (i = 0; i < ncopies; ++i)
7340 vec_initial_defs.quick_push (vec_initial_def);
7341 }
7342 else if (nested_cycle)
7343 {
7344 /* Do not use an adjustment def as that case is not supported
7345 correctly if ncopies is not one. */
7346 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7347 ncopies, initial_def,
7348 &vec_initial_defs);
7349 }
7350 else
7351 {
7352 tree adjustment_def = NULL_TREE;
7353 tree *adjustment_defp = &adjustment_def;
7354 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7355 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7356 adjustment_defp = NULL;
7357 vec_initial_def
7358 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7359 initial_def, adjustment_defp);
7360 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7361 vec_initial_defs.create (ncopies);
7362 for (i = 0; i < ncopies; ++i)
7363 vec_initial_defs.quick_push (vec_initial_def);
7364 }
7365 }
7366
7367 /* Generate the reduction PHIs upfront. */
7368 for (i = 0; i < vec_num; i++)
7369 {
7370 tree vec_init_def = vec_initial_defs[i];
7371 for (j = 0; j < ncopies; j++)
7372 {
7373 /* Create the reduction-phi that defines the reduction
7374 operand. */
7375 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7376
7377 /* Set the loop-entry arg of the reduction-phi. */
7378 if (j != 0 && nested_cycle)
7379 vec_init_def = vec_initial_defs[j];
7380 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7381 UNKNOWN_LOCATION);
7382
7383 /* The loop-latch arg is set in epilogue processing. */
7384
7385 if (slp_node)
7386 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7387 else
7388 {
7389 if (j == 0)
7390 *vec_stmt = new_phi;
7391 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7392 }
7393 }
7394 }
7395
7396 return true;
7397 }
7398
7399 	 /* Vectorizes LC (loop-closed SSA) PHIs.  */
7400
7401 bool
7402 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7403 stmt_vec_info stmt_info, gimple **vec_stmt,
7404 slp_tree slp_node)
7405 {
7406 if (!loop_vinfo
7407 || !is_a <gphi *> (stmt_info->stmt)
7408 || gimple_phi_num_args (stmt_info->stmt) != 1)
7409 return false;
7410
7411 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7412 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7413 return false;
7414
7415 if (!vec_stmt) /* transformation not required. */
7416 {
7417 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7418 return true;
7419 }
7420
7421 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7422 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7423 basic_block bb = gimple_bb (stmt_info->stmt);
7424 edge e = single_pred_edge (bb);
7425 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7426 auto_vec<tree> vec_oprnds;
7427 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7428 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7429 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7430 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7431 {
7432 /* Create the vectorized LC PHI node. */
7433 gphi *new_phi = create_phi_node (vec_dest, bb);
7434 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7435 if (slp_node)
7436 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7437 else
7438 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7439 }
7440 if (!slp_node)
7441 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7442
7443 return true;
7444 }
7445
7446
7447 /* Function vect_min_worthwhile_factor.
7448
7449 For a loop where we could vectorize the operation indicated by CODE,
7450 return the minimum vectorization factor that makes it worthwhile
7451 to use generic vectors. */
7452 static unsigned int
7453 vect_min_worthwhile_factor (enum tree_code code)
7454 {
7455 switch (code)
7456 {
7457 case PLUS_EXPR:
7458 case MINUS_EXPR:
7459 case NEGATE_EXPR:
7460 return 4;
7461
7462 case BIT_AND_EXPR:
7463 case BIT_IOR_EXPR:
7464 case BIT_XOR_EXPR:
7465 case BIT_NOT_EXPR:
7466 return 2;
7467
7468 default:
7469 return INT_MAX;
7470 }
7471 }
7472
7473 /* Return true if VINFO indicates we are doing loop vectorization and if
7474 it is worth decomposing CODE operations into scalar operations for
7475 that loop's vectorization factor. */
7476
7477 bool
7478 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7479 {
7480 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7481 unsigned HOST_WIDE_INT value;
7482 return (loop_vinfo
7483 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7484 && value >= vect_min_worthwhile_factor (code));
7485 }
7486
7487 /* Function vectorizable_induction
7488
7489 Check if STMT_INFO performs an induction computation that can be vectorized.
7490 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7491 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7492 Return true if STMT_INFO is vectorizable in this way. */
7493
7494 bool
7495 vectorizable_induction (loop_vec_info loop_vinfo,
7496 stmt_vec_info stmt_info,
7497 gimple **vec_stmt, slp_tree slp_node,
7498 stmt_vector_for_cost *cost_vec)
7499 {
7500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7501 unsigned ncopies;
7502 bool nested_in_vect_loop = false;
7503 class loop *iv_loop;
7504 tree vec_def;
7505 edge pe = loop_preheader_edge (loop);
7506 basic_block new_bb;
7507 tree new_vec, vec_init, vec_step, t;
7508 tree new_name;
7509 gimple *new_stmt;
7510 gphi *induction_phi;
7511 tree induc_def, vec_dest;
7512 tree init_expr, step_expr;
7513 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7514 unsigned i;
7515 tree expr;
7516 gimple_seq stmts;
7517 gimple_stmt_iterator si;
7518
7519 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7520 if (!phi)
7521 return false;
7522
7523 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7524 return false;
7525
7526 /* Make sure it was recognized as induction computation. */
7527 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7528 return false;
7529
7530 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7531 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7532
7533 if (slp_node)
7534 ncopies = 1;
7535 else
7536 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7537 gcc_assert (ncopies >= 1);
7538
7539 /* FORNOW. These restrictions should be relaxed. */
7540 if (nested_in_vect_loop_p (loop, stmt_info))
7541 {
7542 imm_use_iterator imm_iter;
7543 use_operand_p use_p;
7544 gimple *exit_phi;
7545 edge latch_e;
7546 tree loop_arg;
7547
7548 if (ncopies > 1)
7549 {
7550 if (dump_enabled_p ())
7551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7552 "multiple types in nested loop.\n");
7553 return false;
7554 }
7555
7556 /* FORNOW: outer loop induction with SLP not supported. */
7557 if (STMT_SLP_TYPE (stmt_info))
7558 return false;
7559
7560 exit_phi = NULL;
7561 latch_e = loop_latch_edge (loop->inner);
7562 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7564 {
7565 gimple *use_stmt = USE_STMT (use_p);
7566 if (is_gimple_debug (use_stmt))
7567 continue;
7568
7569 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7570 {
7571 exit_phi = use_stmt;
7572 break;
7573 }
7574 }
7575 if (exit_phi)
7576 {
7577 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7578 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7579 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7580 {
7581 if (dump_enabled_p ())
7582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7583 "inner-loop induction only used outside "
7584 "of the outer vectorized loop.\n");
7585 return false;
7586 }
7587 }
7588
7589 nested_in_vect_loop = true;
7590 iv_loop = loop->inner;
7591 }
7592 else
7593 iv_loop = loop;
7594 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7595
7596 if (slp_node && !nunits.is_constant ())
7597 {
7598 /* The current SLP code creates the initial value element-by-element. */
7599 if (dump_enabled_p ())
7600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7601 "SLP induction not supported for variable-length"
7602 " vectors.\n");
7603 return false;
7604 }
7605
7606 if (!vec_stmt) /* transformation not required. */
7607 {
7608 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7609 DUMP_VECT_SCOPE ("vectorizable_induction");
7610 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7611 return true;
7612 }
7613
7614 /* Transform. */
7615
7616 /* Compute a vector variable, initialized with the first VF values of
7617 the induction variable. E.g., for an iv with IV_PHI='X' and
7618 evolution S, for a vector of 4 units, we want to compute:
7619 [X, X + S, X + 2*S, X + 3*S]. */
7620
7621 if (dump_enabled_p ())
7622 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7623
7624 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7625 gcc_assert (step_expr != NULL_TREE);
7626 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7627
7628 pe = loop_preheader_edge (iv_loop);
7629 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7630 loop_preheader_edge (iv_loop));
7631
7632 stmts = NULL;
7633 if (!nested_in_vect_loop)
7634 {
7635 /* Convert the initial value to the IV update type. */
7636 tree new_type = TREE_TYPE (step_expr);
7637 init_expr = gimple_convert (&stmts, new_type, init_expr);
7638
7639 /* If we are using the loop mask to "peel" for alignment then we need
7640 to adjust the start value here. */
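	 /* The first skip_niters lanes of the first vector iteration are
	    masked out, so the IV is stepped back by skip_niters * step and
	    the first active lane then sees the original initial value.  */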
7641 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7642 if (skip_niters != NULL_TREE)
7643 {
7644 if (FLOAT_TYPE_P (vectype))
7645 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7646 skip_niters);
7647 else
7648 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7649 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7650 skip_niters, step_expr);
7651 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7652 init_expr, skip_step);
7653 }
7654 }
7655
7656 if (stmts)
7657 {
7658 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7659 gcc_assert (!new_bb);
7660 }
7661
7662 /* Find the first insertion point in the BB. */
7663 basic_block bb = gimple_bb (phi);
7664 si = gsi_after_labels (bb);
7665
7666 /* For SLP induction we have to generate several IVs as for example
7667 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7668 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7669 [VF*S, VF*S, VF*S, VF*S] for all. */
7670 if (slp_node)
7671 {
7672 /* Enforced above. */
7673 unsigned int const_nunits = nunits.to_constant ();
7674
7675 /* Generate [VF*S, VF*S, ... ]. */
7676 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7677 {
7678 expr = build_int_cst (integer_type_node, vf);
7679 expr = fold_convert (TREE_TYPE (step_expr), expr);
7680 }
7681 else
7682 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7683 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7684 expr, step_expr);
7685 if (! CONSTANT_CLASS_P (new_name))
7686 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7687 TREE_TYPE (step_expr), NULL);
7688 new_vec = build_vector_from_val (step_vectype, new_name);
7689 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7690 new_vec, step_vectype, NULL);
7691
7692 /* Now generate the IVs. */
7693 unsigned group_size = SLP_TREE_LANES (slp_node);
7694 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7695 unsigned elts = const_nunits * nvects;
7696 /* Compute the number of distinct IVs we need. First reduce
7697 group_size if it is a multiple of const_nunits so we get
7698 one IV for a group_size of 4 but const_nunits 2. */
7699 unsigned group_sizep = group_size;
7700 if (group_sizep % const_nunits == 0)
7701 group_sizep = group_sizep / const_nunits;
7702 unsigned nivs = least_common_multiple (group_sizep,
7703 const_nunits) / const_nunits;
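	 /* E.g. group_size 3 with const_nunits 4 yields nivs == 3 (twelve
	    lanes covering four group instances), while group_size 4 with
	    const_nunits 2 needs just one IV.  */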
7704 gcc_assert (elts % group_size == 0);
7705 tree elt = init_expr;
7706 unsigned ivn;
7707 for (ivn = 0; ivn < nivs; ++ivn)
7708 {
7709 tree_vector_builder elts (step_vectype, const_nunits, 1);
7710 stmts = NULL;
7711 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7712 {
7713 if (ivn*const_nunits + eltn >= group_size
7714 && (ivn * const_nunits + eltn) % group_size == 0)
7715 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7716 elt, step_expr);
7717 elts.quick_push (elt);
7718 }
7719 vec_init = gimple_build_vector (&stmts, &elts);
7720 vec_init = gimple_convert (&stmts, vectype, vec_init);
7721 if (stmts)
7722 {
7723 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7724 gcc_assert (!new_bb);
7725 }
7726
7727 /* Create the induction-phi that defines the induction-operand. */
7728 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7729 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7730 induc_def = PHI_RESULT (induction_phi);
7731
7732 /* Create the iv update inside the loop */
7733 gimple_seq stmts = NULL;
7734 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7735 vec_def = gimple_build (&stmts,
7736 PLUS_EXPR, step_vectype, vec_def, vec_step);
7737 vec_def = gimple_convert (&stmts, vectype, vec_def);
7738 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7739
7740 /* Set the arguments of the phi node: */
7741 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7742 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7743 UNKNOWN_LOCATION);
7744
7745 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7746 }
7747 /* Fill up to the number of vectors we need for the whole group. */
7748 nivs = least_common_multiple (group_size,
7749 const_nunits) / const_nunits;
7750 for (; ivn < nivs; ++ivn)
7751 SLP_TREE_VEC_STMTS (slp_node)
7752 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7753
7754 /* Re-use IVs when we can. */
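	 /* Each remaining vector equals the vector NIVS positions earlier
	    plus the constant offset VF'*S, so no further PHIs are needed.  */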
7755 if (ivn < nvects)
7756 {
7757 unsigned vfp
7758 = least_common_multiple (group_size, const_nunits) / group_size;
7759 /* Generate [VF'*S, VF'*S, ... ]. */
7760 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7761 {
7762 expr = build_int_cst (integer_type_node, vfp);
7763 expr = fold_convert (TREE_TYPE (step_expr), expr);
7764 }
7765 else
7766 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7767 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7768 expr, step_expr);
7769 if (! CONSTANT_CLASS_P (new_name))
7770 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7771 TREE_TYPE (step_expr), NULL);
7772 new_vec = build_vector_from_val (step_vectype, new_name);
7773 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7774 step_vectype, NULL);
7775 for (; ivn < nvects; ++ivn)
7776 {
7777 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7778 tree def;
7779 if (gimple_code (iv) == GIMPLE_PHI)
7780 def = gimple_phi_result (iv);
7781 else
7782 def = gimple_assign_lhs (iv);
7783 gimple_seq stmts = NULL;
7784 def = gimple_convert (&stmts, step_vectype, def);
7785 def = gimple_build (&stmts,
7786 PLUS_EXPR, step_vectype, def, vec_step);
7787 def = gimple_convert (&stmts, vectype, def);
7788 if (gimple_code (iv) == GIMPLE_PHI)
7789 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7790 else
7791 {
7792 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7793 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7794 }
7795 SLP_TREE_VEC_STMTS (slp_node)
7796 .quick_push (SSA_NAME_DEF_STMT (def));
7797 }
7798 }
7799
7800 return true;
7801 }
7802
7803 /* Create the vector that holds the initial_value of the induction. */
7804 if (nested_in_vect_loop)
7805 {
7806 	       /* iv_loop is nested in the loop to be vectorized.  init_expr has already
7807 	          been created during vectorization of previous stmts.  We obtain it
7808 	          from the STMT_VINFO_VEC_STMTS of the defining stmt.  */
7809 auto_vec<tree> vec_inits;
7810 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7811 init_expr, &vec_inits);
7812 vec_init = vec_inits[0];
7813 /* If the initial value is not of proper type, convert it. */
7814 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7815 {
7816 new_stmt
7817 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7818 vect_simple_var,
7819 "vec_iv_"),
7820 VIEW_CONVERT_EXPR,
7821 build1 (VIEW_CONVERT_EXPR, vectype,
7822 vec_init));
7823 vec_init = gimple_assign_lhs (new_stmt);
7824 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7825 new_stmt);
7826 gcc_assert (!new_bb);
7827 }
7828 }
7829 else
7830 {
7831 /* iv_loop is the loop to be vectorized. Create:
7832 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7833 stmts = NULL;
7834 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7835
7836 unsigned HOST_WIDE_INT const_nunits;
7837 if (nunits.is_constant (&const_nunits))
7838 {
7839 tree_vector_builder elts (step_vectype, const_nunits, 1);
7840 elts.quick_push (new_name);
7841 for (i = 1; i < const_nunits; i++)
7842 {
7843 /* Create: new_name_i = new_name + step_expr */
7844 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7845 new_name, step_expr);
7846 elts.quick_push (new_name);
7847 }
7848 /* Create a vector from [new_name_0, new_name_1, ...,
7849 new_name_nunits-1] */
7850 vec_init = gimple_build_vector (&stmts, &elts);
7851 }
7852 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7853 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7854 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7855 new_name, step_expr);
7856 else
7857 {
7858 /* Build:
7859 [base, base, base, ...]
7860 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7861 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7862 gcc_assert (flag_associative_math);
7863 tree index = build_index_vector (step_vectype, 0, 1);
7864 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7865 new_name);
7866 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7867 step_expr);
7868 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7869 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7870 vec_init, step_vec);
7871 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7872 vec_init, base_vec);
7873 }
7874 vec_init = gimple_convert (&stmts, vectype, vec_init);
7875
7876 if (stmts)
7877 {
7878 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7879 gcc_assert (!new_bb);
7880 }
7881 }
7882
7883
7884 /* Create the vector that holds the step of the induction. */
7885 if (nested_in_vect_loop)
7886 /* iv_loop is nested in the loop to be vectorized. Generate:
7887 vec_step = [S, S, S, S] */
7888 new_name = step_expr;
7889 else
7890 {
7891 /* iv_loop is the loop to be vectorized. Generate:
7892 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7893 gimple_seq seq = NULL;
7894 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7895 {
7896 expr = build_int_cst (integer_type_node, vf);
7897 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7898 }
7899 else
7900 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7901 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7902 expr, step_expr);
7903 if (seq)
7904 {
7905 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7906 gcc_assert (!new_bb);
7907 }
7908 }
7909
7910 t = unshare_expr (new_name);
7911 gcc_assert (CONSTANT_CLASS_P (new_name)
7912 || TREE_CODE (new_name) == SSA_NAME);
7913 new_vec = build_vector_from_val (step_vectype, t);
7914 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7915 new_vec, step_vectype, NULL);
7916
7917
7918 /* Create the following def-use cycle:
7919 loop prolog:
7920 vec_init = ...
7921 vec_step = ...
7922 loop:
7923 vec_iv = PHI <vec_init, vec_loop>
7924 ...
7925 STMT
7926 ...
7927 vec_loop = vec_iv + vec_step; */
7928
7929 /* Create the induction-phi that defines the induction-operand. */
7930 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7931 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7932 induc_def = PHI_RESULT (induction_phi);
7933
7934 /* Create the iv update inside the loop */
7935 stmts = NULL;
7936 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7937 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7938 vec_def = gimple_convert (&stmts, vectype, vec_def);
7939 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7940 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7941
7942 /* Set the arguments of the phi node: */
7943 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7944 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7945 UNKNOWN_LOCATION);
7946
7947 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7948 *vec_stmt = induction_phi;
7949
7950 /* In case that vectorization factor (VF) is bigger than the number
7951 of elements that we can fit in a vectype (nunits), we have to generate
7952 more than one vector stmt - i.e - we need to "unroll" the
7953 vector stmt by a factor VF/nunits. For more details see documentation
7954 in vectorizable_operation. */
7955
7956 if (ncopies > 1)
7957 {
7958 gimple_seq seq = NULL;
7959 /* FORNOW. This restriction should be relaxed. */
7960 gcc_assert (!nested_in_vect_loop);
7961
7962 /* Create the vector that holds the step of the induction. */
7963 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7964 {
7965 expr = build_int_cst (integer_type_node, nunits);
7966 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7967 }
7968 else
7969 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7970 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7971 expr, step_expr);
7972 if (seq)
7973 {
7974 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7975 gcc_assert (!new_bb);
7976 }
7977
7978 t = unshare_expr (new_name);
7979 gcc_assert (CONSTANT_CLASS_P (new_name)
7980 || TREE_CODE (new_name) == SSA_NAME);
7981 new_vec = build_vector_from_val (step_vectype, t);
7982 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7983 new_vec, step_vectype, NULL);
7984
7985 vec_def = induc_def;
7986 for (i = 1; i < ncopies; i++)
7987 {
7988 /* vec_i = vec_prev + vec_step */
7989 gimple_seq stmts = NULL;
7990 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7991 vec_def = gimple_build (&stmts,
7992 PLUS_EXPR, step_vectype, vec_def, vec_step);
7993 vec_def = gimple_convert (&stmts, vectype, vec_def);
7994
7995 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7996 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7997 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7998 }
7999 }
8000
8001 if (dump_enabled_p ())
8002 dump_printf_loc (MSG_NOTE, vect_location,
8003 "transform induction: created def-use cycle: %G%G",
8004 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8005
8006 return true;
8007 }
8008
8009 /* Function vectorizable_live_operation.
8010
8011 STMT_INFO computes a value that is used outside the loop. Check if
8012 it can be supported. */
8013
8014 bool
8015 vectorizable_live_operation (vec_info *vinfo,
8016 stmt_vec_info stmt_info,
8017 gimple_stmt_iterator *gsi,
8018 slp_tree slp_node, slp_instance slp_node_instance,
8019 int slp_index, bool vec_stmt_p,
8020 stmt_vector_for_cost *cost_vec)
8021 {
8022 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8023 imm_use_iterator imm_iter;
8024 tree lhs, lhs_type, bitsize, vec_bitsize;
8025 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8026 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8027 int ncopies;
8028 gimple *use_stmt;
8029 auto_vec<tree> vec_oprnds;
8030 int vec_entry = 0;
8031 poly_uint64 vec_index = 0;
8032
8033 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8034
8035 /* If a stmt of a reduction is live, vectorize it via
8036 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8037 validity so just trigger the transform here. */
8038 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8039 {
8040 if (!vec_stmt_p)
8041 return true;
8042 if (slp_node)
8043 {
8044 /* For reduction chains the meta-info is attached to
8045 the group leader. */
8046 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8047 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8048 /* For SLP reductions we vectorize the epilogue for
8049 all involved stmts together. */
8050 else if (slp_index != 0)
8051 return true;
8052 else
8053 /* For SLP reductions the meta-info is attached to
8054 the representative. */
8055 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8056 }
8057 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8058 gcc_assert (reduc_info->is_reduc_info);
8059 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8060 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8061 return true;
8062 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8063 slp_node_instance);
8064 return true;
8065 }
8066
8067 /* If STMT is not relevant and it is a simple assignment and its inputs are
8068 invariant then it can remain in place, unvectorized. The original last
8069 scalar value that it computes will be used. */
8070 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8071 {
8072 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8073 if (dump_enabled_p ())
8074 dump_printf_loc (MSG_NOTE, vect_location,
8075 "statement is simple and uses invariant. Leaving in "
8076 "place.\n");
8077 return true;
8078 }
8079
8080 if (slp_node)
8081 ncopies = 1;
8082 else
8083 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8084
8085 if (slp_node)
8086 {
8087 gcc_assert (slp_index >= 0);
8088
8089 /* Find the position of the last occurrence of this scalar lane in the
8090 concatenation of all the SLP vectors, then calculate which SLP vector
8091 holds it and the lane index within that vector. */
8092 int num_scalar = SLP_TREE_LANES (slp_node);
8093 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8094 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8095
8096 /* Calculate which vector contains the result, and which lane of
8097 that vector we need. */
8098 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8099 {
8100 if (dump_enabled_p ())
8101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8102 "Cannot determine which vector holds the"
8103 " final result.\n");
8104 return false;
8105 }
8106 }
8107
8108 if (!vec_stmt_p)
8109 {
8110 /* No transformation required. */
8111 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8112 {
8113 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8114 OPTIMIZE_FOR_SPEED))
8115 {
8116 if (dump_enabled_p ())
8117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8118 "can't operate on partial vectors "
8119 "because the target doesn't support extract "
8120 "last reduction.\n");
8121 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8122 }
8123 else if (slp_node)
8124 {
8125 if (dump_enabled_p ())
8126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8127 "can't operate on partial vectors "
8128 "because an SLP statement is live after "
8129 "the loop.\n");
8130 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8131 }
8132 else if (ncopies > 1)
8133 {
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8136 "can't operate on partial vectors "
8137 "because ncopies is greater than 1.\n");
8138 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8139 }
8140 else
8141 {
8142 gcc_assert (ncopies == 1 && !slp_node);
8143 vect_record_loop_mask (loop_vinfo,
8144 &LOOP_VINFO_MASKS (loop_vinfo),
8145 1, vectype, NULL);
8146 }
8147 }
8148 /* ??? Enable for loop costing as well. */
8149 if (!loop_vinfo)
8150 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8151 0, vect_epilogue);
8152 return true;
8153 }
8154
8155 /* Use the lhs of the original scalar statement. */
8156 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8157 if (dump_enabled_p ())
8158 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8159 "stmt %G", stmt);
8160
8161 lhs = gimple_get_lhs (stmt);
8162 lhs_type = TREE_TYPE (lhs);
8163
8164 bitsize = vector_element_bits_tree (vectype);
8165 vec_bitsize = TYPE_SIZE (vectype);
8166
8167 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8168 tree vec_lhs, bitstart;
8169 gimple *vec_stmt;
8170 if (slp_node)
8171 {
8172 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8173
8174 /* Get the correct slp vectorized stmt. */
8175 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8176 vec_lhs = gimple_get_lhs (vec_stmt);
8177
8178 /* Get entry to use. */
8179 bitstart = bitsize_int (vec_index);
8180 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8181 }
8182 else
8183 {
8184 /* For multiple copies, get the last copy. */
8185 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8186 vec_lhs = gimple_get_lhs (vec_stmt);
8187
8188 /* Get the last lane in the vector. */
8189 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8190 }
8191
8192 if (loop_vinfo)
8193 {
8194 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8195 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8196 loop;
8197 BB:
8198 # lhs' = PHI <lhs>
8199 ==>
8200 loop;
8201 BB:
8202 # vec_lhs' = PHI <vec_lhs>
8203 new_tree = lane_extract <vec_lhs', ...>;
8204 lhs' = new_tree; */
8205
8206 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8207 basic_block exit_bb = single_exit (loop)->dest;
8208 gcc_assert (single_pred_p (exit_bb));
8209
8210 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8211 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8212 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8213
8214 gimple_seq stmts = NULL;
8215 tree new_tree;
8216 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8217 {
8218 /* Emit:
8219
8220 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8221
8222 where VEC_LHS is the vectorized live-out result and MASK is
8223 the loop mask for the final iteration. */
8224 gcc_assert (ncopies == 1 && !slp_node);
8225 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8226 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8227 1, vectype, 0);
8228 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8229 mask, vec_lhs_phi);
8230
8231 /* Convert the extracted vector element to the scalar type. */
8232 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8233 }
8234 else
8235 {
8236 tree bftype = TREE_TYPE (vectype);
8237 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8238 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8239 new_tree = build3 (BIT_FIELD_REF, bftype,
8240 vec_lhs_phi, bitsize, bitstart);
8241 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8242 &stmts, true, NULL_TREE);
8243 }
8244
8245 if (stmts)
8246 {
8247 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8248 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8249
8250 /* Remove the existing PHI whose argument is LHS and assign NEW_TREE to its result instead. */
8251 tree lhs_phi = NULL_TREE;
8252 gimple_stmt_iterator gsi;
8253 for (gsi = gsi_start_phis (exit_bb);
8254 !gsi_end_p (gsi); gsi_next (&gsi))
8255 {
8256 gimple *phi = gsi_stmt (gsi);
8257 if (gimple_phi_arg_def (phi, 0) == lhs)
8258 {
8259 remove_phi_node (&gsi, false);
8260 lhs_phi = gimple_phi_result (phi);
8261 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8262 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8263 break;
8264 }
8265 }
8266 }
8267
8268 /* Replace uses of LHS with the newly computed result. If the use stmt
8269 is a single-argument PHI, replace all uses of the PHI result instead,
8270 since the LCSSA PHI defining LHS may appear before the new stmt. */
8271 use_operand_p use_p;
8272 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8273 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8274 && !is_gimple_debug (use_stmt))
8275 {
8276 if (gimple_code (use_stmt) == GIMPLE_PHI
8277 && gimple_phi_num_args (use_stmt) == 1)
8278 {
8279 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8280 }
8281 else
8282 {
8283 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8284 SET_USE (use_p, new_tree);
8285 }
8286 update_stmt (use_stmt);
8287 }
8288 }
8289 else
8290 {
8291 /* For basic-block vectorization simply insert the lane-extraction. */
8292 tree bftype = TREE_TYPE (vectype);
8293 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8294 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8295 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8296 vec_lhs, bitsize, bitstart);
8297 gimple_seq stmts = NULL;
8298 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8299 &stmts, true, NULL_TREE);
8300
8301 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
8302
8303 /* Replace uses of LHS with the newly computed result. If the use stmt
8304 is a single-argument PHI, replace all uses of the PHI result instead,
8305 since the LCSSA PHI defining LHS may appear before the new stmt. */
8306 use_operand_p use_p;
8307 stmt_vec_info use_stmt_info;
8308 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8309 if (!is_gimple_debug (use_stmt)
8310 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8311 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8312 {
8313 /* ??? This can happen when the live lane ends up being
8314 used in a vector construction code-generated by an
8315 external SLP node (and code-generation for that already
8316 happened). See gcc.dg/vect/bb-slp-47.c.
8317 Doing this is what would happen if that vector CTOR
8318 were not code-generated yet so it is not too bad.
8319 ??? In fact we'd likely want to avoid this situation
8320 in the first place. */
8321 if (gimple_code (use_stmt) != GIMPLE_PHI
8322 && !vect_stmt_dominates_stmt_p (gsi_stmt (*gsi), use_stmt))
8323 {
8324 gcc_assert (is_gimple_assign (use_stmt)
8325 && gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR);
8326 if (dump_enabled_p ())
8327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8328 "Using original scalar computation for "
8329 "live lane because use preceeds vector "
8330 "def\n");
8331 continue;
8332 }
8333 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8334 SET_USE (use_p, new_tree);
8335 update_stmt (use_stmt);
8336 }
8337 }
8338
8339 return true;
8340 }
8341
8342 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
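/* As an illustrative example (hypothetical SSA names): if STMT_INFO defines
   x_5 inside LOOP and a bind such as

     # DEBUG y => x_5

   appears in a block outside LOOP, its value is reset so that the debug info
   no longer refers to a scalar definition the vectorizer may remove.  */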
8343
8344 static void
8345 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8346 {
8347 ssa_op_iter op_iter;
8348 imm_use_iterator imm_iter;
8349 def_operand_p def_p;
8350 gimple *ustmt;
8351
8352 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8353 {
8354 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8355 {
8356 basic_block bb;
8357
8358 if (!is_gimple_debug (ustmt))
8359 continue;
8360
8361 bb = gimple_bb (ustmt);
8362
8363 if (!flow_bb_inside_loop_p (loop, bb))
8364 {
8365 if (gimple_debug_bind_p (ustmt))
8366 {
8367 if (dump_enabled_p ())
8368 dump_printf_loc (MSG_NOTE, vect_location,
8369 "killing debug use\n");
8370
8371 gimple_debug_bind_reset_value (ustmt);
8372 update_stmt (ustmt);
8373 }
8374 else
8375 gcc_unreachable ();
8376 }
8377 }
8378 }
8379 }
8380
8381 /* Given loop represented by LOOP_VINFO, return true if computation of
8382 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8383 otherwise. */
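/* Worked example (illustrative values): if NITERSM1 has a 32-bit unsigned
   type and equals 0xffffffff, NITERS = NITERSM1 + 1 wraps to 0, neither
   check succeeds and the function returns false; with NITERSM1 = 7 and
   NITERS = 8 the constant check holds and it returns true.  */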
8384
8385 static bool
8386 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8387 {
8388 /* Constant case. */
8389 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8390 {
8391 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8392 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8393
8394 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8395 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8396 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8397 return true;
8398 }
8399
8400 widest_int max;
8401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8402 /* Check the upper bound of loop niters. */
8403 if (get_max_loop_iterations (loop, &max))
8404 {
8405 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8406 signop sgn = TYPE_SIGN (type);
8407 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8408 if (max < type_max)
8409 return true;
8410 }
8411 return false;
8412 }
8413
8414 /* Return a mask type with half the number of elements as OLD_TYPE,
8415 given that it should have mode NEW_MODE. */
8416
8417 tree
8418 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8419 {
8420 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8421 return build_truth_vector_type_for_mode (nunits, new_mode);
8422 }
8423
8424 /* Return a mask type with twice as many elements as OLD_TYPE,
8425 given that it should have mode NEW_MODE. */
8426
8427 tree
8428 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8429 {
8430 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8431 return build_truth_vector_type_for_mode (nunits, new_mode);
8432 }
8433
8434 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8435 contain a sequence of NVECTORS masks that each control a vector of type
8436 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8437 these vector masks with the vector version of SCALAR_MASK. */
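/* Worked example (illustrative, no particular target assumed): with
   NVECTORS = 2, a 4-element VECTYPE and a vectorization factor of 4,
   nscalars_per_iter = 2 * 4 / 4 = 2, so the rgroup at index
   NVECTORS - 1 = 1 records max_nscalars_per_iter = 2.  */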
8438
8439 void
8440 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8441 unsigned int nvectors, tree vectype, tree scalar_mask)
8442 {
8443 gcc_assert (nvectors != 0);
8444 if (masks->length () < nvectors)
8445 masks->safe_grow_cleared (nvectors, true);
8446 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8447 /* The number of scalars per iteration and the number of vectors are
8448 both compile-time constants. */
8449 unsigned int nscalars_per_iter
8450 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8451 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8452
8453 if (scalar_mask)
8454 {
8455 scalar_cond_masked_key cond (scalar_mask, nvectors);
8456 loop_vinfo->scalar_cond_masked_set.add (cond);
8457 }
8458
8459 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8460 {
8461 rgm->max_nscalars_per_iter = nscalars_per_iter;
8462 rgm->type = truth_type_for (vectype);
8463 rgm->factor = 1;
8464 }
8465 }
8466
8467 /* Given a complete set of masks MASKS, extract mask number INDEX
8468 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8469 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8470
8471 See the comment above vec_loop_masks for more details about the mask
8472 arrangement. */
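/* Illustrative sketch: a mask recorded for an 8-element vector type can be
   reused for a 4-element vector type whose elements are twice as wide.
   Each adjacent pair of mask elements is then known to be all-zero or
   all-one, so the mask is simply VIEW_CONVERTed to the 4-element mask type
   and the conversion statements are inserted before GSI.  */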
8473
8474 tree
8475 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8476 unsigned int nvectors, tree vectype, unsigned int index)
8477 {
8478 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8479 tree mask_type = rgm->type;
8480
8481 /* Populate the rgroup's mask array, if this is the first time we've
8482 used it. */
8483 if (rgm->controls.is_empty ())
8484 {
8485 rgm->controls.safe_grow_cleared (nvectors, true);
8486 for (unsigned int i = 0; i < nvectors; ++i)
8487 {
8488 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8489 /* Provide a dummy definition until the real one is available. */
8490 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8491 rgm->controls[i] = mask;
8492 }
8493 }
8494
8495 tree mask = rgm->controls[index];
8496 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8497 TYPE_VECTOR_SUBPARTS (vectype)))
8498 {
8499 /* A loop mask for data type X can be reused for data type Y
8500 if X has N times more elements than Y and if Y's elements
8501 are N times bigger than X's. In this case each sequence
8502 of N elements in the loop mask will be all-zero or all-one.
8503 We can then view-convert the mask so that each sequence of
8504 N elements is replaced by a single element. */
8505 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8506 TYPE_VECTOR_SUBPARTS (vectype)));
8507 gimple_seq seq = NULL;
8508 mask_type = truth_type_for (vectype);
8509 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8510 if (seq)
8511 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8512 }
8513 return mask;
8514 }
8515
8516 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8517 lengths for controlling an operation on VECTYPE. The operation splits
8518 each element of VECTYPE into FACTOR separate subelements, measuring the
8519 length as a number of these subelements. */
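/* Worked example (illustrative): an operation on a 4-element vector of
   32-bit elements that has to fall back to byte-granular lengths is
   recorded with FACTOR = 4, so a length of 16 subelements covers the whole
   vector; with NVECTORS = 1 and a vectorization factor of 4,
   nscalars_per_iter = 1 * 4 / 4 = 1.  */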
8520
8521 void
8522 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8523 unsigned int nvectors, tree vectype, unsigned int factor)
8524 {
8525 gcc_assert (nvectors != 0);
8526 if (lens->length () < nvectors)
8527 lens->safe_grow_cleared (nvectors, true);
8528 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8529
8530 /* The number of scalars per iteration, the bytes occupied per scalar
8531 and the number of vectors are all compile-time constants. */
8532 unsigned int nscalars_per_iter
8533 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8534 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8535
8536 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8537 {
8538 /* For now, we only support cases in which all loads and stores fall back
8539 to VnQI or none do. */
8540 gcc_assert (!rgl->max_nscalars_per_iter
8541 || (rgl->factor == 1 && factor == 1)
8542 || (rgl->max_nscalars_per_iter * rgl->factor
8543 == nscalars_per_iter * factor));
8544 rgl->max_nscalars_per_iter = nscalars_per_iter;
8545 rgl->type = vectype;
8546 rgl->factor = factor;
8547 }
8548 }
8549
8550 /* Given a complete set of lengths LENS, extract length number INDEX for an
8551 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8552
8553 tree
8554 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8555 unsigned int nvectors, unsigned int index)
8556 {
8557 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8558
8559 /* Populate the rgroup's len array, if this is the first time we've
8560 used it. */
8561 if (rgl->controls.is_empty ())
8562 {
8563 rgl->controls.safe_grow_cleared (nvectors, true);
8564 for (unsigned int i = 0; i < nvectors; ++i)
8565 {
8566 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8567 gcc_assert (len_type != NULL_TREE);
8568 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8569
8570 /* Provide a dummy definition until the real one is available. */
8571 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8572 rgl->controls[i] = len;
8573 }
8574 }
8575
8576 return rgl->controls[index];
8577 }
8578
8579 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8580 according to the estimated number of iterations of the vector loop. */
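/* Worked example (illustrative numbers): for VF = 4 and a loop whose
   profile estimates roughly 100 iterations, niter_for_unrolled_loop yields
   roughly 25, so the exit edge probability becomes about 1/(25 + 1) and
   the body frequencies are scaled down accordingly.  */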
8581
8582 static void
8583 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8584 {
8585 edge preheader = loop_preheader_edge (loop);
8586 /* Reduce loop iterations by the vectorization factor. */
8587 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8588 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8589
8590 if (freq_h.nonzero_p ())
8591 {
8592 profile_probability p;
8593
8594 /* Avoid dropping loop body profile counter to 0 because of zero count
8595 in loop's preheader. */
8596 if (!(freq_e == profile_count::zero ()))
8597 freq_e = freq_e.force_nonzero ();
8598 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8599 scale_loop_frequencies (loop, p);
8600 }
8601
8602 edge exit_e = single_exit (loop);
8603 exit_e->probability = profile_probability::always ()
8604 .apply_scale (1, new_est_niter + 1);
8605
8606 edge exit_l = single_pred_edge (loop->latch);
8607 profile_probability prob = exit_l->probability;
8608 exit_l->probability = exit_e->probability.invert ();
8609 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8610 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8611 }
8612
8613 /* For a vectorized stmt DEF_STMT_INFO, set the latch-edge arguments of all
8614 vectorized PHIs whose latch value was originally defined by it. */
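/* Illustrative sketch (hypothetical SSA names): for a header PHI

     x_1 = PHI <x_0 (preheader), x_4 (latch)>

   where x_4 is defined by DEF_STMT_INFO, each vectorized copy of the PHI
   gets the lhs of the corresponding vectorized definition of x_4 added as
   its latch argument.  */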
8615
8616 static void
8617 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8618 stmt_vec_info def_stmt_info)
8619 {
8620 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8621 if (!def || TREE_CODE (def) != SSA_NAME)
8622 return;
8623 stmt_vec_info phi_info;
8624 imm_use_iterator iter;
8625 use_operand_p use_p;
8626 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8627 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8628 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8629 && (phi_info = loop_vinfo->lookup_stmt (phi))
8630 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8631 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8632 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8633 {
8634 loop_p loop = gimple_bb (phi)->loop_father;
8635 edge e = loop_latch_edge (loop);
8636 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8637 {
8638 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8639 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8640 gcc_assert (phi_defs.length () == latch_defs.length ());
8641 for (unsigned i = 0; i < phi_defs.length (); ++i)
8642 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8643 gimple_get_lhs (latch_defs[i]), e,
8644 gimple_phi_arg_location (phi, e->dest_idx));
8645 }
8646 }
8647 }
8648
8649 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8650 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8651 stmt_vec_info. */
8652
8653 static void
8654 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8655 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8656 {
8657 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8658 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8659
8660 if (dump_enabled_p ())
8661 dump_printf_loc (MSG_NOTE, vect_location,
8662 "------>vectorizing statement: %G", stmt_info->stmt);
8663
8664 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8665 vect_loop_kill_debug_uses (loop, stmt_info);
8666
8667 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8668 && !STMT_VINFO_LIVE_P (stmt_info))
8669 return;
8670
8671 if (STMT_VINFO_VECTYPE (stmt_info))
8672 {
8673 poly_uint64 nunits
8674 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8675 if (!STMT_SLP_TYPE (stmt_info)
8676 && maybe_ne (nunits, vf)
8677 && dump_enabled_p ())
8678 /* For SLP, VF is set according to the unrolling factor and not
8679 to the vector size; hence for SLP this diagnostic is not valid. */
8680 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8681 }
8682
8683 /* Pure SLP statements have already been vectorized. We still need
8684 to apply loop vectorization to hybrid SLP statements. */
8685 if (PURE_SLP_STMT (stmt_info))
8686 return;
8687
8688 if (dump_enabled_p ())
8689 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8690
8691 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8692 *seen_store = stmt_info;
8693 }
8694
8695 /* Helper function to pass to simplify_replace_tree so that trees found in
8696 the hash_map are replaced with their corresponding values. */
8697
8698 static tree
8699 find_in_mapping (tree t, void *context)
8700 {
8701 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8702
8703 tree *value = mapping->get (t);
8704 return value ? *value : t;
8705 }
8706
8707 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8708 original loop that has now been vectorized.
8709
8710 The inits of the data_references need to be advanced by the number of
8711 iterations of the main loop. This has been computed in vect_do_peeling and
8712 is stored in the parameter ADVANCE. We first restore the data_references'
8713 initial offsets with the values recorded in ORIG_DRS_INIT.
8714
8715 Since the loop_vec_info of this EPILOGUE was constructed for the original
8716 loop, its stmt_vec_infos all point to the original statements. These need
8717 to be updated to point to their corresponding copies as well as the SSA_NAMES
8718 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8719
8720 The data_reference's connections also need to be updated. Their
8721 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8722 stmt_vec_infos, their statements need to point to their corresponding copy,
8723 if they are gather loads or scatter stores then their reference needs to be
8724 updated to point to its corresponding copy and finally we set
8725 'base_misaligned' to false as we have already peeled for alignment in the
8726 prologue of the main loop. */
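/* Illustrative sketch (hypothetical SSA names): if the main loop contains
   _3 = a[i_1] + 1 and its epilogue copy is _23 = a[i_21] + 1, the mapping
   records _3 -> _23 so that PATTERN_DEF_SEQs, RELATED_STMTs and the
   DR_REFs of gather/scatter accesses that still mention _3 can be
   rewritten to use _23.  */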
8727
8728 static void
8729 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8730 {
8731 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8732 auto_vec<gimple *> stmt_worklist;
8733 hash_map<tree,tree> mapping;
8734 gimple *orig_stmt, *new_stmt;
8735 gimple_stmt_iterator epilogue_gsi;
8736 gphi_iterator epilogue_phi_gsi;
8737 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8738 basic_block *epilogue_bbs = get_loop_body (epilogue);
8739 unsigned i;
8740
8741 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8742
8743 /* Advance the data_references by the number of iterations of the previous
8744 loop and its prologue. */
8745 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8746
8747
8748 /* The EPILOGUE loop is a copy of the original loop, so they share the same
8749 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8750 point to the copied statements. We also create a mapping from each LHS in
8751 the original loop to the corresponding LHS in the EPILOGUE, and worklists
8752 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8753 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8754 {
8755 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8756 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8757 {
8758 new_stmt = epilogue_phi_gsi.phi ();
8759
8760 gcc_assert (gimple_uid (new_stmt) > 0);
8761 stmt_vinfo
8762 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8763
8764 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8765 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8766
8767 mapping.put (gimple_phi_result (orig_stmt),
8768 gimple_phi_result (new_stmt));
8769 /* PHI nodes cannot have patterns or related statements. */
8770 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8771 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8772 }
8773
8774 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8775 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8776 {
8777 new_stmt = gsi_stmt (epilogue_gsi);
8778 if (is_gimple_debug (new_stmt))
8779 continue;
8780
8781 gcc_assert (gimple_uid (new_stmt) > 0);
8782 stmt_vinfo
8783 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8784
8785 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8786 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8787
8788 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8789 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8790
8791 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8792 {
8793 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8794 for (gimple_stmt_iterator gsi = gsi_start (seq);
8795 !gsi_end_p (gsi); gsi_next (&gsi))
8796 stmt_worklist.safe_push (gsi_stmt (gsi));
8797 }
8798
8799 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8800 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8801 {
8802 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8803 stmt_worklist.safe_push (stmt);
8804 /* Set BB such that the assert in
8805 'get_initial_def_for_reduction' is able to determine that
8806 the BB of the related stmt is inside this loop. */
8807 gimple_set_bb (stmt,
8808 gimple_bb (new_stmt));
8809 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8810 gcc_assert (related_vinfo == NULL
8811 || related_vinfo == stmt_vinfo);
8812 }
8813 }
8814 }
8815
8816 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8817 using the original main loop and thus need to be updated to refer to the
8818 cloned variables used in the epilogue. */
8819 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8820 {
8821 gimple *stmt = stmt_worklist[i];
8822 tree *new_op;
8823
8824 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8825 {
8826 tree op = gimple_op (stmt, j);
8827 if ((new_op = mapping.get(op)))
8828 gimple_set_op (stmt, j, *new_op);
8829 else
8830 {
8831 /* PR92429: The last argument of simplify_replace_tree disables
8832 folding when replacing arguments. This is required as
8833 otherwise you might end up with different statements than the
8834 ones analyzed in vect_loop_analyze, leading to different
8835 vectorization. */
8836 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8837 &find_in_mapping, &mapping, false);
8838 gimple_set_op (stmt, j, op);
8839 }
8840 }
8841 }
8842
8843 struct data_reference *dr;
8844 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
8845 FOR_EACH_VEC_ELT (datarefs, i, dr)
8846 {
8847 orig_stmt = DR_STMT (dr);
8848 gcc_assert (gimple_uid (orig_stmt) > 0);
8849 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8850 /* Data references for gather loads and scatter stores do not use the
8851 updated offset we set using ADVANCE. Instead we have to make sure the
8852 references in the data references point to the corresponding copies of
8853 the originals in the epilogue. */
8854 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8855 == VMAT_GATHER_SCATTER)
8856 {
8857 DR_REF (dr)
8858 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8859 &find_in_mapping, &mapping);
8860 DR_BASE_ADDRESS (dr)
8861 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8862 &find_in_mapping, &mapping);
8863 }
8864 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8865 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8866 /* The vector size of the epilogue is smaller than that of the main loop,
8867 so its alignment requirement is the same or lower. This means the DR
8868 will by definition be aligned. */
8869 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8870 }
8871
8872 epilogue_vinfo->shared->datarefs_copy.release ();
8873 epilogue_vinfo->shared->save_datarefs ();
8874 }
8875
8876 /* Function vect_transform_loop.
8877
8878 The analysis phase has determined that the loop is vectorizable.
8879 Vectorize the loop - create vectorized stmts to replace the scalar
8880 stmts in the loop, and update the loop exit condition.
8881 Returns the scalar epilogue loop, if any. */
8882
8883 class loop *
8884 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8885 {
8886 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8887 class loop *epilogue = NULL;
8888 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8889 int nbbs = loop->num_nodes;
8890 int i;
8891 tree niters_vector = NULL_TREE;
8892 tree step_vector = NULL_TREE;
8893 tree niters_vector_mult_vf = NULL_TREE;
8894 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8895 unsigned int lowest_vf = constant_lower_bound (vf);
8896 gimple *stmt;
8897 bool check_profitability = false;
8898 unsigned int th;
8899
8900 DUMP_VECT_SCOPE ("vec_transform_loop");
8901
8902 loop_vinfo->shared->check_datarefs ();
8903
8904 /* Use the more conservative vectorization threshold. If the number
8905 of iterations is constant assume the cost check has been performed
8906 by our caller. If the threshold makes all loops profitable that
8907 run at least the (estimated) vectorization factor number of times
8908 checking is pointless, too. */
8909 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8910 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8911 {
8912 if (dump_enabled_p ())
8913 dump_printf_loc (MSG_NOTE, vect_location,
8914 "Profitability threshold is %d loop iterations.\n",
8915 th);
8916 check_profitability = true;
8917 }
8918
8919 /* Make sure there exists a single-predecessor exit bb. Do this before
8920 versioning. */
8921 edge e = single_exit (loop);
8922 if (! single_pred_p (e->dest))
8923 {
8924 split_loop_exit_edge (e, true);
8925 if (dump_enabled_p ())
8926 dump_printf (MSG_NOTE, "split exit edge\n");
8927 }
8928
8929 /* Version the loop first, if required, so the profitability check
8930 comes first. */
8931
8932 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8933 {
8934 class loop *sloop
8935 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8936 sloop->force_vectorize = false;
8937 check_profitability = false;
8938 }
8939
8940 /* Make sure there exists a single-predecessor exit bb also on the
8941 scalar loop copy. Do this after versioning but before peeling
8942 so that the CFG structure is fine for both the scalar and the
8943 if-converted loop, and slpeel_duplicate_current_defs_from_edges
8944 sees matched loop-closed PHI nodes on the exit. */
8945 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8946 {
8947 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8948 if (! single_pred_p (e->dest))
8949 {
8950 split_loop_exit_edge (e, true);
8951 if (dump_enabled_p ())
8952 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8953 }
8954 }
8955
8956 tree niters = vect_build_loop_niters (loop_vinfo);
8957 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8958 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8959 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8960 tree advance;
8961 drs_init_vec orig_drs_init;
8962
8963 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8964 &step_vector, &niters_vector_mult_vf, th,
8965 check_profitability, niters_no_overflow,
8966 &advance);
8967
8968 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8969 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8970 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8971 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8972
8973 if (niters_vector == NULL_TREE)
8974 {
8975 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8976 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
8977 && known_eq (lowest_vf, vf))
8978 {
8979 niters_vector
8980 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8981 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8982 step_vector = build_one_cst (TREE_TYPE (niters));
8983 }
8984 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
8985 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8986 &step_vector, niters_no_overflow);
8987 else
8988 /* vect_do_peeling subtracted the number of peeled prologue
8989 iterations from LOOP_VINFO_NITERS. */
8990 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
8991 &niters_vector, &step_vector,
8992 niters_no_overflow);
8993 }
8994
8995 /* 1) Make sure the loop header has exactly two entries
8996 2) Make sure we have a preheader basic block. */
8997
8998 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8999
9000 split_edge (loop_preheader_edge (loop));
9001
9002 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9003 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
9004 /* This will deal with any possible peeling. */
9005 vect_prepare_for_masked_peels (loop_vinfo);
9006
9007 /* Schedule the SLP instances first, then handle loop vectorization
9008 below. */
9009 if (!loop_vinfo->slp_instances.is_empty ())
9010 {
9011 DUMP_VECT_SCOPE ("scheduling SLP instances");
9012 vect_schedule_slp (loop_vinfo);
9013 }
9014
9015 /* FORNOW: the vectorizer supports only loops whose body consists
9016 of one basic block (header + empty latch). When the vectorizer
9017 supports more involved loop forms, the order in which the BBs are
9018 traversed will need to be reconsidered. */
9019
9020 for (i = 0; i < nbbs; i++)
9021 {
9022 basic_block bb = bbs[i];
9023 stmt_vec_info stmt_info;
9024
9025 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9026 gsi_next (&si))
9027 {
9028 gphi *phi = si.phi ();
9029 if (dump_enabled_p ())
9030 dump_printf_loc (MSG_NOTE, vect_location,
9031 "------>vectorizing phi: %G", phi);
9032 stmt_info = loop_vinfo->lookup_stmt (phi);
9033 if (!stmt_info)
9034 continue;
9035
9036 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9037 vect_loop_kill_debug_uses (loop, stmt_info);
9038
9039 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9040 && !STMT_VINFO_LIVE_P (stmt_info))
9041 continue;
9042
9043 if (STMT_VINFO_VECTYPE (stmt_info)
9044 && (maybe_ne
9045 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9046 && dump_enabled_p ())
9047 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9048
9049 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9050 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9051 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9052 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9053 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9054 && ! PURE_SLP_STMT (stmt_info))
9055 {
9056 if (dump_enabled_p ())
9057 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9058 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9059 }
9060 }
9061
9062 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9063 gsi_next (&si))
9064 {
9065 gphi *phi = si.phi ();
9066 stmt_info = loop_vinfo->lookup_stmt (phi);
9067 if (!stmt_info)
9068 continue;
9069
9070 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9071 && !STMT_VINFO_LIVE_P (stmt_info))
9072 continue;
9073
9074 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9075 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9076 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9077 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9078 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9079 && ! PURE_SLP_STMT (stmt_info))
9080 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9081 }
9082
9083 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9084 !gsi_end_p (si);)
9085 {
9086 stmt = gsi_stmt (si);
9087 /* During vectorization remove existing clobber stmts. */
9088 if (gimple_clobber_p (stmt))
9089 {
9090 unlink_stmt_vdef (stmt);
9091 gsi_remove (&si, true);
9092 release_defs (stmt);
9093 }
9094 else
9095 {
9096 /* Ignore vector stmts created in the outer loop. */
9097 stmt_info = loop_vinfo->lookup_stmt (stmt);
9098
9099 /* vector stmts created in the outer-loop during vectorization of
9100 stmts in an inner-loop may not have a stmt_info, and do not
9101 need to be vectorized. */
9102 stmt_vec_info seen_store = NULL;
9103 if (stmt_info)
9104 {
9105 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9106 {
9107 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9108 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9109 !gsi_end_p (subsi); gsi_next (&subsi))
9110 {
9111 stmt_vec_info pat_stmt_info
9112 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9113 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9114 &si, &seen_store);
9115 }
9116 stmt_vec_info pat_stmt_info
9117 = STMT_VINFO_RELATED_STMT (stmt_info);
9118 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9119 &seen_store);
9120 maybe_set_vectorized_backedge_value (loop_vinfo,
9121 pat_stmt_info);
9122 }
9123 else
9124 {
9125 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9126 &seen_store);
9127 maybe_set_vectorized_backedge_value (loop_vinfo,
9128 stmt_info);
9129 }
9130 }
9131 gsi_next (&si);
9132 if (seen_store)
9133 {
9134 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9135 /* Interleaving. The vectorization of the
9136 interleaving chain was completed - free
9137 all the stores in the chain. */
9138 vect_remove_stores (loop_vinfo,
9139 DR_GROUP_FIRST_ELEMENT (seen_store));
9140 else
9141 /* Free the attached stmt_vec_info and remove the stmt. */
9142 loop_vinfo->remove_stmt (stmt_info);
9143 }
9144 }
9145 }
9146
9147 /* Stub out scalar statements that must not survive vectorization.
9148 Doing this here helps with grouped statements, or statements that
9149 are involved in patterns. */
9150 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9151 !gsi_end_p (gsi); gsi_next (&gsi))
9152 {
9153 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9154 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9155 {
9156 tree lhs = gimple_get_lhs (call);
9157 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9158 {
9159 tree zero = build_zero_cst (TREE_TYPE (lhs));
9160 gimple *new_stmt = gimple_build_assign (lhs, zero);
9161 gsi_replace (&gsi, new_stmt, true);
9162 }
9163 }
9164 }
9165 } /* BBs in loop */
9166
9167 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9168 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9169 if (integer_onep (step_vector))
9170 niters_no_overflow = true;
9171 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9172 niters_vector_mult_vf, !niters_no_overflow);
9173
9174 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9175 scale_profile_for_vect_loop (loop, assumed_vf);
9176
9177 /* True if the final iteration might not handle a full vector's
9178 worth of scalar iterations. */
9179 bool final_iter_may_be_partial
9180 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9181 /* The minimum number of iterations performed by the epilogue. This
9182 is 1 when peeling for gaps because we always need a final scalar
9183 iteration. */
9184 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9185 /* +1 to convert latch counts to loop iteration counts,
9186 -min_epilogue_iters to remove iterations that cannot be performed
9187 by the vector code. */
9188 int bias_for_lowest = 1 - min_epilogue_iters;
9189 int bias_for_assumed = bias_for_lowest;
9190 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9191 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9192 {
9193 /* When the amount of peeling is known at compile time, the first
9194 iteration will have exactly alignment_npeels active elements.
9195 In the worst case it will have at least one. */
9196 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9197 bias_for_lowest += lowest_vf - min_first_active;
9198 bias_for_assumed += assumed_vf - min_first_active;
9199 }
9200 /* In these calculations the "- 1" converts loop iteration counts
9201 back to latch counts. */
9202 if (loop->any_upper_bound)
9203 loop->nb_iterations_upper_bound
9204 = (final_iter_may_be_partial
9205 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9206 lowest_vf) - 1
9207 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9208 lowest_vf) - 1);
9209 if (loop->any_likely_upper_bound)
9210 loop->nb_iterations_likely_upper_bound
9211 = (final_iter_may_be_partial
9212 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9213 + bias_for_lowest, lowest_vf) - 1
9214 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9215 + bias_for_lowest, lowest_vf) - 1);
9216 if (loop->any_estimate)
9217 loop->nb_iterations_estimate
9218 = (final_iter_may_be_partial
9219 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9220 assumed_vf) - 1
9221 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9222 assumed_vf) - 1);
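  /* Worked example (illustrative): with no partial vectors, no peeling for
     gaps and no alignment peeling, bias_for_lowest is 1.  If the scalar
     loop's latch-count upper bound is 99 (at most 100 iterations) and
     lowest_vf is 4, the vector loop's bound becomes
     udiv_floor (99 + 1, 4) - 1 = 24 latch iterations.  */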
9223
9224 if (dump_enabled_p ())
9225 {
9226 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9227 {
9228 dump_printf_loc (MSG_NOTE, vect_location,
9229 "LOOP VECTORIZED\n");
9230 if (loop->inner)
9231 dump_printf_loc (MSG_NOTE, vect_location,
9232 "OUTER LOOP VECTORIZED\n");
9233 dump_printf (MSG_NOTE, "\n");
9234 }
9235 else
9236 dump_printf_loc (MSG_NOTE, vect_location,
9237 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9238 GET_MODE_NAME (loop_vinfo->vector_mode));
9239 }
9240
9241 /* Loops vectorized with a variable factor won't benefit from
9242 unrolling/peeling. */
9243 if (!vf.is_constant ())
9244 {
9245 loop->unroll = 1;
9246 if (dump_enabled_p ())
9247 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9248 " variable-length vectorization factor\n");
9249 }
9250 /* Free SLP instances here because otherwise stmt reference counting
9251 won't work. */
9252 slp_instance instance;
9253 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9254 vect_free_slp_instance (instance, true);
9255 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9256 /* Clear the safelen field since its value is invalid after vectorization:
9257 the vectorized loop can have loop-carried dependencies. */
9258 loop->safelen = 0;
9259
9260 if (epilogue)
9261 {
9262 update_epilogue_loop_vinfo (epilogue, advance);
9263
9264 epilogue->simduid = loop->simduid;
9265 epilogue->force_vectorize = loop->force_vectorize;
9266 epilogue->dont_vectorize = false;
9267 }
9268
9269 return epilogue;
9270 }
9271
9272 /* The code below performs a simple optimization - it reverts
9273 if-conversion for masked stores: if the mask of a store is zero, the
9274 store is not performed, and, if possible, neither are the producers of
9275 the stored values. For example,
9276 for (i=0; i<n; i++)
9277 if (c[i])
9278 {
9279 p1[i] += 1;
9280 p2[i] = p3[i] +2;
9281 }
9282 this transformation will produce the following semi-hammock:
9283
9284 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9285 {
9286 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9287 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9288 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9289 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9290 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9291 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9292 }
9293 */
9294
9295 void
9296 optimize_mask_stores (class loop *loop)
9297 {
9298 basic_block *bbs = get_loop_body (loop);
9299 unsigned nbbs = loop->num_nodes;
9300 unsigned i;
9301 basic_block bb;
9302 class loop *bb_loop;
9303 gimple_stmt_iterator gsi;
9304 gimple *stmt;
9305 auto_vec<gimple *> worklist;
9306 auto_purge_vect_location sentinel;
9307
9308 vect_location = find_loop_location (loop);
9309 /* Pick up all masked stores in loop if any. */
9310 for (i = 0; i < nbbs; i++)
9311 {
9312 bb = bbs[i];
9313 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9314 gsi_next (&gsi))
9315 {
9316 stmt = gsi_stmt (gsi);
9317 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9318 worklist.safe_push (stmt);
9319 }
9320 }
9321
9322 free (bbs);
9323 if (worklist.is_empty ())
9324 return;
9325
9326 /* Loop has masked stores. */
9327 while (!worklist.is_empty ())
9328 {
9329 gimple *last, *last_store;
9330 edge e, efalse;
9331 tree mask;
9332 basic_block store_bb, join_bb;
9333 gimple_stmt_iterator gsi_to;
9334 tree vdef, new_vdef;
9335 gphi *phi;
9336 tree vectype;
9337 tree zero;
9338
9339 last = worklist.pop ();
9340 mask = gimple_call_arg (last, 2);
9341 bb = gimple_bb (last);
9342 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9343 to the same loop as if_bb. It can differ from LOOP when a two-level
9344 loop nest is vectorized and the mask_store belongs to the inner
9345 one. */
9346 e = split_block (bb, last);
9347 bb_loop = bb->loop_father;
9348 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9349 join_bb = e->dest;
9350 store_bb = create_empty_bb (bb);
9351 add_bb_to_loop (store_bb, bb_loop);
9352 e->flags = EDGE_TRUE_VALUE;
9353 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9354 /* Set a static probability for the edge into STORE_BB. */
9355 efalse->probability = profile_probability::unlikely ();
9356 store_bb->count = efalse->count ();
9357 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9358 if (dom_info_available_p (CDI_DOMINATORS))
9359 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9360 if (dump_enabled_p ())
9361 dump_printf_loc (MSG_NOTE, vect_location,
9362 "Create new block %d to sink mask stores.",
9363 store_bb->index);
9364 /* Create vector comparison with boolean result. */
9365 vectype = TREE_TYPE (mask);
9366 zero = build_zero_cst (vectype);
9367 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9368 gsi = gsi_last_bb (bb);
9369 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9370 /* Create new PHI node for vdef of the last masked store:
9371 .MEM_2 = VDEF <.MEM_1>
9372 will be converted to
9373 .MEM_3 = VDEF <.MEM_1>
9374 and new PHI node will be created in join bb
9375 .MEM_2 = PHI <.MEM_1, .MEM_3>
9376 */
9377 vdef = gimple_vdef (last);
9378 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9379 gimple_set_vdef (last, new_vdef);
9380 phi = create_phi_node (vdef, join_bb);
9381 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9382
9383 /* Put all masked stores with the same mask into STORE_BB if possible. */
9384 while (true)
9385 {
9386 gimple_stmt_iterator gsi_from;
9387 gimple *stmt1 = NULL;
9388
9389 /* Move masked store to STORE_BB. */
9390 last_store = last;
9391 gsi = gsi_for_stmt (last);
9392 gsi_from = gsi;
9393 /* Shift GSI to the previous stmt for further traversal. */
9394 gsi_prev (&gsi);
9395 gsi_to = gsi_start_bb (store_bb);
9396 gsi_move_before (&gsi_from, &gsi_to);
9397 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
9398 gsi_to = gsi_start_bb (store_bb);
9399 if (dump_enabled_p ())
9400 dump_printf_loc (MSG_NOTE, vect_location,
9401 "Move stmt to created bb\n%G", last);
9402 /* Move all stored value producers if possible. */
9403 while (!gsi_end_p (gsi))
9404 {
9405 tree lhs;
9406 imm_use_iterator imm_iter;
9407 use_operand_p use_p;
9408 bool res;
9409
9410 /* Skip debug statements. */
9411 if (is_gimple_debug (gsi_stmt (gsi)))
9412 {
9413 gsi_prev (&gsi);
9414 continue;
9415 }
9416 stmt1 = gsi_stmt (gsi);
9417 /* Do not consider statements writing to memory or having
9418 a volatile operand. */
9419 if (gimple_vdef (stmt1)
9420 || gimple_has_volatile_ops (stmt1))
9421 break;
9422 gsi_from = gsi;
9423 gsi_prev (&gsi);
9424 lhs = gimple_get_lhs (stmt1);
9425 if (!lhs)
9426 break;
9427
9428 /* LHS of vectorized stmt must be SSA_NAME. */
9429 if (TREE_CODE (lhs) != SSA_NAME)
9430 break;
9431
9432 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9433 {
9434 /* Remove dead scalar statement. */
9435 if (has_zero_uses (lhs))
9436 {
9437 gsi_remove (&gsi_from, true);
9438 continue;
9439 }
9440 }
9441
9442 /* Check that LHS does not have uses outside of STORE_BB. */
9443 res = true;
9444 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9445 {
9446 gimple *use_stmt;
9447 use_stmt = USE_STMT (use_p);
9448 if (is_gimple_debug (use_stmt))
9449 continue;
9450 if (gimple_bb (use_stmt) != store_bb)
9451 {
9452 res = false;
9453 break;
9454 }
9455 }
9456 if (!res)
9457 break;
9458
9459 if (gimple_vuse (stmt1)
9460 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9461 break;
9462
9463 /* Can move STMT1 to STORE_BB. */
9464 if (dump_enabled_p ())
9465 dump_printf_loc (MSG_NOTE, vect_location,
9466 "Move stmt to created bb\n%G", stmt1);
9467 gsi_move_before (&gsi_from, &gsi_to);
9468 /* Shift GSI_TO for further insertion. */
9469 gsi_prev (&gsi_to);
9470 }
9471 /* Put other masked stores with the same mask into STORE_BB. */
9472 if (worklist.is_empty ()
9473 || gimple_call_arg (worklist.last (), 2) != mask
9474 || worklist.last () != stmt1)
9475 break;
9476 last = worklist.pop ();
9477 }
9478 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9479 }
9480 }
9481
9482 /* Decide whether it is possible to use a zero-based induction variable
9483 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9484 the value that the induction variable must be able to hold in order
9485 to ensure that the rgroups eventually have no active vector elements.
9486 Return -1 otherwise. */
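/* Worked example (illustrative): with a constant VF of 16 (so max_vf = 16),
   no skipped iterations and no peeling for alignment, a loop whose maximum
   latch count is 100 gets iv_limit = (100 & -16) + 16 = 96 + 16 = 112.  */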
9487
9488 widest_int
9489 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9490 {
9491 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9492 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9493 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9494
9495 /* Calculate the value that the induction variable must be able
9496 to hit in order to ensure that we end the loop with an all-false mask.
9497 This involves adding the maximum number of inactive trailing scalar
9498 iterations. */
9499 widest_int iv_limit = -1;
9500 if (max_loop_iterations (loop, &iv_limit))
9501 {
9502 if (niters_skip)
9503 {
9504 /* Add the maximum number of skipped iterations to the
9505 maximum iteration count. */
9506 if (TREE_CODE (niters_skip) == INTEGER_CST)
9507 iv_limit += wi::to_widest (niters_skip);
9508 else
9509 iv_limit += max_vf - 1;
9510 }
9511 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9512 /* Make a conservatively-correct assumption. */
9513 iv_limit += max_vf - 1;
9514
9515 /* IV_LIMIT is the maximum number of latch iterations, which is also
9516 the maximum in-range IV value. Round this value down to the previous
9517 vector alignment boundary and then add an extra full iteration. */
9518 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9519 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9520 }
9521 return iv_limit;
9522 }
9523
9524 /* For the given rgroup_controls RGC, check whether an induction variable
9525 would ever hit a value that produces a set of all-false masks or zero
9526 lengths before wrapping around. Return true if it's possible to wrap
9527 around before hitting the desirable value, otherwise return false. */
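/* Worked example (illustrative, continuing the sketch above): with
   iv_limit = 112 and an rgroup for which max_nscalars_per_iter * factor
   is 2, the IV must be able to reach 224, which needs 8 bits; with a
   32-bit compare type there is no danger of wrapping, so the function
   returns false.  */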
9528
9529 bool
9530 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9531 {
9532 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9533
9534 if (iv_limit == -1)
9535 return true;
9536
9537 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9538 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9539 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9540
9541 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9542 return true;
9543
9544 return false;
9545 }