1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it was manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
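
/* For illustration only (not code used by this pass; the names below are
   made up): the manually "vectorized" form sketched above can also be
   written with GCC's generic vector extensions.  This sketch assumes a
   target with 128-bit vectors and N a multiple of 8; the vectorizer itself
   additionally handles remainders, aliasing and alignment, which are
   ignored here.

     #define N 1024
     typedef short v8hi __attribute__ ((vector_size (16)));   // 8 x short
     short a[N], b[N], c[N];

     void
     add_vectorized (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];     // one 8-wide addition per iteration
     }
   */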
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case in which a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
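
/* A concrete sketch of the strip-mining described above, written as plain C
   (illustrative only; the names are made up).  Assume 4-byte ints and
   16-byte vectors, so VF == 4.  The inner j-loop stands in for a single
   vector statement, and the trailing loop is the scalar epilogue handling
   the n % 4 leftover iterations (peeling and partial vectors are decided
   later, not in this function).

     void
     add (int *restrict a, int *restrict b, int *restrict c, int n)
     {
       int i;
       for (i = 0; i + 4 <= n; i += 4)   // vectorized body, VF == 4
         for (int j = 0; j < 4; j++)     // conceptually one vector add
           a[i + j] = b[i + j] + c[i + j];
       for (; i < n; i++)                // scalar epilogue
         a[i] = b[i] + c[i];
     }
   */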
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
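
/* Example of the distinction made above, using the usual chrec dump
   notation (a sketch, not dump output copied verbatim).  For

     for (i = 0; i < n; i++)
       p = p + 4;

   analyze_scalar_evolution gives p an access function of the form
   {p_0, +, 4}_1, whose evolution part is the constant 4, so the function
   above returns true with *INIT == p_0 and *STEP == 4.  For p = p + i the
   step itself is a chrec ({0, +, 1}_1), i.e. the evolution has degree >= 2,
   and the tree_is_chrec check rejects it.  */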
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
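
/* Source-level sketch of the double reduction shape spelled out in SSA form
   above (illustrative):

     sum = 0;
     for (i = 0; i < N; i++)        // outer loop
       for (j = 0; j < M; j++)      // inner loop
         sum += a[i][j];

   When the outer loop is analyzed, its header PHI for sum plays the role of
   x_1 (a vect_double_reduction_def) and the inner-loop header PHI plays the
   role of x_2, which is what this predicate recognizes.  */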
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner-loop, if one exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such an inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 {
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
672 {
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
683 {
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
689 }
690 }
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
694 else
695 {
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
699 {
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
705 }
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
711 }
712 }
713 }
714
715 /* Function vect_get_loop_niters.
716
717 Determine how many times the loop is executed and place that count
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
721
722 Return the loop exit condition. */
723
724
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
728 {
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
733
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
738
739 if (!exit)
740 return cond;
741
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
746
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
750
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
753
754 if (may_be_zero)
755 {
756 if (COMPARISON_CLASS_P (may_be_zero))
757 {
758 /* Try to combine may_be_zero with assumptions; this can simplify
759 the computation of the niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
770
771 may_be_zero = NULL_TREE;
772 }
773 else if (integer_nonzerop (may_be_zero))
774 {
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
778 }
779 else
780 return cond;
781 }
782
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
785
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
794
795 return cond;
796 }
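
/* Concrete instance of the values computed above (illustrative): for a
   countable loop whose body executes 16 times in the do-while shape the
   vectorizer requires, the latch executes 15 times, so
   *NUMBER_OF_ITERATIONSM1 is 15 and *NUMBER_OF_ITERATIONS is 16.  The ???
   note covers the corner case do { n++; } while (n != 0) starting from
   n == 0: the latch executes UINT_MAX times and adding 1 wraps the
   header-execution count to zero.  */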
797
798 /* Function bb_in_loop_p
799
800 Used as predicate for dfs order traversal of the loop bbs. */
801
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
804 {
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
809 }
810
811
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
814
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
850 {
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as a reverse postorder traversal, so we are safe. */
855
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
859
860 for (unsigned int i = 0; i < nbbs; i++)
861 {
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
864
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
866 {
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
870 }
871
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
873 {
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition: when it is 0,
881 the loop shouldn't be vectorized; when it is a non-zero constant, it
882 should be vectorized normally; otherwise the loop is versioned, with
883 the vectorized copy used when the condition is non-zero at runtime. */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
892 {
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
898 }
899 }
900 }
901
902 epilogue_vinfos.create (6);
903 }
904
905 /* Free all levels of rgroup CONTROLS. */
906
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
909 {
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
915 }
916
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
919
920 _loop_vec_info::~_loop_vec_info ()
921 {
922 free (bbs);
923
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
929
930 loop->aux = NULL;
931 }
932
933 /* Return an invariant or register for EXPR and emit necessary
934 computations in the LOOP_VINFO loop preheader. */
935
936 tree
937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
938 {
939 if (is_gimple_reg (expr)
940 || is_gimple_min_invariant (expr))
941 return expr;
942
943 if (! loop_vinfo->ivexpr_map)
944 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
945 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
946 if (! cached)
947 {
948 gimple_seq stmts = NULL;
949 cached = force_gimple_operand (unshare_expr (expr),
950 &stmts, true, NULL_TREE);
951 if (stmts)
952 {
953 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
954 gsi_insert_seq_on_edge_immediate (e, stmts);
955 }
956 }
957 return cached;
958 }
959
960 /* Return true if we can use CMP_TYPE as the comparison type to produce
961 all masks required to mask LOOP_VINFO. */
962
963 static bool
964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
965 {
966 rgroup_controls *rgm;
967 unsigned int i;
968 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
969 if (rgm->type != NULL_TREE
970 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
971 cmp_type, rgm->type,
972 OPTIMIZE_FOR_SPEED))
973 return false;
974 return true;
975 }
976
977 /* Calculate the maximum number of scalars per iteration for every
978 rgroup in LOOP_VINFO. */
979
980 static unsigned int
981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
982 {
983 unsigned int res = 1;
984 unsigned int i;
985 rgroup_controls *rgm;
986 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
987 res = MAX (res, rgm->max_nscalars_per_iter);
988 return res;
989 }
990
991 /* Calculate the minimum precision necessary to represent:
992
993 MAX_NITERS * FACTOR
994
995 as an unsigned integer, where MAX_NITERS is the maximum number of
996 loop header iterations for the original scalar form of LOOP_VINFO. */
997
998 static unsigned
999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1000 {
1001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1002
1003 /* Get the maximum number of iterations that is representable
1004 in the counter type. */
1005 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1006 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1007
1008 /* Get a more refined estimate for the number of iterations. */
1009 widest_int max_back_edges;
1010 if (max_loop_iterations (loop, &max_back_edges))
1011 max_ni = wi::smin (max_ni, max_back_edges + 1);
1012
1013 /* Work out how many bits we need to represent the limit. */
1014 return wi::min_precision (max_ni * factor, UNSIGNED);
1015 }
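
/* Worked example (hypothetical numbers): with a 32-bit unsigned niters type
   and no better bound from max_loop_iterations, MAX_NITERS is 2^32; for
   FACTOR == 2 the product is 2^33 and the function returns 34.  If instead
   the loop is known to run at most 1000 header iterations, MAX_NITERS is
   1000 and, for FACTOR == 2, the minimum precision of 2000 is 11 bits.  */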
1016
1017 /* True if the loop needs peeling or partial vectors when vectorized. */
1018
1019 static bool
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1021 {
1022 unsigned HOST_WIDE_INT const_vf;
1023 HOST_WIDE_INT max_niter
1024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1025
1026 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1027 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1028 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1029 (loop_vinfo));
1030
1031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1033 {
1034 /* Work out the (constant) number of iterations that need to be
1035 peeled for reasons other than niters. */
1036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1038 peel_niter += 1;
1039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1041 return true;
1042 }
1043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1044 /* ??? When peeling for gaps but not alignment, we could
1045 try to check whether the (variable) niters is known to be
1046 VF * N + 1. That's something of a niche case though. */
1047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1050 < (unsigned) exact_log2 (const_vf))
1051 /* In case of versioning, check if the maximum number of
1052 iterations is greater than th. If they are identical,
1053 the epilogue is unnecessary. */
1054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1055 || ((unsigned HOST_WIDE_INT) max_niter
1056 > (th / const_vf) * const_vf))))
1057 return true;
1058
1059 return false;
1060 }
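
/* Example (hypothetical numbers): with a compile-time niters of 100, no
   peeling for alignment or gaps, and VF == 16, 100 is not a multiple of 16,
   so this returns true and the remaining 4 iterations need an epilogue or
   partial vectors.  With niters == 96 the same configuration returns
   false.  */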
1061
1062 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1063 whether we can actually generate the masks required. Return true if so,
1064 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1065
1066 static bool
1067 vect_verify_full_masking (loop_vec_info loop_vinfo)
1068 {
1069 unsigned int min_ni_width;
1070 unsigned int max_nscalars_per_iter
1071 = vect_get_max_nscalars_per_iter (loop_vinfo);
1072
1073 /* Use a normal loop if there are no statements that need masking.
1074 This only happens in rare degenerate cases: it means that the loop
1075 has no loads, no stores, and no live-out values. */
1076 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1077 return false;
1078
1079 /* Work out how many bits we need to represent the limit. */
1080 min_ni_width
1081 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1082
1083 /* Find a scalar mode for which WHILE_ULT is supported. */
1084 opt_scalar_int_mode cmp_mode_iter;
1085 tree cmp_type = NULL_TREE;
1086 tree iv_type = NULL_TREE;
1087 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1088 unsigned int iv_precision = UINT_MAX;
1089
1090 if (iv_limit != -1)
1091 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1092 UNSIGNED);
1093
1094 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1095 {
1096 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1097 if (cmp_bits >= min_ni_width
1098 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1099 {
1100 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1101 if (this_type
1102 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1103 {
1104 /* Although we could stop as soon as we find a valid mode,
1105 there are at least two reasons why that's not always the
1106 best choice:
1107
1108 - An IV that's Pmode or wider is more likely to be reusable
1109 in address calculations than an IV that's narrower than
1110 Pmode.
1111
1112 - Doing the comparison in IV_PRECISION or wider allows
1113 a natural 0-based IV, whereas using a narrower comparison
1114 type requires mitigations against wrap-around.
1115
1116 Conversely, if the IV limit is variable, doing the comparison
1117 in a wider type than the original type can introduce
1118 unnecessary extensions, so picking the widest valid mode
1119 is not always a good choice either.
1120
1121 Here we prefer the first IV type that's Pmode or wider,
1122 and the first comparison type that's IV_PRECISION or wider.
1123 (The comparison type must be no wider than the IV type,
1124 to avoid extensions in the vector loop.)
1125
1126 ??? We might want to try continuing beyond Pmode for ILP32
1127 targets if CMP_BITS < IV_PRECISION. */
1128 iv_type = this_type;
1129 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1130 cmp_type = this_type;
1131 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1132 break;
1133 }
1134 }
1135 }
1136
1137 if (!cmp_type)
1138 return false;
1139
1140 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1141 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1142 return true;
1143 }
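
/* Conceptual shape of the fully-masked loop that this check enables
   (a sketch in the [i:VF] notation used at the top of this file, not
   generated GIMPLE):

     for (i = 0; i < niters; i += VF)
       {
         mask = WHILE_ULT (i, niters);  // lane L is active iff i + L < niters
         a[i:VF] = b[i:VF] + c[i:VF];   // loads and store predicated on mask
       }

   The final, partial iteration is handled by inactive mask lanes instead of
   a scalar epilogue, which is why every rgroup mask type must be producible
   from CMP_TYPE via IFN_WHILE_ULT.  */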
1144
1145 /* Check whether we can use vector access with length based on precision
1146 comparison. So far, to keep it simple, we only allow the case that the
1147 precision of the target supported length is larger than the precision
1148 required by loop niters. */
1149
1150 static bool
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1152 {
1153 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1154 return false;
1155
1156 unsigned int max_nitems_per_iter = 1;
1157 unsigned int i;
1158 rgroup_controls *rgl;
1159 /* Find the maximum number of items per iteration for every rgroup. */
1160 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1161 {
1162 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1163 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1164 }
1165
1166 /* Work out how many bits we need to represent the length limit. */
1167 unsigned int min_ni_prec
1168 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1169
1170 /* Now use the maximum of the precisions below for one suitable IV type:
1171 - the IV's natural precision
1172 - the precision needed to hold: the maximum number of scalar
1173 iterations multiplied by the scale factor (min_ni_prec above)
1174 - the Pmode precision
1175
1176 If min_ni_prec is less than the precision of the current niters,
1177 we prefer to still use the niters type. Prefer to use Pmode or a
1178 wider IV to avoid narrow conversions. */
1179
1180 unsigned int ni_prec
1181 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1182 min_ni_prec = MAX (min_ni_prec, ni_prec);
1183 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1184
1185 tree iv_type = NULL_TREE;
1186 opt_scalar_int_mode tmode_iter;
1187 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1188 {
1189 scalar_mode tmode = tmode_iter.require ();
1190 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1191
1192 /* ??? Do we really want to construct one IV whose precision exceeds
1193 BITS_PER_WORD? */
1194 if (tbits > BITS_PER_WORD)
1195 break;
1196
1197 /* Find the first available standard integral type. */
1198 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1199 {
1200 iv_type = build_nonstandard_integer_type (tbits, true);
1201 break;
1202 }
1203 }
1204
1205 if (!iv_type)
1206 {
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "can't vectorize with length-based partial vectors"
1210 " because there is no suitable iv type.\n");
1211 return false;
1212 }
1213
1214 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1215 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1216
1217 return true;
1218 }
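
/* Worked instance of the IV type selection above (hypothetical 64-bit
   target with Pmode == DImode and BITS_PER_WORD == 64): if the niters type
   is 32-bit unsigned and min_ni_prec comes out as 20 bits, the two MAX
   operations raise the requirement to 64 bits, and the mode scan then picks
   the first supported integer mode of at least that many bits, giving a
   64-bit unsigned IV type.  */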
1219
1220 /* Calculate the cost of one scalar iteration of the loop. */
1221 static void
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1223 {
1224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1225 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1226 int nbbs = loop->num_nodes, factor;
1227 int innerloop_iters, i;
1228
1229 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1230
1231 /* Gather costs for statements in the scalar loop. */
1232
1233 /* FORNOW. */
1234 innerloop_iters = 1;
1235 if (loop->inner)
1236 innerloop_iters = 50; /* FIXME */
1237
1238 for (i = 0; i < nbbs; i++)
1239 {
1240 gimple_stmt_iterator si;
1241 basic_block bb = bbs[i];
1242
1243 if (bb->loop_father == loop->inner)
1244 factor = innerloop_iters;
1245 else
1246 factor = 1;
1247
1248 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1249 {
1250 gimple *stmt = gsi_stmt (si);
1251 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1252
1253 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254 continue;
1255
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1258 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1259 && (!STMT_VINFO_LIVE_P (vstmt_info)
1260 || !VECTORIZABLE_CYCLE_DEF
1261 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1262 continue;
1263
1264 vect_cost_for_stmt kind;
1265 if (STMT_VINFO_DATA_REF (stmt_info))
1266 {
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1268 kind = scalar_load;
1269 else
1270 kind = scalar_store;
1271 }
1272 else if (vect_nop_conversion_p (stmt_info))
1273 continue;
1274 else
1275 kind = scalar_stmt;
1276
1277 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 factor, kind, stmt_info, 0, vect_prologue);
1279 }
1280 }
1281
1282 /* Now accumulate cost. */
1283 void *target_cost_data = init_cost (loop);
1284 stmt_info_for_cost *si;
1285 int j;
1286 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1287 j, si)
1288 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1289 si->kind, si->stmt_info, si->vectype,
1290 si->misalign, vect_body);
1291 unsigned dummy, body_cost = 0;
1292 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1293 destroy_cost_data (target_cost_data);
1294 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1295 }
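
/* Example of the accounting above (made-up statements; the actual costs are
   left to the target hooks): a single-bb loop body with one load, one add
   and one store records one scalar_load, one scalar_stmt and one
   scalar_store, each with factor 1; statements sitting in an inner loop
   would instead be recorded with factor 50 (the FIXME above).  add_stmt_cost
   and finish_cost then turn those counts into
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */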
1296
1297
1298 /* Function vect_analyze_loop_form_1.
1299
1300 Verify that certain CFG restrictions hold, including:
1301 - the loop has a pre-header
1302 - the loop has a single entry and exit
1303 - the loop exit condition is simple enough
1304 - the number of iterations can be analyzed, i.e., a countable loop. The
1305 niter could be analyzed under some assumptions. */
1306
1307 opt_result
1308 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1309 tree *assumptions, tree *number_of_iterationsm1,
1310 tree *number_of_iterations, gcond **inner_loop_cond)
1311 {
1312 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1313
1314 /* Different restrictions apply when we are considering an inner-most loop,
1315 vs. an outer (nested) loop.
1316 (FORNOW. May want to relax some of these restrictions in the future). */
1317
1318 if (!loop->inner)
1319 {
1320 /* Inner-most loop. We currently require that the number of BBs is
1321 exactly 2 (the header and latch). Vectorizable inner-most loops
1322 look like this:
1323
1324 (pre-header)
1325 |
1326 header <--------+
1327 | | |
1328 | +--> latch --+
1329 |
1330 (exit-bb) */
1331
1332 if (loop->num_nodes != 2)
1333 return opt_result::failure_at (vect_location,
1334 "not vectorized:"
1335 " control flow in loop.\n");
1336
1337 if (empty_block_p (loop->header))
1338 return opt_result::failure_at (vect_location,
1339 "not vectorized: empty loop.\n");
1340 }
1341 else
1342 {
1343 class loop *innerloop = loop->inner;
1344 edge entryedge;
1345
1346 /* Nested loop. We currently require that the loop is doubly-nested,
1347 contains a single inner loop, and the number of BBs is exactly 5.
1348 Vectorizable outer-loops look like this:
1349
1350 (pre-header)
1351 |
1352 header <---+
1353 | |
1354 inner-loop |
1355 | |
1356 tail ------+
1357 |
1358 (exit-bb)
1359
1360 The inner-loop has the properties expected of inner-most loops
1361 as described above. */
1362
1363 if ((loop->inner)->inner || (loop->inner)->next)
1364 return opt_result::failure_at (vect_location,
1365 "not vectorized:"
1366 " multiple nested loops.\n");
1367
1368 if (loop->num_nodes != 5)
1369 return opt_result::failure_at (vect_location,
1370 "not vectorized:"
1371 " control flow in loop.\n");
1372
1373 entryedge = loop_preheader_edge (innerloop);
1374 if (entryedge->src != loop->header
1375 || !single_exit (innerloop)
1376 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " unsupported outerloop form.\n");
1380
1381 /* Analyze the inner-loop. */
1382 tree inner_niterm1, inner_niter, inner_assumptions;
1383 opt_result res
1384 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1385 &inner_assumptions, &inner_niterm1,
1386 &inner_niter, NULL);
1387 if (!res)
1388 {
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391 "not vectorized: Bad inner loop.\n");
1392 return res;
1393 }
1394
1395 /* Don't support analyzing niter under assumptions for inner
1396 loop. */
1397 if (!integer_onep (inner_assumptions))
1398 return opt_result::failure_at (vect_location,
1399 "not vectorized: Bad inner loop.\n");
1400
1401 if (!expr_invariant_in_loop_p (loop, inner_niter))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: inner-loop count not"
1404 " invariant.\n");
1405
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Considering outer-loop vectorization.\n");
1409 }
1410
1411 if (!single_exit (loop))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: multiple exits.\n");
1414 if (EDGE_COUNT (loop->header->preds) != 2)
1415 return opt_result::failure_at (vect_location,
1416 "not vectorized:"
1417 " too many incoming edges.\n");
1418
1419 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1420 that the loop is represented as a do-while (with a proper if-guard
1421 before the loop if needed), where the loop header contains all the
1422 executable statements, and the latch is empty. */
1423 if (!empty_block_p (loop->latch)
1424 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized: latch block not empty.\n");
1427
1428 /* Make sure the exit is not abnormal. */
1429 edge e = single_exit (loop);
1430 if (e->flags & EDGE_ABNORMAL)
1431 return opt_result::failure_at (vect_location,
1432 "not vectorized:"
1433 " abnormal loop exit edge.\n");
1434
1435 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1436 number_of_iterationsm1);
1437 if (!*loop_cond)
1438 return opt_result::failure_at
1439 (vect_location,
1440 "not vectorized: complicated exit condition.\n");
1441
1442 if (integer_zerop (*assumptions)
1443 || !*number_of_iterations
1444 || chrec_contains_undetermined (*number_of_iterations))
1445 return opt_result::failure_at
1446 (*loop_cond,
1447 "not vectorized: number of iterations cannot be computed.\n");
1448
1449 if (integer_zerop (*number_of_iterations))
1450 return opt_result::failure_at
1451 (*loop_cond,
1452 "not vectorized: number of iterations = 0.\n");
1453
1454 return opt_result::success ();
1455 }
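
/* Source-level sketches of loops rejected by the checks above
   (illustrative):

     for (i = 0; i < n; i++)      // early break: more than two basic blocks
       {                          // and more than one exit
         if (a[i] == key)
           break;
         b[i] = 0;
       }

     for (i = 0; i < n; i++)      // conditional store: control flow in the
       if (c[i])                  // loop body
         b[i] = 0;

   The second form can become vectorizable when if-conversion (tree-if-conv.c)
   collapses the body into a single basic block before this analysis runs.  */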
1456
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1458
1459 opt_loop_vec_info
1460 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1461 {
1462 tree assumptions, number_of_iterations, number_of_iterationsm1;
1463 gcond *loop_cond, *inner_loop_cond = NULL;
1464
1465 opt_result res
1466 = vect_analyze_loop_form_1 (loop, &loop_cond,
1467 &assumptions, &number_of_iterationsm1,
1468 &number_of_iterations, &inner_loop_cond);
1469 if (!res)
1470 return opt_loop_vec_info::propagate_failure (res);
1471
1472 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1473 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1474 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1475 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1476 if (!integer_onep (assumptions))
1477 {
1478 /* We consider vectorizing this loop by versioning it under
1479 some assumptions. In order to do this, we need to clear the
1480 existing information computed by the scev and niter analyzers. */
1481 scev_reset_htab ();
1482 free_numbers_of_iterations_estimates (loop);
1483 /* Also set a flag for this loop so that the following scev and niter
1484 analyses are done under the assumptions. */
1485 loop_constraint_set (loop, LOOP_C_FINITE);
1486 /* Also record the assumptions for versioning. */
1487 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1488 }
1489
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1491 {
1492 if (dump_enabled_p ())
1493 {
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497 dump_printf (MSG_NOTE, "\n");
1498 }
1499 }
1500
1501 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1502 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503 if (inner_loop_cond)
1504 {
1505 stmt_vec_info inner_loop_cond_info
1506 = loop_vinfo->lookup_stmt (inner_loop_cond);
1507 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508 }
1509
1510 gcc_assert (!loop->aux);
1511 loop->aux = loop_vinfo;
1512 return opt_loop_vec_info::success (loop_vinfo);
1513 }
1514
1515
1516
1517 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1518 statements, update the vectorization factor. */
1519
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1522 {
1523 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 poly_uint64 vectorization_factor;
1527 int i;
1528
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1530
1531 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532 gcc_assert (known_ne (vectorization_factor, 0U));
1533
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say that we
1537 perform pure SLP on the loop; cross-iteration parallelism is not
1538 exploited. */
1539 bool only_slp_in_loop = true;
1540 for (i = 0; i < nbbs; i++)
1541 {
1542 basic_block bb = bbs[i];
1543 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544 gsi_next (&si))
1545 {
1546 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547 if (!stmt_info)
1548 continue;
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551 && !PURE_SLP_STMT (stmt_info))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop = false;
1554 }
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1557 {
1558 if (is_gimple_debug (gsi_stmt (si)))
1559 continue;
1560 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561 stmt_info = vect_stmt_to_vectorize (stmt_info);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564 && !PURE_SLP_STMT (stmt_info))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop = false;
1567 }
1568 }
1569
1570 if (only_slp_in_loop)
1571 {
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1576 }
1577 else
1578 {
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1588 }
1589
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591 if (dump_enabled_p ())
1592 {
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE, vectorization_factor);
1596 dump_printf (MSG_NOTE, ".\n");
1597 }
1598 }
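
/* Example of the adjustment above (hypothetical factors): if the loop-based
   vectorization factor is 4 and an SLP instance requires an unrolling factor
   of 8, force_common_multiple yields 8; factors of 4 and 6 would combine to
   12.  When the loop contains only SLP stmts, the VF simply becomes the SLP
   unrolling factor.  */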
1599
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1603
1604 outer1:
1605 x_1 = PHI <x_3(outer2), ...>;
1606 ...
1607
1608 inner:
1609 x_2 = ...;
1610 ...
1611
1612 outer2:
1613 x_3 = PHI <x_2(inner)>;
1614
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1616
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1619 {
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621 return false;
1622
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1624 }
1625
1626 /* Function vect_analyze_loop_operations.
1627
1628 Scan the loop stmts and make sure they are all vectorizable. */
1629
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1632 {
1633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1640
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1642
1643 auto_vec<stmt_info_for_cost> cost_vec;
1644
1645 for (i = 0; i < nbbs; i++)
1646 {
1647 basic_block bb = bbs[i];
1648
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1651 {
1652 gphi *phi = si.phi ();
1653 ok = true;
1654
1655 stmt_info = loop_vinfo->lookup_stmt (phi);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1660
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1664 {
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outer loop (unless it is a double reduction,
1667 i.e., this phi is a vect_reduction_def), because this case
1668 requires us to actually do something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && !vect_active_double_reduction_p (stmt_info))
1671 return opt_result::failure_at (phi,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1674
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info))
1678 {
1679 tree phi_op;
1680
1681 if (gimple_phi_num_args (phi) != 1)
1682 return opt_result::failure_at (phi, "unsupported phi");
1683
1684 phi_op = PHI_ARG_DEF (phi, 0);
1685 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686 if (!op_def_info)
1687 return opt_result::failure_at (phi, "unsupported phi\n");
1688
1689 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info)
1691 != vect_used_in_outer_by_reduction))
1692 return opt_result::failure_at (phi, "unsupported phi\n");
1693
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info)
1696 == vect_double_reduction_def))
1697 && !vectorizable_lc_phi (loop_vinfo,
1698 stmt_info, NULL, NULL))
1699 return opt_result::failure_at (phi, "unsupported phi\n");
1700 }
1701
1702 continue;
1703 }
1704
1705 gcc_assert (stmt_info);
1706
1707 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info))
1709 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi,
1712 "not vectorized:"
1713 " scalar dependence cycle.\n");
1714
1715 if (STMT_VINFO_RELEVANT_P (stmt_info))
1716 {
1717 need_to_vectorize = true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info))
1720 ok = vectorizable_induction (loop_vinfo,
1721 stmt_info, NULL, NULL,
1722 &cost_vec);
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info)
1725 == vect_double_reduction_def)
1726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_reduction (loop_vinfo,
1729 stmt_info, NULL, NULL, &cost_vec);
1730 }
1731
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1733 if (ok
1734 && STMT_VINFO_LIVE_P (stmt_info)
1735 && !PURE_SLP_STMT (stmt_info))
1736 ok = vectorizable_live_operation (loop_vinfo,
1737 stmt_info, NULL, NULL, NULL,
1738 -1, false, &cost_vec);
1739
1740 if (!ok)
1741 return opt_result::failure_at (phi,
1742 "not vectorized: relevant phi not "
1743 "supported: %G",
1744 static_cast <gimple *> (phi));
1745 }
1746
1747 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748 gsi_next (&si))
1749 {
1750 gimple *stmt = gsi_stmt (si);
1751 if (!gimple_clobber_p (stmt)
1752 && !is_gimple_debug (stmt))
1753 {
1754 opt_result res
1755 = vect_analyze_stmt (loop_vinfo,
1756 loop_vinfo->lookup_stmt (stmt),
1757 &need_to_vectorize,
1758 NULL, NULL, &cost_vec);
1759 if (!res)
1760 return res;
1761 }
1762 }
1763 } /* bbs */
1764
1765 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1766
1767 /* All operations in the loop are either irrelevant (they deal with loop
1768 control, or are dead), or are only used outside the loop and can be moved
1769 out of the loop (e.g. invariants, inductions). The loop can be
1770 optimized away by scalar optimizations. We're better off not
1771 touching this loop. */
1772 if (!need_to_vectorize)
1773 {
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1778 (vect_location,
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1780 }
1781
1782 return opt_result::success ();
1783 }
1784
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1788
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1791 {
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1799
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1802
1803 return false;
1804 }
1805
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
1809
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1812 {
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1819 {
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1821 {
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1827 }
1828 }
1829
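/* Ask the cost model for the minimum number of iterations at which
vectorization becomes profitable: one threshold for the runtime check and
one based on the static estimate of the iteration count. */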
1830 int min_profitable_iters, min_profitable_estimate;
1831 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1832 &min_profitable_estimate);
1833
1834 if (min_profitable_iters < 0)
1835 {
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "not vectorized: vector version will never be "
1842 "profitable.\n");
1843 return -1;
1844 }
1845
1846 int min_scalar_loop_bound = (param_min_vect_loop_bound
1847 * assumed_vf);
1848
1849 /* Use the cost model only if it is more conservative than the
1850 user-specified threshold. */
1851 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1852 min_profitable_iters);
1853
1854 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1855
1856 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1857 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1858 {
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "not vectorized: iteration count smaller than user "
1865 "specified loop bound parameter or minimum profitable "
1866 "iterations (whichever is more conservative).\n");
1867 return 0;
1868 }
1869
1870 /* The static profitability threshold min_profitable_estimate includes
1871 the cost of having to check at runtime whether the scalar loop
1872 should be used instead. If it turns out that we don't need or want
1873 such a check, the threshold we should use for the static estimate
1874 is simply the point at which the vector loop becomes more profitable
1875 than the scalar loop. */
1876 if (min_profitable_estimate > min_profitable_iters
1877 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1878 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1879 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1881 {
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1884 " choice between the scalar and vector loops\n");
1885 min_profitable_estimate = min_profitable_iters;
1886 }
1887
1888 HOST_WIDE_INT estimated_niter;
1889
1890 /* If we are vectorizing an epilogue then we know the maximum number of
1891 scalar iterations it will cover is at least one lower than the
1892 vectorization factor of the main loop. */
1893 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1894 estimated_niter
1895 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1896 else
1897 {
1898 estimated_niter = estimated_stmt_executions_int (loop);
1899 if (estimated_niter == -1)
1900 estimated_niter = likely_max_stmt_executions_int (loop);
1901 }
1902 if (estimated_niter != -1
1903 && ((unsigned HOST_WIDE_INT) estimated_niter
1904 < MAX (th, (unsigned) min_profitable_estimate)))
1905 {
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "not vectorized: estimated iteration count too "
1909 "small.\n");
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "not vectorized: estimated iteration count smaller "
1913 "than specified loop bound parameter or minimum "
1914 "profitable iterations (whichever is more "
1915 "conservative).\n");
1916 return -1;
1917 }
1918
1919 return 1;
1920 }
1921
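/* Gather into DATAREFS the data references found in the basic blocks BBS
of LOOP and count the non-debug statements in *N_STMTS. In loops with a
safelen, calls to functions with simd clones are skipped when the call
itself contains no data references; any other statement whose data
reference cannot be analyzed causes failure, as does exceeding the
dependence-analysis limit on the number of data references. */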
1922 static opt_result
1923 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1924 vec<data_reference_p> *datarefs,
1925 unsigned int *n_stmts)
1926 {
1927 *n_stmts = 0;
1928 for (unsigned i = 0; i < loop->num_nodes; i++)
1929 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1930 !gsi_end_p (gsi); gsi_next (&gsi))
1931 {
1932 gimple *stmt = gsi_stmt (gsi);
1933 if (is_gimple_debug (stmt))
1934 continue;
1935 ++(*n_stmts);
1936 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1937 NULL, 0);
1938 if (!res)
1939 {
1940 if (is_gimple_call (stmt) && loop->safelen)
1941 {
1942 tree fndecl = gimple_call_fndecl (stmt), op;
1943 if (fndecl != NULL_TREE)
1944 {
1945 cgraph_node *node = cgraph_node::get (fndecl);
1946 if (node != NULL && node->simd_clones != NULL)
1947 {
1948 unsigned int j, n = gimple_call_num_args (stmt);
1949 for (j = 0; j < n; j++)
1950 {
1951 op = gimple_call_arg (stmt, j);
1952 if (DECL_P (op)
1953 || (REFERENCE_CLASS_P (op)
1954 && get_base_address (op)))
1955 break;
1956 }
1957 op = gimple_call_lhs (stmt);
1958 /* Ignore #pragma omp declare simd functions
1959 if they don't have data references in the
1960 call stmt itself. */
1961 if (j == n
1962 && !(op
1963 && (DECL_P (op)
1964 || (REFERENCE_CLASS_P (op)
1965 && get_base_address (op)))))
1966 continue;
1967 }
1968 }
1969 }
1970 return res;
1971 }
1972 /* If dependence analysis would give up due to the limit on the
1973 number of datarefs, stop here and fail fatally. */
1974 if (datarefs->length ()
1975 > (unsigned)param_loop_max_datarefs_for_datadeps)
1976 return opt_result::failure_at (stmt, "exceeded param "
1977 "loop-max-datarefs-for-datadeps\n");
1978 }
1979 return opt_result::success ();
1980 }
1981
1982 /* Look for SLP-only access groups and turn each individual access into its own
1983 group. */
1984 static void
1985 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1986 {
1987 unsigned int i;
1988 struct data_reference *dr;
1989
1990 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1991
1992 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1993 FOR_EACH_VEC_ELT (datarefs, i, dr)
1994 {
1995 gcc_assert (DR_REF (dr));
1996 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1997
1998 /* Check if the access is part of an interleaving chain. */
1999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2000 {
2001 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2002 unsigned int group_size = DR_GROUP_SIZE (first_element);
2003
2004 /* Check for an SLP-only group. */
2005 if (!STMT_SLP_TYPE (stmt_info)
2006 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2007 {
2008 /* Dissolve the group. */
2009 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2010
2011 stmt_vec_info vinfo = first_element;
2012 while (vinfo)
2013 {
2014 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2015 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2016 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2017 DR_GROUP_SIZE (vinfo) = 1;
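/* Each access now forms its own single-element group; for non-strided
accesses the other GROUP_SIZE - 1 elements of the original group become
a gap. */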
2018 if (STMT_VINFO_STRIDED_P (first_element))
2019 DR_GROUP_GAP (vinfo) = 0;
2020 else
2021 DR_GROUP_GAP (vinfo) = group_size - 1;
2022 vinfo = next;
2023 }
2024 }
2025 }
2026 }
2027 }
2028
2029 /* Determine if operating on full vectors for LOOP_VINFO might leave
2030 some scalar iterations still to do. If so, decide how we should
2031 handle those scalar iterations. The possibilities are:
2032
2033 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2034 In this case:
2035
2036 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2037 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2038 LOOP_VINFO_PEELING_FOR_NITER == false
2039
2040 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2041 to handle the remaining scalar iterations. In this case:
2042
2043 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2044 LOOP_VINFO_PEELING_FOR_NITER == true
2045
2046 There are two choices:
2047
2048 (2a) Consider vectorizing the epilogue loop at the same VF as the
2049 main loop, but using partial vectors instead of full vectors.
2050 In this case:
2051
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2053
2054 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2055 In this case:
2056
2057 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2058
2059 When FOR_EPILOGUE_P is true, make this determination based on the
2060 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2061 based on the assumption that LOOP_VINFO is the main loop. The caller
2062 has made sure that the number of iterations is set appropriately for
2063 this value of FOR_EPILOGUE_P. */
2064
2065 opt_result
2066 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2067 bool for_epilogue_p)
2068 {
2069 /* Determine whether there would be any scalar iterations left over. */
2070 bool need_peeling_or_partial_vectors_p
2071 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2072
2073 /* Decide whether to vectorize the loop with partial vectors. */
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2075 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2076 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2077 && need_peeling_or_partial_vectors_p)
2078 {
2079 /* For partial-vector-usage=1, try to push the handling of partial
2080 vectors to the epilogue, with the main loop continuing to operate
2081 on full vectors.
2082
2083 ??? We could then end up failing to use partial vectors if we
2084 decide to peel iterations into a prologue, and if the main loop
2085 then ends up processing fewer than VF iterations. */
2086 if (param_vect_partial_vector_usage == 1
2087 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2088 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2089 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2090 else
2091 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2092 }
2093
2094 if (dump_enabled_p ())
2095 {
2096 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "operating on partial vectors%s.\n",
2099 for_epilogue_p ? " for epilogue loop" : "");
2100 else
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "operating only on full vectors%s.\n",
2103 for_epilogue_p ? " for epilogue loop" : "");
2104 }
2105
2106 if (for_epilogue_p)
2107 {
2108 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2109 gcc_assert (orig_loop_vinfo);
2110 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2111 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2112 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2113 }
2114
2115 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2116 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2117 {
2118 /* Check that the loop processes at least one full vector. */
2119 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2120 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2121 if (known_lt (wi::to_widest (scalar_niters), vf))
2122 return opt_result::failure_at (vect_location,
2123 "loop does not have enough iterations"
2124 " to support vectorization.\n");
2125
2126 /* If we need to peel an extra epilogue iteration to handle data
2127 accesses with gaps, check that there are enough scalar iterations
2128 available.
2129
2130 The check above is redundant with this one when peeling for gaps,
2131 but the distinction is useful for diagnostics. */
2132 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2135 return opt_result::failure_at (vect_location,
2136 "loop does not have enough iterations"
2137 " to support peeling for gaps.\n");
2138 }
2139
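/* An epilogue loop (peeling for niters) is needed exactly when we do not
use partial vectors but some scalar iterations would be left over. */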
2140 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2141 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2142 && need_peeling_or_partial_vectors_p);
2143
2144 return opt_result::success ();
2145 }
2146
2147 /* Function vect_analyze_loop_2.
2148
2149 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2150 for it. The different analyses will record information in the
2151 loop_vec_info struct. */
2152 static opt_result
2153 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2154 {
2155 opt_result ok = opt_result::success ();
2156 int res;
2157 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2158 poly_uint64 min_vf = 2;
2159 loop_vec_info orig_loop_vinfo = NULL;
2160
2161 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2162 loop_vec_info of the first vectorized loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2164 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165 else
2166 orig_loop_vinfo = loop_vinfo;
2167 gcc_assert (orig_loop_vinfo);
2168
2169 /* The first group of checks is independent of the vector size. */
2170 fatal = true;
2171
2172 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2173 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2174 return opt_result::failure_at (vect_location,
2175 "not vectorized: simd if(0)\n");
2176
2177 /* Find all data references in the loop (which correspond to vdefs/vuses)
2178 and analyze their evolution in the loop. */
2179
2180 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2181
2182 /* Gather the data references and count stmts in the loop. */
2183 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2184 {
2185 opt_result res
2186 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2187 &LOOP_VINFO_DATAREFS (loop_vinfo),
2188 n_stmts);
2189 if (!res)
2190 {
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "not vectorized: loop contains function "
2194 "calls or data references that cannot "
2195 "be analyzed\n");
2196 return res;
2197 }
2198 loop_vinfo->shared->save_datarefs ();
2199 }
2200 else
2201 loop_vinfo->shared->check_datarefs ();
2202
2203 /* Analyze the data references and also adjust the minimal
2204 vectorization factor according to the loads and stores. */
2205
2206 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2207 if (!ok)
2208 {
2209 if (dump_enabled_p ())
2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211 "bad data references.\n");
2212 return ok;
2213 }
2214
2215 /* Classify all cross-iteration scalar data-flow cycles.
2216 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2217 vect_analyze_scalar_cycles (loop_vinfo);
2218
2219 vect_pattern_recog (loop_vinfo);
2220
2221 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2222
2223 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2224 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2225
2226 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2227 if (!ok)
2228 {
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231 "bad data access.\n");
2232 return ok;
2233 }
2234
2235 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2236
2237 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2238 if (!ok)
2239 {
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "unexpected pattern.\n");
2243 return ok;
2244 }
2245
2246 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
2247 fatal = false;
2248
2249 /* Analyze data dependences between the data-refs in the loop
2250 and adjust the maximum vectorization factor according to
2251 the dependences.
2252 FORNOW: fail at the first data dependence that we encounter. */
2253
2254 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2255 if (!ok)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad data dependence.\n");
2260 return ok;
2261 }
2262 if (max_vf != MAX_VECTORIZATION_FACTOR
2263 && maybe_lt (max_vf, min_vf))
2264 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2265 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2266
2267 ok = vect_determine_vectorization_factor (loop_vinfo);
2268 if (!ok)
2269 {
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "can't determine vectorization factor.\n");
2273 return ok;
2274 }
2275 if (max_vf != MAX_VECTORIZATION_FACTOR
2276 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2277 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2278
2279 /* Compute the scalar iteration cost. */
2280 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2281
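/* Remember the vectorization factor before any SLP adjustment so that it
can be restored if the analysis has to be retried without SLP. */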
2282 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2283
2284 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2285 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2286 if (!ok)
2287 return ok;
2288
2289 /* If there are any SLP instances mark them as pure_slp. */
2290 bool slp = vect_make_slp_decision (loop_vinfo);
2291 if (slp)
2292 {
2293 /* Find stmts that need to be both vectorized and SLPed. */
2294 vect_detect_hybrid_slp (loop_vinfo);
2295
2296 /* Update the vectorization factor based on the SLP decision. */
2297 vect_update_vf_for_slp (loop_vinfo);
2298
2299 /* Optimize the SLP graph with the vectorization factor fixed. */
2300 vect_optimize_slp (loop_vinfo);
2301 }
2302
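/* Also remember whether partial vectors are still an option; the retry
path below restores this before starting over. */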
2303 bool saved_can_use_partial_vectors_p
2304 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2305
2306 /* We don't expect to have to roll back to anything other than an empty
2307 set of rgroups. */
2308 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2309
2310 /* This is the point where we can re-start analysis with SLP forced off. */
2311 start_over:
2312
2313 /* Now the vectorization factor is final. */
2314 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2315 gcc_assert (known_ne (vectorization_factor, 0U));
2316
2317 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2318 {
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "vectorization_factor = ");
2321 dump_dec (MSG_NOTE, vectorization_factor);
2322 dump_printf (MSG_NOTE, ", niters = %wd\n",
2323 LOOP_VINFO_INT_NITERS (loop_vinfo));
2324 }
2325
2326 /* Analyze the alignment of the data-refs in the loop.
2327 Fail if a data reference is found that cannot be vectorized. */
2328
2329 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2330 if (!ok)
2331 {
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad data alignment.\n");
2335 return ok;
2336 }
2337
2338 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2339 It is important to call pruning after vect_analyze_data_ref_accesses,
2340 since we use grouping information gathered by interleaving analysis. */
2341 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2342 if (!ok)
2343 return ok;
2344
2345 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2346 vectorization, since we do not want to add extra peeling or
2347 add versioning for alignment. */
2348 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2349 /* This pass will decide on using loop versioning and/or loop peeling in
2350 order to enhance the alignment of data references in the loop. */
2351 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2352 if (!ok)
2353 return ok;
2354
2355 if (slp)
2356 {
2357 /* Analyze operations in the SLP instances. Note this may
2358 remove unsupported SLP instances which makes the above
2359 SLP kind detection invalid. */
2360 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2361 vect_slp_analyze_operations (loop_vinfo);
2362 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2363 {
2364 ok = opt_result::failure_at (vect_location,
2365 "unsupported SLP instances\n");
2366 goto again;
2367 }
2368 }
2369
2370 /* Dissolve SLP-only groups. */
2371 vect_dissolve_slp_only_groups (loop_vinfo);
2372
2373 /* Scan all the remaining operations in the loop that are not subject
2374 to SLP and make sure they are vectorizable. */
2375 ok = vect_analyze_loop_operations (loop_vinfo);
2376 if (!ok)
2377 {
2378 if (dump_enabled_p ())
2379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2380 "bad operation or unsupported loop bound.\n");
2381 return ok;
2382 }
2383
2384 /* For now, we don't expect to mix the masking and length approaches for one
2385 loop, so disable partial vectors if both are recorded. */
2386 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2387 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2388 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2389 {
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392 "can't vectorize a loop with partial vectors"
2393 " because we don't expect to mix different"
2394 " approaches with partial vectors for the"
2395 " same loop.\n");
2396 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2397 }
2398
2399 /* If we still have the option of using partial vectors,
2400 check whether we can generate the necessary loop controls. */
2401 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2402 && !vect_verify_full_masking (loop_vinfo)
2403 && !vect_verify_loop_lens (loop_vinfo))
2404 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2405
2406 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2407 to be able to handle fewer than VF scalars, or needs to have a lower VF
2408 than the main loop. */
2409 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2410 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2411 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2412 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2413 return opt_result::failure_at (vect_location,
2414 "Vectorization factor too high for"
2415 " epilogue loop.\n");
2416
2417 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2418 assuming that the loop will be used as a main loop. We will redo
2419 this analysis later if we instead decide to use the loop as an
2420 epilogue loop. */
2421 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2422 if (!ok)
2423 return ok;
2424
2425 /* Check the costings of the loop make vectorizing worthwhile. */
2426 res = vect_analyze_loop_costing (loop_vinfo);
2427 if (res < 0)
2428 {
2429 ok = opt_result::failure_at (vect_location,
2430 "Loop costings may not be worthwhile.\n");
2431 goto again;
2432 }
2433 if (!res)
2434 return opt_result::failure_at (vect_location,
2435 "Loop costings not worthwhile.\n");
2436
2437 /* If an epilogue loop is required make sure we can create one. */
2438 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2439 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2440 {
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2443 if (!vect_can_advance_ivs_p (loop_vinfo)
2444 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2445 single_exit (LOOP_VINFO_LOOP
2446 (loop_vinfo))))
2447 {
2448 ok = opt_result::failure_at (vect_location,
2449 "not vectorized: can't create required "
2450 "epilog loop\n");
2451 goto again;
2452 }
2453 }
2454
2455 /* During peeling, we need to check that the number of loop iterations is
2456 enough for both the peeled prolog loop and the vector loop. This check
2457 can be merged with the threshold check of loop versioning, so
2458 increase the threshold for this case if necessary.
2459
2460 If we are analyzing an epilogue we still want to check what its
2461 versioning threshold would be. If we decide to vectorize the epilogues we
2462 will want to use the lowest versioning threshold of all epilogues and main
2463 loop. This will enable us to enter a vectorized epilogue even when
2464 versioning the loop. We can't simply check whether the epilogue requires
2465 versioning though since we may have skipped some versioning checks when
2466 analyzing the epilogue. For instance, checks for alias versioning will be
2467 skipped when dealing with epilogues as we assume we already checked them
2468 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2469 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2470 {
2471 poly_uint64 niters_th = 0;
2472 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2473
2474 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2475 {
2476 /* Niters for peeled prolog loop. */
2477 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2478 {
2479 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2480 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2481 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2482 }
2483 else
2484 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2485 }
2486
2487 /* Niters for at least one iteration of vectorized loop. */
2488 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2489 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2490 /* One additional iteration because of peeling for gap. */
2491 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2492 niters_th += 1;
2493
2494 /* Use the same condition as vect_transform_loop to decide when to use
2495 the cost to determine a versioning threshold. */
2496 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2497 && ordered_p (th, niters_th))
2498 niters_th = ordered_max (poly_uint64 (th), niters_th);
2499
2500 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2501 }
2502
2503 gcc_assert (known_eq (vectorization_factor,
2504 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2505
2506 /* Ok to vectorize! */
2507 return opt_result::success ();
2508
2509 again:
2510 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2511 gcc_assert (!ok);
2512
2513 /* Try again with SLP forced off but if we didn't do any SLP there is
2514 no point in re-trying. */
2515 if (!slp)
2516 return ok;
2517
2518 /* If there are reduction chains re-trying will fail anyway. */
2519 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2520 return ok;
2521
2522 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2523 via interleaving or lane instructions. */
2524 slp_instance instance;
2525 slp_tree node;
2526 unsigned i, j;
2527 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2528 {
2529 stmt_vec_info vinfo;
2530 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2531 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2532 continue;
2533 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2534 unsigned int size = DR_GROUP_SIZE (vinfo);
2535 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2536 if (! vect_store_lanes_supported (vectype, size, false)
2537 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2538 && ! vect_grouped_store_supported (vectype, size))
2539 return opt_result::failure_at (vinfo->stmt,
2540 "unsupported grouped store\n");
2541 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2542 {
2543 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2544 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2545 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2546 size = DR_GROUP_SIZE (vinfo);
2547 vectype = STMT_VINFO_VECTYPE (vinfo);
2548 if (! vect_load_lanes_supported (vectype, size, false)
2549 && ! vect_grouped_load_supported (vectype, single_element_p,
2550 size))
2551 return opt_result::failure_at (vinfo->stmt,
2552 "unsupported grouped load\n");
2553 }
2554 }
2555
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "re-trying with SLP disabled\n");
2559
2560 /* Roll back state appropriately. No SLP this time. */
2561 slp = false;
2562 /* Restore the vectorization factor to its value without SLP. */
2563 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2564 /* Free the SLP instances. */
2565 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2566 vect_free_slp_instance (instance);
2567 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2568 /* Reset SLP type to loop_vect on all stmts. */
2569 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2570 {
2571 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2572 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2573 !gsi_end_p (si); gsi_next (&si))
2574 {
2575 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2576 STMT_SLP_TYPE (stmt_info) = loop_vect;
2577 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2578 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2579 {
2580 /* vectorizable_reduction adjusts reduction stmt def-types,
2581 restore them to that of the PHI. */
2582 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2583 = STMT_VINFO_DEF_TYPE (stmt_info);
2584 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2585 (STMT_VINFO_REDUC_DEF (stmt_info)))
2586 = STMT_VINFO_DEF_TYPE (stmt_info);
2587 }
2588 }
2589 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2590 !gsi_end_p (si); gsi_next (&si))
2591 {
2592 if (is_gimple_debug (gsi_stmt (si)))
2593 continue;
2594 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2595 STMT_SLP_TYPE (stmt_info) = loop_vect;
2596 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2597 {
2598 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2599 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2600 STMT_SLP_TYPE (stmt_info) = loop_vect;
2601 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2602 !gsi_end_p (pi); gsi_next (&pi))
2603 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2604 = loop_vect;
2605 }
2606 }
2607 }
2608 /* Free optimized alias test DDRS. */
2609 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2610 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2611 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2612 /* Reset target cost data. */
2613 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2614 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2615 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2616 /* Reset accumulated rgroup information. */
2617 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2618 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2619 /* Reset assorted flags. */
2620 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2621 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2622 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2623 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2624 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625 = saved_can_use_partial_vectors_p;
2626
2627 goto start_over;
2628 }
2629
2630 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2631 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2632 OLD_LOOP_VINFO is better unless something specifically indicates
2633 otherwise.
2634
2635 Note that this deliberately isn't a partial order. */
2636
2637 static bool
2638 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2639 loop_vec_info old_loop_vinfo)
2640 {
2641 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2642 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2643
2644 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2645 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2646
2647 /* Always prefer a VF of loop->simdlen over any other VF. */
2648 if (loop->simdlen)
2649 {
2650 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2651 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2652 if (new_simdlen_p != old_simdlen_p)
2653 return new_simdlen_p;
2654 }
2655
2656 /* Limit the VFs to what is likely to be the maximum number of iterations,
2657 to handle cases in which at least one loop_vinfo is fully-masked. */
2658 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2659 if (estimated_max_niter != -1)
2660 {
2661 if (known_le (estimated_max_niter, new_vf))
2662 new_vf = estimated_max_niter;
2663 if (known_le (estimated_max_niter, old_vf))
2664 old_vf = estimated_max_niter;
2665 }
2666
2667 /* Check whether the (fractional) cost per scalar iteration is lower
2668 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2669 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2670 * poly_widest_int (old_vf));
2671 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2672 * poly_widest_int (new_vf));
2673 if (maybe_lt (rel_old, rel_new))
2674 {
2675 /* When old_loop_vinfo uses a variable vectorization factor,
2676 we know that it has a lower cost for at least one runtime VF.
2677 However, we don't know how likely that VF is.
2678
2679 One option would be to compare the costs for the estimated VFs.
2680 The problem is that that can put too much pressure on the cost
2681 model. E.g. if the estimated VF is also the lowest possible VF,
2682 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2683 for the estimated VF, we'd then choose new_loop_vinfo even
2684 though (a) new_loop_vinfo might not actually be better than
2685 old_loop_vinfo for that VF and (b) it would be significantly
2686 worse at larger VFs.
2687
2688 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2689 no more expensive than old_loop_vinfo even after doubling the
2690 estimated old_loop_vinfo VF. For all but trivial loops, this
2691 ensures that we only pick new_loop_vinfo if it is significantly
2692 better than old_loop_vinfo at the estimated VF. */
2693 if (rel_new.is_constant ())
2694 return false;
2695
2696 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2697 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2698 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2699 * widest_int (old_estimated_vf));
2700 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2701 * widest_int (new_estimated_vf));
2702 return estimated_rel_new * 2 <= estimated_rel_old;
2703 }
2704 if (known_lt (rel_new, rel_old))
2705 return true;
2706
2707 /* If there's nothing to choose between the loop bodies, see whether
2708 there's a difference in the prologue and epilogue costs. */
2709 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2710 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2711
2712 return false;
2713 }
2714
2715 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2716 true if we should. */
2717
2718 static bool
2719 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2720 loop_vec_info old_loop_vinfo)
2721 {
2722 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2723 return false;
2724
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location,
2727 "***** Preferring vector mode %s to vector mode %s\n",
2728 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2729 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2730 return true;
2731 }
2732
2733 /* Function vect_analyze_loop.
2734
2735 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2736 for it. The different analyses will record information in the
2737 loop_vec_info struct. */
2738 opt_loop_vec_info
2739 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2740 {
2741 auto_vector_modes vector_modes;
2742
2743 /* Autodetect first vector size we try. */
2744 unsigned int autovec_flags
2745 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2746 loop->simdlen != 0);
2747 unsigned int mode_i = 0;
2748
2749 DUMP_VECT_SCOPE ("analyze_loop_nest");
2750
2751 if (loop_outer (loop)
2752 && loop_vec_info_for_loop (loop_outer (loop))
2753 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2754 return opt_loop_vec_info::failure_at (vect_location,
2755 "outer-loop already vectorized.\n");
2756
2757 if (!find_loop_nest (loop, &shared->loop_nest))
2758 return opt_loop_vec_info::failure_at
2759 (vect_location,
2760 "not vectorized: loop nest containing two or more consecutive inner"
2761 " loops cannot be vectorized\n");
2762
2763 unsigned n_stmts = 0;
2764 machine_mode autodetected_vector_mode = VOIDmode;
2765 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2766 machine_mode next_vector_mode = VOIDmode;
2767 poly_uint64 lowest_th = 0;
2768 unsigned vectorized_loops = 0;
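/* Whether to compare the costs of all vector modes that succeed and keep
the cheapest one, rather than committing to the first mode that can be
vectorized. */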
2769 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2770 && !unlimited_cost_model (loop));
2771
2772 bool vect_epilogues = false;
2773 opt_result res = opt_result::success ();
2774 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2775 while (1)
2776 {
2777 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2778 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2779 if (!loop_vinfo)
2780 {
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783 "bad loop form.\n");
2784 gcc_checking_assert (first_loop_vinfo == NULL);
2785 return loop_vinfo;
2786 }
2787 loop_vinfo->vector_mode = next_vector_mode;
2788
2789 bool fatal = false;
2790
2791 /* When pick_lowest_cost_p is true, we should in principle iterate
2792 over all the loop_vec_infos that LOOP_VINFO could replace and
2793 try to vectorize LOOP_VINFO under the same conditions.
2794 E.g. when trying to replace an epilogue loop, we should vectorize
2795 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2796 to replace the main loop, we should vectorize LOOP_VINFO as a main
2797 loop too.
2798
2799 However, autovectorize_vector_modes is usually sorted as follows:
2800
2801 - Modes that naturally produce lower VFs usually follow modes that
2802 naturally produce higher VFs.
2803
2804 - When modes naturally produce the same VF, maskable modes
2805 usually follow unmaskable ones, so that the maskable mode
2806 can be used to vectorize the epilogue of the unmaskable mode.
2807
2808 This order is preferred because it leads to the maximum
2809 epilogue vectorization opportunities. Targets should only use
2810 a different order if they want to make wide modes available while
2811 disparaging them relative to earlier, smaller modes. The assumption
2812 in that case is that the wider modes are more expensive in some
2813 way that isn't reflected directly in the costs.
2814
2815 There should therefore be few interesting cases in which
2816 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2817 treated as a standalone loop, and ends up being genuinely cheaper
2818 than FIRST_LOOP_VINFO. */
2819 if (vect_epilogues)
2820 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2821
2822 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2823 if (mode_i == 0)
2824 autodetected_vector_mode = loop_vinfo->vector_mode;
2825 if (dump_enabled_p ())
2826 {
2827 if (res)
2828 dump_printf_loc (MSG_NOTE, vect_location,
2829 "***** Analysis succeeded with vector mode %s\n",
2830 GET_MODE_NAME (loop_vinfo->vector_mode));
2831 else
2832 dump_printf_loc (MSG_NOTE, vect_location,
2833 "***** Analysis failed with vector mode %s\n",
2834 GET_MODE_NAME (loop_vinfo->vector_mode));
2835 }
2836
2837 loop->aux = NULL;
2838
2839 if (!fatal)
2840 while (mode_i < vector_modes.length ()
2841 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2842 {
2843 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_NOTE, vect_location,
2845 "***** The result for vector mode %s would"
2846 " be the same\n",
2847 GET_MODE_NAME (vector_modes[mode_i]));
2848 mode_i += 1;
2849 }
2850
2851 if (res)
2852 {
2853 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2854 vectorized_loops++;
2855
2856 /* Once we hit the desired simdlen for the first time,
2857 discard any previous attempts. */
2858 if (simdlen
2859 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2860 {
2861 delete first_loop_vinfo;
2862 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2863 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2864 simdlen = 0;
2865 }
2866 else if (pick_lowest_cost_p && first_loop_vinfo)
2867 {
2868 /* Keep trying to roll back vectorization attempts while the
2869 loop_vec_infos they produced were worse than this one. */
2870 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2871 while (!vinfos.is_empty ()
2872 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2873 {
2874 gcc_assert (vect_epilogues);
2875 delete vinfos.pop ();
2876 }
2877 if (vinfos.is_empty ()
2878 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2879 {
2880 delete first_loop_vinfo;
2881 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2882 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2883 }
2884 }
2885
2886 if (first_loop_vinfo == NULL)
2887 {
2888 first_loop_vinfo = loop_vinfo;
2889 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2890 }
2891 else if (vect_epilogues
2892 /* For now only allow one epilogue loop. */
2893 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2894 {
2895 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2896 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2897 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2898 || maybe_ne (lowest_th, 0U));
2899 /* Keep track of the known smallest versioning
2900 threshold. */
2901 if (ordered_p (lowest_th, th))
2902 lowest_th = ordered_min (lowest_th, th);
2903 }
2904 else
2905 {
2906 delete loop_vinfo;
2907 loop_vinfo = opt_loop_vec_info::success (NULL);
2908 }
2909
2910 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2911 enabled, SIMDUID is not set, it is the innermost loop and we have
2912 either already found the loop's SIMDLEN or there was no SIMDLEN to
2913 begin with.
2914 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2915 vect_epilogues = (!simdlen
2916 && loop->inner == NULL
2917 && param_vect_epilogues_nomask
2918 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2919 && !loop->simduid
2920 /* For now only allow one epilogue loop, but allow
2921 pick_lowest_cost_p to replace it. */
2922 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2923 || pick_lowest_cost_p));
2924
2925 /* Commit to first_loop_vinfo if we have no reason to try
2926 alternatives. */
2927 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2928 break;
2929 }
2930 else
2931 {
2932 delete loop_vinfo;
2933 loop_vinfo = opt_loop_vec_info::success (NULL);
2934 if (fatal)
2935 {
2936 gcc_checking_assert (first_loop_vinfo == NULL);
2937 break;
2938 }
2939 }
2940
2941 /* Handle the case where the original loop can use partial
2942 vectorization, but we only want to adopt it for the epilogue.
2943 The retry should be in the same vector mode as the original. */
2944 if (vect_epilogues
2945 && loop_vinfo
2946 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2947 {
2948 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2949 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2950 if (dump_enabled_p ())
2951 dump_printf_loc (MSG_NOTE, vect_location,
2952 "***** Re-trying analysis with same vector mode"
2953 " %s for epilogue with partial vectors.\n",
2954 GET_MODE_NAME (loop_vinfo->vector_mode));
2955 continue;
2956 }
2957
2958 if (mode_i < vector_modes.length ()
2959 && VECTOR_MODE_P (autodetected_vector_mode)
2960 && (related_vector_mode (vector_modes[mode_i],
2961 GET_MODE_INNER (autodetected_vector_mode))
2962 == autodetected_vector_mode)
2963 && (related_vector_mode (autodetected_vector_mode,
2964 GET_MODE_INNER (vector_modes[mode_i]))
2965 == vector_modes[mode_i]))
2966 {
2967 if (dump_enabled_p ())
2968 dump_printf_loc (MSG_NOTE, vect_location,
2969 "***** Skipping vector mode %s, which would"
2970 " repeat the analysis for %s\n",
2971 GET_MODE_NAME (vector_modes[mode_i]),
2972 GET_MODE_NAME (autodetected_vector_mode));
2973 mode_i += 1;
2974 }
2975
2976 if (mode_i == vector_modes.length ()
2977 || autodetected_vector_mode == VOIDmode)
2978 break;
2979
2980 /* Try the next biggest vector size. */
2981 next_vector_mode = vector_modes[mode_i++];
2982 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_NOTE, vect_location,
2984 "***** Re-trying analysis with vector mode %s\n",
2985 GET_MODE_NAME (next_vector_mode));
2986 }
2987
2988 if (first_loop_vinfo)
2989 {
2990 loop->aux = (loop_vec_info) first_loop_vinfo;
2991 if (dump_enabled_p ())
2992 dump_printf_loc (MSG_NOTE, vect_location,
2993 "***** Choosing vector mode %s\n",
2994 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2995 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2996 return first_loop_vinfo;
2997 }
2998
2999 return opt_loop_vec_info::propagate_failure (res);
3000 }
3001
3002 /* Return true if there is an in-order reduction function for CODE, storing
3003 it in *REDUC_FN if so. */
3004
3005 static bool
3006 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3007 {
3008 switch (code)
3009 {
3010 case PLUS_EXPR:
3011 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3012 return true;
3013
3014 default:
3015 return false;
3016 }
3017 }
3018
3019 /* Function reduction_fn_for_scalar_code
3020
3021 Input:
3022 CODE - the tree_code of a reduction operation.
3023
3024 Output:
3025 REDUC_FN - the corresponding internal function to be used to reduce the
3026 vector of partial results into a single scalar result, or IFN_LAST
3027 if the operation is a supported reduction operation, but does not have
3028 such an internal function.
3029
3030 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3031
3032 static bool
3033 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3034 {
3035 switch (code)
3036 {
3037 case MAX_EXPR:
3038 *reduc_fn = IFN_REDUC_MAX;
3039 return true;
3040
3041 case MIN_EXPR:
3042 *reduc_fn = IFN_REDUC_MIN;
3043 return true;
3044
3045 case PLUS_EXPR:
3046 *reduc_fn = IFN_REDUC_PLUS;
3047 return true;
3048
3049 case BIT_AND_EXPR:
3050 *reduc_fn = IFN_REDUC_AND;
3051 return true;
3052
3053 case BIT_IOR_EXPR:
3054 *reduc_fn = IFN_REDUC_IOR;
3055 return true;
3056
3057 case BIT_XOR_EXPR:
3058 *reduc_fn = IFN_REDUC_XOR;
3059 return true;
3060
3061 case MULT_EXPR:
3062 case MINUS_EXPR:
3063 *reduc_fn = IFN_LAST;
3064 return true;
3065
3066 default:
3067 return false;
3068 }
3069 }
3070
3071 /* If there is a neutral value X such that the SLP reduction SLP_NODE would not
3072 be affected by the introduction of additional X elements, return that X,
3073 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3074 is the vector type that would hold element X. REDUC_CHAIN is true if
3075 the SLP statements perform a single reduction, false if each statement
3076 performs an independent reduction. */
3077
3078 static tree
3079 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3080 tree_code code, bool reduc_chain)
3081 {
3082 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3083 stmt_vec_info stmt_vinfo = stmts[0];
3084 tree scalar_type = TREE_TYPE (vector_type);
3085 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3086 gcc_assert (loop);
3087
3088 switch (code)
3089 {
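/* For these codes the accumulation is additive (or OR/XOR-like), so
extra zero elements leave the result unchanged. */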
3090 case WIDEN_SUM_EXPR:
3091 case DOT_PROD_EXPR:
3092 case SAD_EXPR:
3093 case PLUS_EXPR:
3094 case MINUS_EXPR:
3095 case BIT_IOR_EXPR:
3096 case BIT_XOR_EXPR:
3097 return build_zero_cst (scalar_type);
3098
3099 case MULT_EXPR:
3100 return build_one_cst (scalar_type);
3101
3102 case BIT_AND_EXPR:
3103 return build_all_ones_cst (scalar_type);
3104
3105 case MAX_EXPR:
3106 case MIN_EXPR:
3107 /* For MIN/MAX the initial values are neutral. A reduction chain
3108 has only a single initial value, so that value is neutral for
3109 all statements. */
3110 if (reduc_chain)
3111 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3112 loop_preheader_edge (loop));
3113 return NULL_TREE;
3114
3115 default:
3116 return NULL_TREE;
3117 }
3118 }
3119
3120 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3121 STMT is printed with a message MSG. */
3122
3123 static void
3124 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3125 {
3126 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3127 }
3128
3129 /* Return true if we need an in-order reduction for operation CODE
3130 on type TYPE. */
3132
3133 bool
3134 needs_fold_left_reduction_p (tree type, tree_code code)
3135 {
3136 /* CHECKME: check for !flag_finite_math_only too? */
3137 if (SCALAR_FLOAT_TYPE_P (type))
3138 switch (code)
3139 {
3140 case MIN_EXPR:
3141 case MAX_EXPR:
3142 return false;
3143
3144 default:
3145 return !flag_associative_math;
3146 }
3147
3148 if (INTEGRAL_TYPE_P (type))
3149 {
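/* Reassociating the reduction could introduce a trapping overflow, so an
in-order reduction is needed when the operation can trap on overflow. */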
3150 if (!operation_no_trapping_overflow (type, code))
3151 return true;
3152 return false;
3153 }
3154
3155 if (SAT_FIXED_POINT_TYPE_P (type))
3156 return true;
3157
3158 return false;
3159 }
3160
3161 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3162 has a handled computation expression. Store the main reduction
3163 operation in *CODE. */
3164
3165 static bool
3166 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3167 tree loop_arg, enum tree_code *code,
3168 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3169 {
3170 auto_bitmap visited;
3171 tree lookfor = PHI_RESULT (phi);
3172 ssa_op_iter curri;
3173 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3174 while (USE_FROM_PTR (curr) != loop_arg)
3175 curr = op_iter_next_use (&curri);
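/* The iterator of the initial PHI entry is exhausted up front so that
backtracking to it ends the walk instead of exploring the other PHI
arguments. */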
3176 curri.i = curri.numops;
3177 do
3178 {
3179 path.safe_push (std::make_pair (curri, curr));
3180 tree use = USE_FROM_PTR (curr);
3181 if (use == lookfor)
3182 break;
3183 gimple *def = SSA_NAME_DEF_STMT (use);
3184 if (gimple_nop_p (def)
3185 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3186 {
3187 pop:
3188 do
3189 {
3190 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3191 curri = x.first;
3192 curr = x.second;
3193 do
3194 curr = op_iter_next_use (&curri);
3195 /* Skip already visited or non-SSA operands (from iterating
3196 over PHI args). */
3197 while (curr != NULL_USE_OPERAND_P
3198 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3199 || ! bitmap_set_bit (visited,
3200 SSA_NAME_VERSION
3201 (USE_FROM_PTR (curr)))));
3202 }
3203 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3204 if (curr == NULL_USE_OPERAND_P)
3205 break;
3206 }
3207 else
3208 {
3209 if (gimple_code (def) == GIMPLE_PHI)
3210 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3211 else
3212 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3213 while (curr != NULL_USE_OPERAND_P
3214 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3215 || ! bitmap_set_bit (visited,
3216 SSA_NAME_VERSION
3217 (USE_FROM_PTR (curr)))))
3218 curr = op_iter_next_use (&curri);
3219 if (curr == NULL_USE_OPERAND_P)
3220 goto pop;
3221 }
3222 }
3223 while (1);
3224 if (dump_file && (dump_flags & TDF_DETAILS))
3225 {
3226 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3227 unsigned i;
3228 std::pair<ssa_op_iter, use_operand_p> *x;
3229 FOR_EACH_VEC_ELT (path, i, x)
3230 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3231 dump_printf (MSG_NOTE, "\n");
3232 }
3233
3234 /* Check whether the reduction path detected is valid. */
3235 bool fail = path.length () == 0;
3236 bool neg = false;
3237 int sign = -1;
3238 *code = ERROR_MARK;
3239 for (unsigned i = 1; i < path.length (); ++i)
3240 {
3241 gimple *use_stmt = USE_STMT (path[i].second);
3242 tree op = USE_FROM_PTR (path[i].second);
3243 if (! is_gimple_assign (use_stmt)
3244 /* The following makes sure we can compute the operand index
3245 easily, and it mostly disallows chaining via COND_EXPR condition
3246 operands. */
3247 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3248 && (gimple_num_ops (use_stmt) <= 2
3249 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3250 && (gimple_num_ops (use_stmt) <= 3
3251 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3252 {
3253 fail = true;
3254 break;
3255 }
3256 /* Check that the op is used in only a single stmt inside
3257 the loop. */
3258 imm_use_iterator imm_iter;
3259 gimple *op_use_stmt;
3260 unsigned cnt = 0;
3261 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3262 if (!is_gimple_debug (op_use_stmt)
3263 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3264 {
3265 /* We want to allow x + x but not x < 1 ? x : 2. */
3266 if (is_gimple_assign (op_use_stmt)
3267 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3268 {
3269 use_operand_p use_p;
3270 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3271 cnt++;
3272 }
3273 else
3274 cnt++;
3275 }
3276 if (cnt != 1)
3277 {
3278 fail = true;
3279 break;
3280 }
3281 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3282 if (use_code == MINUS_EXPR)
3283 {
3284 use_code = PLUS_EXPR;
3285 /* Track whether we negate the reduction value each iteration. */
3286 if (gimple_assign_rhs2 (use_stmt) == op)
3287 neg = ! neg;
3288 }
3289 if (CONVERT_EXPR_CODE_P (use_code)
3290 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3291 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3292 ;
3293 else if (*code == ERROR_MARK)
3294 {
3295 *code = use_code;
3296 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3297 }
3298 else if (use_code != *code)
3299 {
3300 fail = true;
3301 break;
3302 }
3303 else if ((use_code == MIN_EXPR
3304 || use_code == MAX_EXPR)
3305 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3306 {
3307 fail = true;
3308 break;
3309 }
3310 }
3311 return ! fail && ! neg && *code != ERROR_MARK;
3312 }
3313
3314 bool
3315 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3316 tree loop_arg, enum tree_code code)
3317 {
3318 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3319 enum tree_code code_;
3320 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3321 && code_ == code);
3322 }
3323
3324
3325
3326 /* Function vect_is_simple_reduction
3327
3328 (1) Detect a cross-iteration def-use cycle that represents a simple
3329 reduction computation. We look for the following pattern:
3330
3331 loop_header:
3332 a1 = phi < a0, a2 >
3333 a3 = ...
3334 a2 = operation (a3, a1)
3335
3336 or
3337
3338 a3 = ...
3339 loop_header:
3340 a1 = phi < a0, a2 >
3341 a2 = operation (a3, a1)
3342
3343 such that:
3344 1. operation is commutative and associative and it is safe to
3345 change the order of the computation
3346 2. no uses for a2 in the loop (a2 is used out of the loop)
3347 3. no uses of a1 in the loop besides the reduction operation
3348 4. no uses of a1 outside the loop.
3349
3350 Conditions 1,4 are tested here.
3351 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3352
3353 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3354 nested cycles.
3355
3356 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3357 reductions:
3358
3359 a1 = phi < a0, a2 >
3360 inner loop (def of a3)
3361 a2 = phi < a3 >
3362
3363 (4) Detect condition expressions, i.e.:
3364 for (int i = 0; i < N; i++)
3365 if (a[i] < val)
3366 ret_val = a[i];
3367
3368 */
3369
3370 static stmt_vec_info
3371 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3372 bool *double_reduc, bool *reduc_chain_p)
3373 {
3374 gphi *phi = as_a <gphi *> (phi_info->stmt);
3375 gimple *phi_use_stmt = NULL;
3376 imm_use_iterator imm_iter;
3377 use_operand_p use_p;
3378
3379 *double_reduc = false;
3380 *reduc_chain_p = false;
3381 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3382
3383 tree phi_name = PHI_RESULT (phi);
3384 /* ??? If there are no uses of the PHI result the inner loop reduction
3385 won't be detected as possibly double-reduction by vectorizable_reduction
3386 because that tries to walk the PHI arg from the preheader edge which
3387 can be constant. See PR60382. */
3388 if (has_zero_uses (phi_name))
3389 return NULL;
3390 class loop *loop = (gimple_bb (phi))->loop_father;
3391 unsigned nphi_def_loop_uses = 0;
3392 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3393 {
3394 gimple *use_stmt = USE_STMT (use_p);
3395 if (is_gimple_debug (use_stmt))
3396 continue;
3397
3398 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3399 {
3400 if (dump_enabled_p ())
3401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3402 "intermediate value used outside loop.\n");
3403
3404 return NULL;
3405 }
3406
3407 nphi_def_loop_uses++;
3408 phi_use_stmt = use_stmt;
3409 }
3410
3411 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3412 if (TREE_CODE (latch_def) != SSA_NAME)
3413 {
3414 if (dump_enabled_p ())
3415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3416 "reduction: not ssa_name: %T\n", latch_def);
3417 return NULL;
3418 }
3419
3420 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3421 if (!def_stmt_info
3422 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3423 return NULL;
3424
3425 bool nested_in_vect_loop
3426 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3427 unsigned nlatch_def_loop_uses = 0;
3428 auto_vec<gphi *, 3> lcphis;
3429 bool inner_loop_of_double_reduc = false;
3430 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3431 {
3432 gimple *use_stmt = USE_STMT (use_p);
3433 if (is_gimple_debug (use_stmt))
3434 continue;
3435 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3436 nlatch_def_loop_uses++;
3437 else
3438 {
3439 /* We can have more than one loop-closed PHI. */
3440 lcphis.safe_push (as_a <gphi *> (use_stmt));
3441 if (nested_in_vect_loop
3442 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3443 == vect_double_reduction_def))
3444 inner_loop_of_double_reduc = true;
3445 }
3446 }
3447
3448 /* If we are vectorizing an inner reduction, we execute it in the
3449 original order only when we are not dealing with a double
3450 reduction. */
3451 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3452 {
3453 if (dump_enabled_p ())
3454 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3455 "detected nested cycle: ");
3456 return def_stmt_info;
3457 }
3458
3459 /* If this isn't a nested cycle, or if the nested cycle reduction value
3460 is used outside of the inner loop, we cannot handle uses of the
3461 reduction value. */
3462 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3463 {
3464 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3466 "reduction used in loop.\n");
3467 return NULL;
3468 }
3469
3470 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3471 defined in the inner loop. */
3472 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3473 {
3474 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3475 if (gimple_phi_num_args (def_stmt) != 1
3476 || TREE_CODE (op1) != SSA_NAME)
3477 {
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480 "unsupported phi node definition.\n");
3481
3482 return NULL;
3483 }
3484
3485 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3486 if (gimple_bb (def1)
3487 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3488 && loop->inner
3489 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3490 && is_gimple_assign (def1)
3491 && is_a <gphi *> (phi_use_stmt)
3492 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3493 {
3494 if (dump_enabled_p ())
3495 report_vect_op (MSG_NOTE, def_stmt,
3496 "detected double reduction: ");
3497
3498 *double_reduc = true;
3499 return def_stmt_info;
3500 }
3501
3502 return NULL;
3503 }
3504
3505 /* Look for the expression computing latch_def from the loop PHI result. */
3506 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3507 enum tree_code code;
3508 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3509 path))
3510 {
3511 STMT_VINFO_REDUC_CODE (phi_info) = code;
3512 if (code == COND_EXPR && !nested_in_vect_loop)
3513 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3514
3515 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3516 reduction chain for which the additional restriction is that
3517 all operations in the chain are the same. */
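 /* For example, a reduction chain of group size four looks like

      sum = sum + a[4*i + 0];
      sum = sum + a[4*i + 1];
      sum = sum + a[4*i + 2];
      sum = sum + a[4*i + 3];

    where each statement feeds the next and all statements use the same
    operation code. */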
3518 auto_vec<stmt_vec_info, 8> reduc_chain;
3519 unsigned i;
3520 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3521 for (i = path.length () - 1; i >= 1; --i)
3522 {
3523 gimple *stmt = USE_STMT (path[i].second);
3524 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3525 STMT_VINFO_REDUC_IDX (stmt_info)
3526 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3527 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3528 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3529 && (i == 1 || i == path.length () - 1));
3530 if ((stmt_code != code && !leading_conversion)
3531 /* We can only handle the final value in epilogue
3532 generation for reduction chains. */
3533 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3534 is_slp_reduc = false;
3535 /* For reduction chains we support trailing/leading
3536 conversions. We do not store those in the actual chain. */
3537 if (leading_conversion)
3538 continue;
3539 reduc_chain.safe_push (stmt_info);
3540 }
3541 if (is_slp_reduc && reduc_chain.length () > 1)
3542 {
3543 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3544 {
3545 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3546 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3547 }
3548 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3549 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3550
3551 /* Save the chain for further analysis in SLP detection. */
3552 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3553 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3554
3555 *reduc_chain_p = true;
3556 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "reduction: detected reduction chain\n");
3559 }
3560 else if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "reduction: detected reduction\n");
3563
3564 return def_stmt_info;
3565 }
3566
3567 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_NOTE, vect_location,
3569 "reduction: unknown pattern\n");
3570
3571 return NULL;
3572 }
3573
3574 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3575 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3576 or -1 if not known. */
3577
3578 static int
3579 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3580 {
3581 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3582 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3583 {
3584 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_NOTE, vect_location,
3586 "cost model: epilogue peel iters set to vf/2 "
3587 "because loop iterations are unknown .\n");
3588 return assumed_vf / 2;
3589 }
3590 else
3591 {
3592 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3593 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3594 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
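 /* For example (illustrative values only): with NITERS == 103,
    PEEL_ITERS_PROLOGUE == 3 and ASSUMED_VF == 8 this computes
    (103 - 3) % 8 == 4 epilogue iterations. */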
3595 /* If we need to peel for gaps but no epilogue peeling would otherwise
3596 be required, we have to peel VF iterations. */
3597 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3598 peel_iters_epilogue = assumed_vf;
3599 return peel_iters_epilogue;
3600 }
3601 }
3602
3603 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3604 int
3605 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3606 int *peel_iters_epilogue,
3607 stmt_vector_for_cost *scalar_cost_vec,
3608 stmt_vector_for_cost *prologue_cost_vec,
3609 stmt_vector_for_cost *epilogue_cost_vec)
3610 {
3611 int retval = 0;
3612
3613 *peel_iters_epilogue
3614 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3615
3616 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3617 {
3618 /* If peeled iterations are known but the number of scalar loop
3619 iterations is unknown, count a taken branch per peeled loop. */
3620 if (peel_iters_prologue > 0)
3621 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3622 NULL, NULL_TREE, 0, vect_prologue);
3623 if (*peel_iters_epilogue > 0)
3624 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3625 NULL, NULL_TREE, 0, vect_epilogue);
3626 }
3627
3628 stmt_info_for_cost *si;
3629 int j;
3630 if (peel_iters_prologue)
3631 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3632 retval += record_stmt_cost (prologue_cost_vec,
3633 si->count * peel_iters_prologue,
3634 si->kind, si->stmt_info, si->misalign,
3635 vect_prologue);
3636 if (*peel_iters_epilogue)
3637 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3638 retval += record_stmt_cost (epilogue_cost_vec,
3639 si->count * *peel_iters_epilogue,
3640 si->kind, si->stmt_info, si->misalign,
3641 vect_epilogue);
3642
3643 return retval;
3644 }
3645
3646 /* Function vect_estimate_min_profitable_iters
3647
3648 Return the number of iterations required for the vector version of the
3649 loop to be profitable relative to the cost of the scalar version of the
3650 loop.
3651
3652 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3653 of iterations for vectorization. -1 value means loop vectorization
3654 is not profitable. This returned value may be used for dynamic
3655 profitability check.
3656
3657 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3658 for static check against estimated number of iterations. */
3659
3660 static void
3661 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3662 int *ret_min_profitable_niters,
3663 int *ret_min_profitable_estimate)
3664 {
3665 int min_profitable_iters;
3666 int min_profitable_estimate;
3667 int peel_iters_prologue;
3668 int peel_iters_epilogue;
3669 unsigned vec_inside_cost = 0;
3670 int vec_outside_cost = 0;
3671 unsigned vec_prologue_cost = 0;
3672 unsigned vec_epilogue_cost = 0;
3673 int scalar_single_iter_cost = 0;
3674 int scalar_outside_cost = 0;
3675 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3676 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3677 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3678
3679 /* Cost model disabled. */
3680 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3681 {
3682 if (dump_enabled_p ())
3683 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3684 *ret_min_profitable_niters = 0;
3685 *ret_min_profitable_estimate = 0;
3686 return;
3687 }
3688
3689 /* Requires loop versioning tests to handle misalignment. */
3690 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3691 {
3692 /* FIXME: Make cost depend on complexity of individual check. */
3693 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3694 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3695 NULL, NULL_TREE, 0, vect_prologue);
3696 if (dump_enabled_p ())
3697 dump_printf (MSG_NOTE,
3698 "cost model: Adding cost of checks for loop "
3699 "versioning to treat misalignment.\n");
3700 }
3701
3702 /* Requires loop versioning with alias checks. */
3703 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3704 {
3705 /* FIXME: Make cost depend on complexity of individual check. */
3706 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3707 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3708 NULL, NULL_TREE, 0, vect_prologue);
3709 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3710 if (len)
3711 /* Count LEN - 1 ANDs and LEN comparisons. */
3712 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3713 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3714 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3715 if (len)
3716 {
3717 /* Count LEN - 1 ANDs and LEN comparisons. */
3718 unsigned int nstmts = len * 2 - 1;
3719 /* +1 for each bias that needs adding. */
3720 for (unsigned int i = 0; i < len; ++i)
3721 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3722 nstmts += 1;
3723 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3724 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3725 }
3726 if (dump_enabled_p ())
3727 dump_printf (MSG_NOTE,
3728 "cost model: Adding cost of checks for loop "
3729 "versioning aliasing.\n");
3730 }
3731
3732 /* Requires loop versioning with niter checks. */
3733 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3734 {
3735 /* FIXME: Make cost depend on complexity of individual check. */
3736 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3737 NULL, NULL_TREE, 0, vect_prologue);
3738 if (dump_enabled_p ())
3739 dump_printf (MSG_NOTE,
3740 "cost model: Adding cost of checks for loop "
3741 "versioning niters.\n");
3742 }
3743
3744 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3745 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3746 NULL, NULL_TREE, 0, vect_prologue);
3747
3748 /* Count statements in scalar loop. Using this as scalar cost for a single
3749 iteration for now.
3750
3751 TODO: Add outer loop support.
3752
3753 TODO: Consider assigning different costs to different scalar
3754 statements. */
3755
3756 scalar_single_iter_cost
3757 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3758
3759 /* Add additional cost for the peeled instructions in prologue and epilogue
3760 loop. (For fully-masked loops there will be no peeling.)
3761
3762 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3763 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3764
3765 TODO: Build an expression that represents peel_iters for prologue and
3766 epilogue to be used in a run-time test. */
3767
3768 bool prologue_need_br_taken_cost = false;
3769 bool prologue_need_br_not_taken_cost = false;
3770
3771 /* Calculate peel_iters_prologue. */
3772 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3773 peel_iters_prologue = 0;
3774 else if (npeel < 0)
3775 {
3776 peel_iters_prologue = assumed_vf / 2;
3777 if (dump_enabled_p ())
3778 dump_printf (MSG_NOTE, "cost model: "
3779 "prologue peel iters set to vf/2.\n");
3780
3781 /* If peeled iterations are unknown, count a taken branch and a not taken
3782 branch per peeled loop. Even if scalar loop iterations are known,
3783 vector iterations are not known since peeled prologue iterations are
3784 not known. Hence guards remain the same. */
3785 prologue_need_br_taken_cost = true;
3786 prologue_need_br_not_taken_cost = true;
3787 }
3788 else
3789 {
3790 peel_iters_prologue = npeel;
3791 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3792 /* If peeled iterations are known but the number of scalar loop
3793 iterations is unknown, count a taken branch per peeled loop. */
3794 prologue_need_br_taken_cost = true;
3795 }
3796
3797 bool epilogue_need_br_taken_cost = false;
3798 bool epilogue_need_br_not_taken_cost = false;
3799
3800 /* Calculate peel_iters_epilogue. */
3801 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3802 /* We need to peel exactly one iteration for gaps. */
3803 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3804 else if (npeel < 0)
3805 {
3806 /* If peeling for alignment is unknown, the loop bound of the main
3807 loop becomes unknown. */
3808 peel_iters_epilogue = assumed_vf / 2;
3809 if (dump_enabled_p ())
3810 dump_printf (MSG_NOTE, "cost model: "
3811 "epilogue peel iters set to vf/2 because "
3812 "peeling for alignment is unknown.\n");
3813
3814 /* See the same reason above in peel_iters_prologue calculation. */
3815 epilogue_need_br_taken_cost = true;
3816 epilogue_need_br_not_taken_cost = true;
3817 }
3818 else
3819 {
3820 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3821 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3822 /* If peeled iterations are known but the number of scalar loop
3823 iterations is unknown, count a taken branch per peeled loop. */
3824 epilogue_need_br_taken_cost = true;
3825 }
3826
3827 stmt_info_for_cost *si;
3828 int j;
3829 /* Add costs associated with peel_iters_prologue. */
3830 if (peel_iters_prologue)
3831 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3832 {
3833 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3834 si->count * peel_iters_prologue, si->kind,
3835 si->stmt_info, si->vectype, si->misalign,
3836 vect_prologue);
3837 }
3838
3839 /* Add costs associated with peel_iters_epilogue. */
3840 if (peel_iters_epilogue)
3841 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3842 {
3843 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3844 si->count * peel_iters_epilogue, si->kind,
3845 si->stmt_info, si->vectype, si->misalign,
3846 vect_epilogue);
3847 }
3848
3849 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3850
3851 if (prologue_need_br_taken_cost)
3852 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3853 NULL, NULL_TREE, 0, vect_prologue);
3854
3855 if (prologue_need_br_not_taken_cost)
3856 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3857 cond_branch_not_taken, NULL, NULL_TREE, 0,
3858 vect_prologue);
3859
3860 if (epilogue_need_br_taken_cost)
3861 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3862 NULL, NULL_TREE, 0, vect_epilogue);
3863
3864 if (epilogue_need_br_not_taken_cost)
3865 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3866 cond_branch_not_taken, NULL, NULL_TREE, 0,
3867 vect_epilogue);
3868
3869 /* Take care of special costs for rgroup controls of partial vectors. */
3870 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3871 {
3872 /* Calculate how many masks we need to generate. */
3873 unsigned int num_masks = 0;
3874 rgroup_controls *rgm;
3875 unsigned int num_vectors_m1;
3876 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3877 if (rgm->type)
3878 num_masks += num_vectors_m1 + 1;
3879 gcc_assert (num_masks > 0);
3880
3881 /* In the worst case, we need to generate each mask in the prologue
3882 and in the loop body. One of the loop body mask instructions
3883 replaces the comparison in the scalar loop, and since we don't
3884 count the scalar comparison against the scalar body, we shouldn't
3885 count that vector instruction against the vector body either.
3886
3887 Sometimes we can use unpacks instead of generating prologue
3888 masks and sometimes the prologue mask will fold to a constant,
3889 so the actual prologue cost might be smaller. However, it's
3890 simpler and safer to use the worst-case cost; if this ends up
3891 being the tie-breaker between vectorizing or not, then it's
3892 probably better not to vectorize. */
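 /* Worked example with made-up numbers: if the mask rgroups in use need
    one and two vectors per iteration respectively, NUM_MASKS is
    1 + 2 == 3; we then cost three mask computations in the prologue and
    two in the body, the third body mask standing in for the removed
    scalar comparison. */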
3893 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3894 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3895 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3896 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3897 }
3898 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3899 {
3900 /* Referring to the functions vect_set_loop_condition_partial_vectors
3901 and vect_set_loop_controls_directly, we need to generate each
3902 length in the prologue and in the loop body if required. Although
3903 there are some possible optimizations, we consider the worst case
3904 here. */
3905
3906 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3907 bool need_iterate_p
3908 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3909 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3910
3911 /* Calculate how many statements to be added. */
3912 unsigned int prologue_stmts = 0;
3913 unsigned int body_stmts = 0;
3914
3915 rgroup_controls *rgc;
3916 unsigned int num_vectors_m1;
3917 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3918 if (rgc->type)
3919 {
3920 /* May need one SHIFT for nitems_total computation. */
3921 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3922 if (nitems != 1 && !niters_known_p)
3923 prologue_stmts += 1;
3924
3925 /* May need one MAX and one MINUS for wrap around. */
3926 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3927 prologue_stmts += 2;
3928
3929 /* Need one MAX and one MINUS for each batch limit except for
3930 the first one. */
3931 prologue_stmts += num_vectors_m1 * 2;
3932
3933 unsigned int num_vectors = num_vectors_m1 + 1;
3934
3935 /* Need to set up lengths in prologue, only one MIN required
3936 for each since start index is zero. */
3937 prologue_stmts += num_vectors;
3938
3939 /* Each may need two MINs and one MINUS to update lengths in the body
3940 for the next iteration. */
3941 if (need_iterate_p)
3942 body_stmts += 3 * num_vectors;
3943 }
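 /* Worked example with made-up numbers: a single rgroup (NUM_VECTORS_M1
    == 0) with more than one scalar per iteration, unknown niters and a
    possibly wrapping IV contributes 1 (SHIFT) + 2 (MAX/MINUS) + 0 + 1
    (MIN) == 4 prologue statements and, when we need to iterate,
    3 * 1 == 3 body statements. */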
3944
3945 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3946 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3947 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3948 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
3949 }
3950
3951 /* FORNOW: The scalar outside cost is incremented in one of the
3952 following ways:
3953
3954 1. The vectorizer checks for alignment and aliasing and generates
3955 a condition that allows dynamic vectorization. A cost model
3956 check is ANDED with the versioning condition. Hence scalar code
3957 path now has the added cost of the versioning check.
3958
3959 if (cost > th & versioning_check)
3960 jmp to vector code
3961
3962 Hence run-time scalar is incremented by not-taken branch cost.
3963
3964 2. The vectorizer then checks if a prologue is required. If the
3965 cost model check was not done before during versioning, it has to
3966 be done before the prologue check.
3967
3968 if (cost <= th)
3969 prologue = scalar_iters
3970 if (prologue == 0)
3971 jmp to vector code
3972 else
3973 execute prologue
3974 if (prologue == num_iters)
3975 go to exit
3976
3977 Hence the run-time scalar cost is incremented by a taken branch,
3978 plus a not-taken branch, plus a taken branch cost.
3979
3980 3. The vectorizer then checks if an epilogue is required. If the
3981 cost model check was not done before during prologue check, it
3982 has to be done with the epilogue check.
3983
3984 if (prologue == 0)
3985 jmp to vector code
3986 else
3987 execute prologue
3988 if (prologue == num_iters)
3989 go to exit
3990 vector code:
3991 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3992 jmp to epilogue
3993
3994 Hence the run-time scalar cost should be incremented by 2 taken
3995 branches.
3996
3997 TODO: The back end may reorder the BBs differently and reverse
3998 conditions/branch directions. Change the estimates below to
3999 something more reasonable. */
4000
4001 /* If the number of iterations is known and we do not do versioning, we can
4002 decide whether to vectorize at compile time. Hence the scalar version
4003 does not carry cost model guard costs. */
4004 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4005 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4006 {
4007 /* Cost model check occurs at versioning. */
4008 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4009 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4010 else
4011 {
4012 /* Cost model check occurs at prologue generation. */
4013 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4014 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4015 + vect_get_stmt_cost (cond_branch_not_taken);
4016 /* Cost model check occurs at epilogue generation. */
4017 else
4018 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4019 }
4020 }
4021
4022 /* Complete the target-specific cost calculations. */
4023 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4024 &vec_inside_cost, &vec_epilogue_cost);
4025
4026 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4027
4028 /* Stash the costs so that we can compare two loop_vec_infos. */
4029 loop_vinfo->vec_inside_cost = vec_inside_cost;
4030 loop_vinfo->vec_outside_cost = vec_outside_cost;
4031
4032 if (dump_enabled_p ())
4033 {
4034 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4035 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4036 vec_inside_cost);
4037 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4038 vec_prologue_cost);
4039 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4040 vec_epilogue_cost);
4041 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4042 scalar_single_iter_cost);
4043 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4044 scalar_outside_cost);
4045 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4046 vec_outside_cost);
4047 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4048 peel_iters_prologue);
4049 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4050 peel_iters_epilogue);
4051 }
4052
4053 /* Calculate number of iterations required to make the vector version
4054 profitable, relative to the loop bodies only. The following condition
4055 must hold true:
4056 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4057 where
4058 SIC = scalar iteration cost, VIC = vector iteration cost,
4059 VOC = vector outside cost, VF = vectorization factor,
4060 NPEEL = prologue iterations + epilogue iterations,
4061 SOC = scalar outside cost for run time cost model check. */
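 /* As an illustration with made-up numbers: SIC = 4, VIC = 8, VF = 4,
    VOC = 20, SOC = 6 and NPEEL = 0 give the condition
      4 * niters + 6 > 8 * (niters / 4) + 20
    which simplifies to 2 * niters > 14, i.e. the vector version starts
    to win at niters == 8. */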
4062
4063 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4064 - vec_inside_cost);
4065 if (saving_per_viter <= 0)
4066 {
4067 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4068 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4069 "vectorization did not happen for a simd loop");
4070
4071 if (dump_enabled_p ())
4072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4073 "cost model: the vector iteration cost = %d "
4074 "divided by the scalar iteration cost = %d "
4075 "is greater or equal to the vectorization factor = %d"
4076 ".\n",
4077 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4078 *ret_min_profitable_niters = -1;
4079 *ret_min_profitable_estimate = -1;
4080 return;
4081 }
4082
4083 /* ??? The "if" arm is written to handle all cases; see below for what
4084 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4085 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4086 {
4087 /* Rewriting the condition above in terms of the number of
4088 vector iterations (vniters) rather than the number of
4089 scalar iterations (niters) gives:
4090
4091 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4092
4093 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4094
4095 For integer N, X and Y when X > 0:
4096
4097 N * X > Y <==> N >= (Y /[floor] X) + 1. */
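 /* E.g. with made-up values X = 3 and Y = 10: N * 3 > 10 holds exactly
    when N >= 10/3 + 1 == 4. */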
4098 int outside_overhead = (vec_outside_cost
4099 - scalar_single_iter_cost * peel_iters_prologue
4100 - scalar_single_iter_cost * peel_iters_epilogue
4101 - scalar_outside_cost);
4102 /* We're only interested in cases that require at least one
4103 vector iteration. */
4104 int min_vec_niters = 1;
4105 if (outside_overhead > 0)
4106 min_vec_niters = outside_overhead / saving_per_viter + 1;
4107
4108 if (dump_enabled_p ())
4109 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4110 min_vec_niters);
4111
4112 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4113 {
4114 /* Now that we know the minimum number of vector iterations,
4115 find the minimum niters for which the scalar cost is larger:
4116
4117 SIC * niters > VIC * vniters + VOC - SOC
4118
4119 We know that the minimum niters is no more than
4120 vniters * VF + NPEEL, but it might be (and often is) less
4121 than that if a partial vector iteration is cheaper than the
4122 equivalent scalar code. */
4123 int threshold = (vec_inside_cost * min_vec_niters
4124 + vec_outside_cost
4125 - scalar_outside_cost);
4126 if (threshold <= 0)
4127 min_profitable_iters = 1;
4128 else
4129 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4130 }
4131 else
4132 /* Convert the number of vector iterations into a number of
4133 scalar iterations. */
4134 min_profitable_iters = (min_vec_niters * assumed_vf
4135 + peel_iters_prologue
4136 + peel_iters_epilogue);
4137 }
4138 else
4139 {
4140 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4141 * assumed_vf
4142 - vec_inside_cost * peel_iters_prologue
4143 - vec_inside_cost * peel_iters_epilogue);
4144 if (min_profitable_iters <= 0)
4145 min_profitable_iters = 0;
4146 else
4147 {
4148 min_profitable_iters /= saving_per_viter;
4149
4150 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4151 <= (((int) vec_inside_cost * min_profitable_iters)
4152 + (((int) vec_outside_cost - scalar_outside_cost)
4153 * assumed_vf)))
4154 min_profitable_iters++;
4155 }
4156 }
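 /* Worked example with made-up numbers: SIC = 4, VIC = 8, VF = 4,
    VOC = 24, SOC = 0 and no peeling give saving_per_viter == 8 and an
    initial min_profitable_iters of 24 * 4 / 8 == 12; the check above
    then sees 4 * 4 * 12 == 192 <= 8 * 12 + 24 * 4 == 192 and bumps the
    result to 13 so that the vector version strictly wins. */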
4157
4158 if (dump_enabled_p ())
4159 dump_printf (MSG_NOTE,
4160 " Calculated minimum iters for profitability: %d\n",
4161 min_profitable_iters);
4162
4163 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4164 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4165 /* We want the vectorized loop to execute at least once. */
4166 min_profitable_iters = assumed_vf + peel_iters_prologue;
4167 else if (min_profitable_iters < peel_iters_prologue)
4168 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4169 vectorized loop executes at least once. */
4170 min_profitable_iters = peel_iters_prologue;
4171
4172 if (dump_enabled_p ())
4173 dump_printf_loc (MSG_NOTE, vect_location,
4174 " Runtime profitability threshold = %d\n",
4175 min_profitable_iters);
4176
4177 *ret_min_profitable_niters = min_profitable_iters;
4178
4179 /* Calculate number of iterations required to make the vector version
4180 profitable, relative to the loop bodies only.
4181
4182 The non-vectorized variant costs SIC * niters and it must win over the
4183 vector variant on the expected loop trip count. The following condition must hold true:
4184 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
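 /* With the same made-up numbers as in the illustration above (SIC = 4,
    VIC = 8, VF = 4, VOC = 20, SOC = 6, NPEEL = 0) this reads
    4 * niters > 2 * niters + 26, so the static estimate requires at
    least 14 iterations. */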
4185
4186 if (vec_outside_cost <= 0)
4187 min_profitable_estimate = 0;
4188 /* ??? This "else if" arm is written to handle all cases; see below for
4189 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4190 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4191 {
4192 /* This is a repeat of the code above, but with + SOC rather
4193 than - SOC. */
4194 int outside_overhead = (vec_outside_cost
4195 - scalar_single_iter_cost * peel_iters_prologue
4196 - scalar_single_iter_cost * peel_iters_epilogue
4197 + scalar_outside_cost);
4198 int min_vec_niters = 1;
4199 if (outside_overhead > 0)
4200 min_vec_niters = outside_overhead / saving_per_viter + 1;
4201
4202 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4203 {
4204 int threshold = (vec_inside_cost * min_vec_niters
4205 + vec_outside_cost
4206 + scalar_outside_cost);
4207 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4208 }
4209 else
4210 min_profitable_estimate = (min_vec_niters * assumed_vf
4211 + peel_iters_prologue
4212 + peel_iters_epilogue);
4213 }
4214 else
4215 {
4216 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4217 * assumed_vf
4218 - vec_inside_cost * peel_iters_prologue
4219 - vec_inside_cost * peel_iters_epilogue)
4220 / ((scalar_single_iter_cost * assumed_vf)
4221 - vec_inside_cost);
4222 }
4223 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4224 if (dump_enabled_p ())
4225 dump_printf_loc (MSG_NOTE, vect_location,
4226 " Static estimate profitability threshold = %d\n",
4227 min_profitable_estimate);
4228
4229 *ret_min_profitable_estimate = min_profitable_estimate;
4230 }
4231
4232 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4233 vector elements (not bits) for a vector with NELT elements. */
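/* For example, with OFFSET == 2 and NELT == 8 the three encoded elements
   {2, 3, 4} describe the stepped series 2, 3, ..., 9, i.e. a shift down by
   two elements, with the two trailing lanes taken from the second vector
   operand of the permutation. */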
4234 static void
4235 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4236 vec_perm_builder *sel)
4237 {
4238 /* The encoding is a single stepped pattern. Any wrap-around is handled
4239 by vec_perm_indices. */
4240 sel->new_vector (nelt, 1, 3);
4241 for (unsigned int i = 0; i < 3; i++)
4242 sel->quick_push (i + offset);
4243 }
4244
4245 /* Checks whether the target supports whole-vector shifts for vectors of mode
4246 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4247 it supports vec_perm_const with masks for all necessary shift amounts. */
4248 static bool
4249 have_whole_vector_shift (machine_mode mode)
4250 {
4251 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4252 return true;
4253
4254 /* Variable-length vectors should be handled via the optab. */
4255 unsigned int nelt;
4256 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4257 return false;
4258
4259 vec_perm_builder sel;
4260 vec_perm_indices indices;
4261 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4262 {
4263 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4264 indices.new_vector (sel, 2, nelt);
4265 if (!can_vec_perm_const_p (mode, indices, false))
4266 return false;
4267 }
4268 return true;
4269 }
4270
4271 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4272 functions. A better design is needed to avoid maintenance issues. */
4273
4274 /* Function vect_model_reduction_cost.
4275
4276 Models cost for a reduction operation, including the vector ops
4277 generated within the strip-mine loop, the initial definition before
4278 the loop, and the epilogue code that must be generated. */
4279
4280 static void
4281 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4282 stmt_vec_info stmt_info, internal_fn reduc_fn,
4283 vect_reduction_type reduction_type,
4284 int ncopies, stmt_vector_for_cost *cost_vec)
4285 {
4286 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4287 enum tree_code code;
4288 optab optab;
4289 tree vectype;
4290 machine_mode mode;
4291 class loop *loop = NULL;
4292
4293 if (loop_vinfo)
4294 loop = LOOP_VINFO_LOOP (loop_vinfo);
4295
4296 /* Condition reductions generate two reductions in the loop. */
4297 if (reduction_type == COND_REDUCTION)
4298 ncopies *= 2;
4299
4300 vectype = STMT_VINFO_VECTYPE (stmt_info);
4301 mode = TYPE_MODE (vectype);
4302 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4303
4304 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4305
4306 if (reduction_type == EXTRACT_LAST_REDUCTION)
4307 /* No extra instructions are needed in the prologue. The loop body
4308 operations are costed in vectorizable_condition. */
4309 inside_cost = 0;
4310 else if (reduction_type == FOLD_LEFT_REDUCTION)
4311 {
4312 /* No extra instructions needed in the prologue. */
4313 prologue_cost = 0;
4314
4315 if (reduc_fn != IFN_LAST)
4316 /* Count one reduction-like operation per vector. */
4317 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4318 stmt_info, 0, vect_body);
4319 else
4320 {
4321 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4322 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4323 inside_cost = record_stmt_cost (cost_vec, nelements,
4324 vec_to_scalar, stmt_info, 0,
4325 vect_body);
4326 inside_cost += record_stmt_cost (cost_vec, nelements,
4327 scalar_stmt, stmt_info, 0,
4328 vect_body);
4329 }
4330 }
4331 else
4332 {
4333 /* Add in cost for initial definition.
4334 For cond reduction we have four vectors: initial index, step,
4335 initial result of the data reduction, initial value of the index
4336 reduction. */
4337 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4338 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4339 scalar_to_vec, stmt_info, 0,
4340 vect_prologue);
4341
4342 /* Cost of reduction op inside loop. */
4343 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4344 stmt_info, 0, vect_body);
4345 }
4346
4347 /* Determine cost of epilogue code.
4348
4349 We have a reduction operator that will reduce the vector in one statement.
4350 Also requires scalar extract. */
4351
4352 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4353 {
4354 if (reduc_fn != IFN_LAST)
4355 {
4356 if (reduction_type == COND_REDUCTION)
4357 {
4358 /* An EQ stmt and a COND_EXPR stmt. */
4359 epilogue_cost += record_stmt_cost (cost_vec, 2,
4360 vector_stmt, stmt_info, 0,
4361 vect_epilogue);
4362 /* Reduction of the max index and a reduction of the found
4363 values. */
4364 epilogue_cost += record_stmt_cost (cost_vec, 2,
4365 vec_to_scalar, stmt_info, 0,
4366 vect_epilogue);
4367 /* A broadcast of the max value. */
4368 epilogue_cost += record_stmt_cost (cost_vec, 1,
4369 scalar_to_vec, stmt_info, 0,
4370 vect_epilogue);
4371 }
4372 else
4373 {
4374 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4375 stmt_info, 0, vect_epilogue);
4376 epilogue_cost += record_stmt_cost (cost_vec, 1,
4377 vec_to_scalar, stmt_info, 0,
4378 vect_epilogue);
4379 }
4380 }
4381 else if (reduction_type == COND_REDUCTION)
4382 {
4383 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4384 /* Extraction of scalar elements. */
4385 epilogue_cost += record_stmt_cost (cost_vec,
4386 2 * estimated_nunits,
4387 vec_to_scalar, stmt_info, 0,
4388 vect_epilogue);
4389 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4390 epilogue_cost += record_stmt_cost (cost_vec,
4391 2 * estimated_nunits - 3,
4392 scalar_stmt, stmt_info, 0,
4393 vect_epilogue);
4394 }
4395 else if (reduction_type == EXTRACT_LAST_REDUCTION
4396 || reduction_type == FOLD_LEFT_REDUCTION)
4397 /* No extra instructions are needed in the epilogue. */
4398 ;
4399 else
4400 {
4401 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4402 tree bitsize =
4403 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4404 int element_bitsize = tree_to_uhwi (bitsize);
4405 int nelements = vec_size_in_bits / element_bitsize;
4406
4407 if (code == COND_EXPR)
4408 code = MAX_EXPR;
4409
4410 optab = optab_for_tree_code (code, vectype, optab_default);
4411
4412 /* We have a whole vector shift available. */
4413 if (optab != unknown_optab
4414 && VECTOR_MODE_P (mode)
4415 && optab_handler (optab, mode) != CODE_FOR_nothing
4416 && have_whole_vector_shift (mode))
4417 {
4418 /* Final reduction via vector shifts and the reduction operator.
4419 Also requires scalar extract. */
4420 epilogue_cost += record_stmt_cost (cost_vec,
4421 exact_log2 (nelements) * 2,
4422 vector_stmt, stmt_info, 0,
4423 vect_epilogue);
4424 epilogue_cost += record_stmt_cost (cost_vec, 1,
4425 vec_to_scalar, stmt_info, 0,
4426 vect_epilogue);
4427 }
4428 else
4429 /* Use extracts and reduction op for final reduction. For N
4430 elements, we have N extracts and N-1 reduction ops. */
4431 epilogue_cost += record_stmt_cost (cost_vec,
4432 nelements + nelements - 1,
4433 vector_stmt, stmt_info, 0,
4434 vect_epilogue);
4435 }
4436 }
4437
4438 if (dump_enabled_p ())
4439 dump_printf (MSG_NOTE,
4440 "vect_model_reduction_cost: inside_cost = %d, "
4441 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4442 prologue_cost, epilogue_cost);
4443 }
4444
4445
4446
4447 /* Function get_initial_def_for_reduction
4448
4449 Input:
4450 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4451 INIT_VAL - the initial value of the reduction variable
4452
4453 Output:
4454 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4455 of the reduction (used for adjusting the epilog - see below).
4456 Return a vector variable, initialized according to the operation that
4457 STMT_VINFO performs. This vector will be used as the initial value
4458 of the vector of partial results.
4459
4460 Option1 (adjust in epilog): Initialize the vector as follows:
4461 add/bit or/xor: [0,0,...,0,0]
4462 mult/bit and: [1,1,...,1,1]
4463 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4464 and when necessary (e.g. add/mult case) let the caller know
4465 that it needs to adjust the result by init_val.
4466
4467 Option2: Initialize the vector as follows:
4468 add/bit or/xor: [init_val,0,0,...,0]
4469 mult/bit and: [init_val,1,1,...,1]
4470 min/max/cond_expr: [init_val,init_val,...,init_val]
4471 and no adjustments are needed.
4472
4473 For example, for the following code:
4474
4475 s = init_val;
4476 for (i=0;i<n;i++)
4477 s = s + a[i];
4478
4479 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4480 For a vector of 4 units, we want to return either [0,0,0,init_val],
4481 or [0,0,0,0] and let the caller know that it needs to adjust
4482 the result at the end by 'init_val'.
4483
4484 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4485 is not NULL, because this way the initialization vector is simpler (same
4486 element in all entries), and Option2 otherwise.
4487
4488 A cost model should help decide between these two schemes. */
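/* As another illustration, for a product reduction (MULT_EXPR) and a vector
   of 4 units, Option1 returns [1,1,1,1] and reports an adjustment by
   INIT_VAL (when INIT_VAL is not already 1), while Option2 returns
   [init_val,1,1,1]. */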
4489
4490 static tree
4491 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4492 stmt_vec_info stmt_vinfo,
4493 enum tree_code code, tree init_val,
4494 tree *adjustment_def)
4495 {
4496 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4497 tree scalar_type = TREE_TYPE (init_val);
4498 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4499 tree def_for_init;
4500 tree init_def;
4501 REAL_VALUE_TYPE real_init_val = dconst0;
4502 int int_init_val = 0;
4503 gimple_seq stmts = NULL;
4504
4505 gcc_assert (vectype);
4506
4507 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4508 || SCALAR_FLOAT_TYPE_P (scalar_type));
4509
4510 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4511 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4512
4513 /* ADJUSTMENT_DEF is NULL when called from
4514 vect_create_epilog_for_reduction to vectorize double reduction. */
4515 if (adjustment_def)
4516 *adjustment_def = NULL;
4517
4518 switch (code)
4519 {
4520 case WIDEN_SUM_EXPR:
4521 case DOT_PROD_EXPR:
4522 case SAD_EXPR:
4523 case PLUS_EXPR:
4524 case MINUS_EXPR:
4525 case BIT_IOR_EXPR:
4526 case BIT_XOR_EXPR:
4527 case MULT_EXPR:
4528 case BIT_AND_EXPR:
4529 {
4530 if (code == MULT_EXPR)
4531 {
4532 real_init_val = dconst1;
4533 int_init_val = 1;
4534 }
4535
4536 if (code == BIT_AND_EXPR)
4537 int_init_val = -1;
4538
4539 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4540 def_for_init = build_real (scalar_type, real_init_val);
4541 else
4542 def_for_init = build_int_cst (scalar_type, int_init_val);
4543
4544 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4545 {
4546 /* Option1: the first element is '0' or '1' as well. */
4547 if (!operand_equal_p (def_for_init, init_val, 0))
4548 *adjustment_def = init_val;
4549 init_def = gimple_build_vector_from_val (&stmts, vectype,
4550 def_for_init);
4551 }
4552 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4553 {
4554 /* Option2 (variable length): the first element is INIT_VAL. */
4555 init_def = gimple_build_vector_from_val (&stmts, vectype,
4556 def_for_init);
4557 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4558 vectype, init_def, init_val);
4559 }
4560 else
4561 {
4562 /* Option2: the first element is INIT_VAL. */
4563 tree_vector_builder elts (vectype, 1, 2);
4564 elts.quick_push (init_val);
4565 elts.quick_push (def_for_init);
4566 init_def = gimple_build_vector (&stmts, &elts);
4567 }
4568 }
4569 break;
4570
4571 case MIN_EXPR:
4572 case MAX_EXPR:
4573 case COND_EXPR:
4574 {
4575 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4576 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4577 }
4578 break;
4579
4580 default:
4581 gcc_unreachable ();
4582 }
4583
4584 if (stmts)
4585 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4586 return init_def;
4587 }
4588
4589 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4590 NUMBER_OF_VECTORS is the number of vector defs to create.
4591 If NEUTRAL_OP is nonnull, introducing extra elements of that
4592 value will not change the result. */
4593
4594 static void
4595 get_initial_defs_for_reduction (vec_info *vinfo,
4596 slp_tree slp_node,
4597 vec<tree> *vec_oprnds,
4598 unsigned int number_of_vectors,
4599 bool reduc_chain, tree neutral_op)
4600 {
4601 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4602 stmt_vec_info stmt_vinfo = stmts[0];
4603 unsigned HOST_WIDE_INT nunits;
4604 unsigned j, number_of_places_left_in_vector;
4605 tree vector_type;
4606 unsigned int group_size = stmts.length ();
4607 unsigned int i;
4608 class loop *loop;
4609
4610 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4611
4612 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4613
4614 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4615 gcc_assert (loop);
4616 edge pe = loop_preheader_edge (loop);
4617
4618 gcc_assert (!reduc_chain || neutral_op);
4619
4620 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4621 created vectors. It is greater than 1 if unrolling is performed.
4622
4623 For example, we have two scalar operands, s1 and s2 (e.g., group of
4624 strided accesses of size two), while NUNITS is four (i.e., four scalars
4625 of this type can be packed in a vector). The output vector will contain
4626 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4627 will be 2).
4628
4629 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4630 vectors containing the operands.
4631
4632 For example, NUNITS is four as before, and the group size is 8
4633 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4634 {s5, s6, s7, s8}. */
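 /* For illustration, when NEUTRAL_OP is nonnull: with a group of two PLUS
    reductions s1 and s2 and a four-lane vector, the single initial vector
    built below is {init(s1), init(s2), 0, 0}, the trailing lanes being
    filled with the neutral value 0. */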
4635
4636 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4637 nunits = group_size;
4638
4639 number_of_places_left_in_vector = nunits;
4640 bool constant_p = true;
4641 tree_vector_builder elts (vector_type, nunits, 1);
4642 elts.quick_grow (nunits);
4643 gimple_seq ctor_seq = NULL;
4644 for (j = 0; j < nunits * number_of_vectors; ++j)
4645 {
4646 tree op;
4647 i = j % group_size;
4648 stmt_vinfo = stmts[i];
4649
4650 /* Get the def before the loop. In a reduction chain we have only one
4651 initial value; otherwise we have as many initial values as PHIs in the group. */
4652 if (reduc_chain)
4653 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4654 else if (((vec_oprnds->length () + 1) * nunits
4655 - number_of_places_left_in_vector >= group_size)
4656 && neutral_op)
4657 op = neutral_op;
4658 else
4659 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4660
4661 /* Create 'vect_ = {op0,op1,...,opn}'. */
4662 number_of_places_left_in_vector--;
4663 elts[nunits - number_of_places_left_in_vector - 1] = op;
4664 if (!CONSTANT_CLASS_P (op))
4665 constant_p = false;
4666
4667 if (number_of_places_left_in_vector == 0)
4668 {
4669 tree init;
4670 if (constant_p && !neutral_op
4671 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4672 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4673 /* Build the vector directly from ELTS. */
4674 init = gimple_build_vector (&ctor_seq, &elts);
4675 else if (neutral_op)
4676 {
4677 /* Build a vector of the neutral value and shift the
4678 other elements into place. */
4679 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4680 neutral_op);
4681 int k = nunits;
4682 while (k > 0 && elts[k - 1] == neutral_op)
4683 k -= 1;
4684 while (k > 0)
4685 {
4686 k -= 1;
4687 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4688 vector_type, init, elts[k]);
4689 }
4690 }
4691 else
4692 {
4693 /* First time round, duplicate ELTS to fill the
4694 required number of vectors. */
4695 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4696 number_of_vectors, *vec_oprnds);
4697 break;
4698 }
4699 vec_oprnds->quick_push (init);
4700
4701 number_of_places_left_in_vector = nunits;
4702 elts.new_vector (vector_type, nunits, 1);
4703 elts.quick_grow (nunits);
4704 constant_p = true;
4705 }
4706 }
4707 if (ctor_seq != NULL)
4708 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4709 }
4710
4711 /* For a statement STMT_INFO taking part in a reduction operation return
4712 the stmt_vec_info the meta information is stored on. */
4713
4714 stmt_vec_info
4715 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4716 {
4717 stmt_info = vect_orig_stmt (stmt_info);
4718 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4719 if (!is_a <gphi *> (stmt_info->stmt)
4720 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4721 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4722 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4723 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4724 {
4725 if (gimple_phi_num_args (phi) == 1)
4726 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4727 }
4728 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4729 {
4730 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4731 stmt_vec_info info
4732 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4733 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4734 stmt_info = info;
4735 }
4736 return stmt_info;
4737 }
4738
4739 /* Function vect_create_epilog_for_reduction
4740
4741 Create code at the loop-epilog to finalize the result of a reduction
4742 computation.
4743
4744 STMT_INFO is the scalar reduction stmt that is being vectorized.
4745 SLP_NODE is an SLP node containing a group of reduction statements. The
4746 first one in this group is STMT_INFO.
4747 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4748 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4749 (counting from 0)
4750
4751 This function:
4752 1. Completes the reduction def-use cycles.
4753 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4754 by calling the function specified by REDUC_FN if available, or by
4755 other means (whole-vector shifts or a scalar loop).
4756 The function also creates a new phi node at the loop exit to preserve
4757 loop-closed form, as illustrated below.
4758
4759 The flow at the entry to this function:
4760
4761 loop:
4762 vec_def = phi <vec_init, null> # REDUCTION_PHI
4763 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4764 s_loop = scalar_stmt # (scalar) STMT_INFO
4765 loop_exit:
4766 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4767 use <s_out0>
4768 use <s_out0>
4769
4770 The above is transformed by this function into:
4771
4772 loop:
4773 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4774 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4775 s_loop = scalar_stmt # (scalar) STMT_INFO
4776 loop_exit:
4777 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4778 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4779 v_out2 = reduce <v_out1>
4780 s_out3 = extract_field <v_out2, 0>
4781 s_out4 = adjust_result <s_out3>
4782 use <s_out4>
4783 use <s_out4>
4784 */
4785
4786 static void
4787 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4788 stmt_vec_info stmt_info,
4789 slp_tree slp_node,
4790 slp_instance slp_node_instance)
4791 {
4792 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4793 gcc_assert (reduc_info->is_reduc_info);
4794 /* For double reductions we need to get at the inner loop reduction
4795 stmt which has the meta info attached. Our stmt_info is that of the
4796 loop-closed PHI of the inner loop which we remember as
4797 def for the reduction PHI generation. */
4798 bool double_reduc = false;
4799 stmt_vec_info rdef_info = stmt_info;
4800 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4801 {
4802 gcc_assert (!slp_node);
4803 double_reduc = true;
4804 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4805 (stmt_info->stmt, 0));
4806 stmt_info = vect_stmt_to_vectorize (stmt_info);
4807 }
4808 gphi *reduc_def_stmt
4809 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4810 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4811 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4812 tree vectype;
4813 machine_mode mode;
4814 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4815 basic_block exit_bb;
4816 tree scalar_dest;
4817 tree scalar_type;
4818 gimple *new_phi = NULL, *phi;
4819 gimple_stmt_iterator exit_gsi;
4820 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4821 gimple *epilog_stmt = NULL;
4822 gimple *exit_phi;
4823 tree bitsize;
4824 tree def;
4825 tree orig_name, scalar_result;
4826 imm_use_iterator imm_iter, phi_imm_iter;
4827 use_operand_p use_p, phi_use_p;
4828 gimple *use_stmt;
4829 bool nested_in_vect_loop = false;
4830 auto_vec<gimple *> new_phis;
4831 int j, i;
4832 auto_vec<tree> scalar_results;
4833 unsigned int group_size = 1, k;
4834 auto_vec<gimple *> phis;
4835 bool slp_reduc = false;
4836 bool direct_slp_reduc;
4837 tree new_phi_result;
4838 tree induction_index = NULL_TREE;
4839
4840 if (slp_node)
4841 group_size = SLP_TREE_LANES (slp_node);
4842
4843 if (nested_in_vect_loop_p (loop, stmt_info))
4844 {
4845 outer_loop = loop;
4846 loop = loop->inner;
4847 nested_in_vect_loop = true;
4848 gcc_assert (!slp_node);
4849 }
4850 gcc_assert (!nested_in_vect_loop || double_reduc);
4851
4852 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4853 gcc_assert (vectype);
4854 mode = TYPE_MODE (vectype);
4855
4856 tree initial_def = NULL;
4857 tree induc_val = NULL_TREE;
4858 tree adjustment_def = NULL;
4859 if (slp_node)
4860 ;
4861 else
4862 {
4863 /* Get at the scalar def before the loop, that defines the initial value
4864 of the reduction variable. */
4865 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4866 loop_preheader_edge (loop));
4867 /* Optimize: for induction condition reduction, if we can't use zero
4868 for induc_val, use initial_def. */
4869 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4870 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4871 else if (double_reduc)
4872 ;
4873 else if (nested_in_vect_loop)
4874 ;
4875 else
4876 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4877 }
4878
4879 unsigned vec_num;
4880 int ncopies;
4881 if (slp_node)
4882 {
4883 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4884 ncopies = 1;
4885 }
4886 else
4887 {
4888 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4889 vec_num = 1;
4890 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4891 }
4892
4893 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4894 which is updated with the current index of the loop for every match of
4895 the original loop's cond_expr (VEC_STMT). This results in a vector
4896 containing the last time the condition passed for that vector lane.
4897 The first match will be a 1 to allow 0 to be used for non-matching
4898 indexes. If there are no matches at all then the vector will be all
4899 zeroes.
4900
4901 PR92772: This algorithm is broken for architectures that support
4902 masked vectors, but do not provide fold_extract_last. */
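/* For example (purely illustrative): with four lanes the index IV holds
   {1,2,3,4} in the first vector iteration and {5,6,7,8} in the second;
   a lane whose condition matched in both iterations records the value
   from the second one, while a lane that never matched keeps the
   initial 0. */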
4903 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4904 {
4905 auto_vec<std::pair<tree, bool>, 2> ccompares;
4906 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4907 cond_info = vect_stmt_to_vectorize (cond_info);
4908 while (cond_info != reduc_info)
4909 {
4910 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4911 {
4912 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4913 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4914 ccompares.safe_push
4915 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4916 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4917 }
4918 cond_info
4919 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4920 1 + STMT_VINFO_REDUC_IDX
4921 (cond_info)));
4922 cond_info = vect_stmt_to_vectorize (cond_info);
4923 }
4924 gcc_assert (ccompares.length () != 0);
4925
4926 tree indx_before_incr, indx_after_incr;
4927 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4928 int scalar_precision
4929 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4930 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4931 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4932 (TYPE_MODE (vectype), cr_index_scalar_type,
4933 TYPE_VECTOR_SUBPARTS (vectype));
4934
4935 /* First we create a simple vector induction variable which starts
4936 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4937 vector size (STEP). */
4938
4939 /* Create a {1,2,3,...} vector. */
4940 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4941
4942 /* Create a vector of the step value. */
4943 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4944 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4945
4946 /* Create an induction variable. */
4947 gimple_stmt_iterator incr_gsi;
4948 bool insert_after;
4949 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4950 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4951 insert_after, &indx_before_incr, &indx_after_incr);
4952
4953 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4954 filled with zeros (VEC_ZERO). */
4955
4956 /* Create a vector of 0s. */
4957 tree zero = build_zero_cst (cr_index_scalar_type);
4958 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4959
4960 /* Create a vector phi node. */
4961 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4962 new_phi = create_phi_node (new_phi_tree, loop->header);
4963 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4964 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4965
4966 /* Now take the condition from the loop's original cond_exprs
4967 and produce new cond_exprs (INDEX_COND_EXPR) which for
4968 every match uses values from the induction variable
4969 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4970 (NEW_PHI_TREE).
4971 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4972 the new cond_expr (INDEX_COND_EXPR). */
4973 gimple_seq stmts = NULL;
4974 for (int i = ccompares.length () - 1; i != -1; --i)
4975 {
4976 tree ccompare = ccompares[i].first;
4977 if (ccompares[i].second)
4978 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4979 cr_index_vector_type,
4980 ccompare,
4981 indx_before_incr, new_phi_tree);
4982 else
4983 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4984 cr_index_vector_type,
4985 ccompare,
4986 new_phi_tree, indx_before_incr);
4987 }
4988 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4989
4990 /* Update the phi with the vec cond. */
4991 induction_index = new_phi_tree;
4992 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4993 loop_latch_edge (loop), UNKNOWN_LOCATION);
4994 }
4995
4996 /* 2. Create epilog code.
4997 The reduction epilog code operates across the elements of the vector
4998 of partial results computed by the vectorized loop.
4999 The reduction epilog code consists of:
5000
5001 step 1: compute the scalar result in a vector (v_out2)
5002 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5003 step 3: adjust the scalar result (s_out3) if needed.
5004
5005 Step 1 can be accomplished using one of the following three schemes:
5006 (scheme 1) using reduc_fn, if available.
5007 (scheme 2) using whole-vector shifts, if available.
5008 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5009 combined.
5010
5011 The overall epilog code looks like this:
5012
5013 s_out0 = phi <s_loop> # original EXIT_PHI
5014 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5015 v_out2 = reduce <v_out1> # step 1
5016 s_out3 = extract_field <v_out2, 0> # step 2
5017 s_out4 = adjust_result <s_out3> # step 3
5018
5019 (step 3 is optional, and steps 1 and 2 may be combined).
5020 Lastly, the uses of s_out0 are replaced by s_out4. */
5021
5022
5023 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5024 v_out1 = phi <VECT_DEF>
5025 Store them in NEW_PHIS. */
5026 if (double_reduc)
5027 loop = outer_loop;
5028 exit_bb = single_exit (loop)->dest;
5029 new_phis.create (slp_node ? vec_num : ncopies);
5030 for (unsigned i = 0; i < vec_num; i++)
5031 {
5032 if (slp_node)
5033 def = vect_get_slp_vect_def (slp_node, i);
5034 else
5035 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5036 for (j = 0; j < ncopies; j++)
5037 {
5038 tree new_def = copy_ssa_name (def);
5039 phi = create_phi_node (new_def, exit_bb);
5040 if (j == 0)
5041 new_phis.quick_push (phi);
5042 else
5043 {
5044 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5045 new_phis.quick_push (phi);
5046 }
5047
5048 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5049 }
5050 }
5051
5052 exit_gsi = gsi_after_labels (exit_bb);
5053
5054 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5055 (i.e. when reduc_fn is not available) and in the final adjustment
5056 code (if needed). Also get the original scalar reduction variable as
5057 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5058 represents a reduction pattern), the tree-code and scalar-def are
5059 taken from the original stmt that the pattern-stmt (STMT) replaces.
5060 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5061 are taken from STMT. */
5062
5063 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5064 if (orig_stmt_info != stmt_info)
5065 {
5066 /* Reduction pattern */
5067 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5068 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5069 }
5070
5071 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5072 scalar_type = TREE_TYPE (scalar_dest);
5073 scalar_results.create (group_size);
5074 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5075 bitsize = TYPE_SIZE (scalar_type);
5076
5077 /* SLP reduction without reduction chain, e.g.,
5078 # a1 = phi <a2, a0>
5079 # b1 = phi <b2, b0>
5080 a2 = operation (a1)
5081 b2 = operation (b1) */
5082 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5083
5084 /* True if we should implement SLP_REDUC using native reduction operations
5085 instead of scalar operations. */
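/* Note this only triggers for variable-length vectors (e.g. SVE), where
   TYPE_VECTOR_SUBPARTS is not a compile-time constant and the
   element-by-element extraction scheme below cannot be used. */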
5086 direct_slp_reduc = (reduc_fn != IFN_LAST
5087 && slp_reduc
5088 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5089
5090 /* In case of reduction chain, e.g.,
5091 # a1 = phi <a3, a0>
5092 a2 = operation (a1)
5093 a3 = operation (a2),
5094
5095 we may end up with more than one vector result. Here we reduce them to
5096 one vector. */
5097 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5098 {
5099 gimple_seq stmts = NULL;
5100 tree first_vect = PHI_RESULT (new_phis[0]);
5101 first_vect = gimple_convert (&stmts, vectype, first_vect);
5102 for (k = 1; k < new_phis.length (); k++)
5103 {
5104 gimple *next_phi = new_phis[k];
5105 tree second_vect = PHI_RESULT (next_phi);
5106 second_vect = gimple_convert (&stmts, vectype, second_vect);
5107 first_vect = gimple_build (&stmts, code, vectype,
5108 first_vect, second_vect);
5109 }
5110 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5111
5112 new_phi_result = first_vect;
5113 new_phis.truncate (0);
5114 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5115 }
5116 /* Likewise if we couldn't use a single defuse cycle. */
5117 else if (ncopies > 1)
5118 {
5119 gimple_seq stmts = NULL;
5120 tree first_vect = PHI_RESULT (new_phis[0]);
5121 first_vect = gimple_convert (&stmts, vectype, first_vect);
5122 for (int k = 1; k < ncopies; ++k)
5123 {
5124 tree second_vect = PHI_RESULT (new_phis[k]);
5125 second_vect = gimple_convert (&stmts, vectype, second_vect);
5126 first_vect = gimple_build (&stmts, code, vectype,
5127 first_vect, second_vect);
5128 }
5129 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5130 new_phi_result = first_vect;
5131 new_phis.truncate (0);
5132 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5133 }
5134 else
5135 new_phi_result = PHI_RESULT (new_phis[0]);
5136
5137 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5138 && reduc_fn != IFN_LAST)
5139 {
5140 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5141 various data values where the condition matched and another vector
5142 (INDUCTION_INDEX) containing all the indexes of those matches. We
5143 need to extract the last matching index (which will be the index with
5144 highest value) and use this to index into the data vector.
5145 For the case where there were no matches, the data vector will contain
5146 all default values and the index vector will be all zeros. */
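/* For example (hypothetical values): if INDUCTION_INDEX is {0, 0, 6, 0}
   and NEW_PHI_RESULT is {d0, d1, d2, d3}, the REDUC_MAX below yields 6,
   the comparison gives the mask {0, 0, 1, 0}, the VEC_COND keeps only d2,
   and the final unsigned MAX reduction extracts d2 as the scalar
   result. */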
5147
5148 /* Get various versions of the type of the vector of indexes. */
5149 tree index_vec_type = TREE_TYPE (induction_index);
5150 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5151 tree index_scalar_type = TREE_TYPE (index_vec_type);
5152 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5153
5154 /* Get an unsigned integer version of the type of the data vector. */
5155 int scalar_precision
5156 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5157 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5158 tree vectype_unsigned = build_vector_type
5159 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5160
5161 /* First we need to create a vector (ZERO_VEC) of zeros and another
5162 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5163 can create using a MAX reduction and then expanding.
5164 In the case where the loop never made any matches, the max index will
5165 be zero. */
5166
5167 /* Vector of {0, 0, 0,...}. */
5168 tree zero_vec = build_zero_cst (vectype);
5169
5170 gimple_seq stmts = NULL;
5171 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5172 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5173
5174 /* Find maximum value from the vector of found indexes. */
5175 tree max_index = make_ssa_name (index_scalar_type);
5176 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5177 1, induction_index);
5178 gimple_call_set_lhs (max_index_stmt, max_index);
5179 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5180
5181 /* Vector of {max_index, max_index, max_index,...}. */
5182 tree max_index_vec = make_ssa_name (index_vec_type);
5183 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5184 max_index);
5185 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5186 max_index_vec_rhs);
5187 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5188
5189 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5190 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5191 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5192 otherwise. Only one value should match, resulting in a vector
5193 (VEC_COND) with one data value and the rest zeros.
5194 In the case where the loop never made any matches, every index will
5195 match, resulting in a vector with all data values (which will all be
5196 the default value). */
5197
5198 /* Compare the max index vector to the vector of found indexes to find
5199 the position of the max value. */
5200 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5201 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5202 induction_index,
5203 max_index_vec);
5204 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5205
5206 /* Use the compare to choose either values from the data vector or
5207 zero. */
5208 tree vec_cond = make_ssa_name (vectype);
5209 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5210 vec_compare, new_phi_result,
5211 zero_vec);
5212 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5213
5214 /* Finally we need to extract the data value from the vector (VEC_COND)
5215 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5216 reduction, but because this doesn't exist, we can use a MAX reduction
5217 instead. The data value might be signed or a float so we need to cast
5218 it first.
5219 In the case where the loop never made any matches, the data values are
5220 all identical, and so will reduce down correctly. */
5221
5222 /* Make the matched data values unsigned. */
5223 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5224 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5225 vec_cond);
5226 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5227 VIEW_CONVERT_EXPR,
5228 vec_cond_cast_rhs);
5229 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5230
5231 /* Reduce down to a scalar value. */
5232 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5233 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5234 1, vec_cond_cast);
5235 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5236 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5237
5238 /* Convert the reduced value back to the result type and set as the
5239 result. */
5240 stmts = NULL;
5241 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5242 data_reduc);
5243 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5244 scalar_results.safe_push (new_temp);
5245 }
5246 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5247 && reduc_fn == IFN_LAST)
5248 {
5249 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5250 idx = 0;
5251 idx_val = induction_index[0];
5252 val = data_reduc[0];
5253 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5254 if (induction_index[i] > idx_val)
5255 val = data_reduc[i], idx_val = induction_index[i];
5256 return val; */
5257
5258 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5259 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5260 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5261 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5262 /* Enforced by vectorizable_reduction, which ensures we have target
5263 support before allowing a conditional reduction on variable-length
5264 vectors. */
5265 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5266 tree idx_val = NULL_TREE, val = NULL_TREE;
5267 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5268 {
5269 tree old_idx_val = idx_val;
5270 tree old_val = val;
5271 idx_val = make_ssa_name (idx_eltype);
5272 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5273 build3 (BIT_FIELD_REF, idx_eltype,
5274 induction_index,
5275 bitsize_int (el_size),
5276 bitsize_int (off)));
5277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278 val = make_ssa_name (data_eltype);
5279 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5280 build3 (BIT_FIELD_REF,
5281 data_eltype,
5282 new_phi_result,
5283 bitsize_int (el_size),
5284 bitsize_int (off)));
5285 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5286 if (off != 0)
5287 {
5288 tree new_idx_val = idx_val;
5289 if (off != v_size - el_size)
5290 {
5291 new_idx_val = make_ssa_name (idx_eltype);
5292 epilog_stmt = gimple_build_assign (new_idx_val,
5293 MAX_EXPR, idx_val,
5294 old_idx_val);
5295 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5296 }
5297 tree new_val = make_ssa_name (data_eltype);
5298 epilog_stmt = gimple_build_assign (new_val,
5299 COND_EXPR,
5300 build2 (GT_EXPR,
5301 boolean_type_node,
5302 idx_val,
5303 old_idx_val),
5304 val, old_val);
5305 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5306 idx_val = new_idx_val;
5307 val = new_val;
5308 }
5309 }
5310 /* Convert the reduced value back to the result type and set as the
5311 result. */
5312 gimple_seq stmts = NULL;
5313 val = gimple_convert (&stmts, scalar_type, val);
5314 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5315 scalar_results.safe_push (val);
5316 }
5317
5318 /* 2.3 Create the reduction code, using one of the three schemes described
5319 above. In SLP we simply need to extract all the elements from the
5320 vector (without reducing them), so we use scalar shifts. */
5321 else if (reduc_fn != IFN_LAST && !slp_reduc)
5322 {
5323 tree tmp;
5324 tree vec_elem_type;
5325
5326 /* Case 1: Create:
5327 v_out2 = reduc_expr <v_out1> */
5328
5329 if (dump_enabled_p ())
5330 dump_printf_loc (MSG_NOTE, vect_location,
5331 "Reduce using direct vector reduction.\n");
5332
5333 gimple_seq stmts = NULL;
5334 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5335 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5336 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5337 vec_elem_type, new_phi_result);
5338 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5339 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5340
5341 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5342 && induc_val)
5343 {
5344 /* Earlier we set the initial value to be a vector of induc_val
5345 values. Check the result and if it is induc_val then replace
5346 it with the original initial value, unless induc_val is
5347 the same as initial_def already. */
5348 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5349 induc_val);
5350
5351 tmp = make_ssa_name (new_scalar_dest);
5352 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5353 initial_def, new_temp);
5354 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5355 new_temp = tmp;
5356 }
5357
5358 scalar_results.safe_push (new_temp);
5359 }
5360 else if (direct_slp_reduc)
5361 {
5362 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5363 with the elements for other SLP statements replaced with the
5364 neutral value. We can then do a normal reduction on each vector. */
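/* Schematically, for REDUC_GROUP_SIZE == 2 and a partial-result vector
   {a0, b0, a1, b1, ...} the loop below builds {a0, N, a1, N, ...} for the
   first scalar result and {N, b0, N, b1, ...} for the second, where N is
   the neutral (or initial) value, and reduces each of them with
   REDUC_FN. */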
5365
5366 /* Enforced by vectorizable_reduction. */
5367 gcc_assert (new_phis.length () == 1);
5368 gcc_assert (pow2p_hwi (group_size));
5369
5370 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5371 vec<stmt_vec_info> orig_phis
5372 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5373 gimple_seq seq = NULL;
5374
5375 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5376 and the same element size as VECTYPE. */
5377 tree index = build_index_vector (vectype, 0, 1);
5378 tree index_type = TREE_TYPE (index);
5379 tree index_elt_type = TREE_TYPE (index_type);
5380 tree mask_type = truth_type_for (index_type);
5381
5382 /* Create a vector that, for each element, identifies which of
5383 the REDUC_GROUP_SIZE results should use it. */
5384 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5385 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5386 build_vector_from_val (index_type, index_mask));
5387
5388 /* Get a neutral vector value. This is simply a splat of the neutral
5389 scalar value if we have one, otherwise the initial scalar value
5390 is itself a neutral value. */
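/* E.g. 0 for PLUS and BIT_IOR, 1 for MULT; MIN and MAX have no universal
   neutral value, in which case the initial scalar value is used instead
   (see below). */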
5391 tree vector_identity = NULL_TREE;
5392 tree neutral_op = NULL_TREE;
5393 if (slp_node)
5394 {
5395 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5396 neutral_op
5397 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5398 vectype, code, first != NULL);
5399 }
5400 if (neutral_op)
5401 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5402 neutral_op);
5403 for (unsigned int i = 0; i < group_size; ++i)
5404 {
5405 /* If there's no universal neutral value, we can use the
5406 initial scalar value from the original PHI. This is used
5407 for MIN and MAX reduction, for example. */
5408 if (!neutral_op)
5409 {
5410 tree scalar_value
5411 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5412 loop_preheader_edge (loop));
5413 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5414 scalar_value);
5415 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5416 scalar_value);
5417 }
5418
5419 /* Calculate the equivalent of:
5420
5421 sel[j] = (index[j] == i);
5422
5423 which selects the elements of NEW_PHI_RESULT that should
5424 be included in the result. */
5425 tree compare_val = build_int_cst (index_elt_type, i);
5426 compare_val = build_vector_from_val (index_type, compare_val);
5427 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5428 index, compare_val);
5429
5430 /* Calculate the equivalent of:
5431
5432 vec = seq ? new_phi_result : vector_identity;
5433
5434 VEC is now suitable for a full vector reduction. */
5435 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5436 sel, new_phi_result, vector_identity);
5437
5438 /* Do the reduction and convert it to the appropriate type. */
5439 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5440 TREE_TYPE (vectype), vec);
5441 scalar = gimple_convert (&seq, scalar_type, scalar);
5442 scalar_results.safe_push (scalar);
5443 }
5444 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5445 }
5446 else
5447 {
5448 bool reduce_with_shift;
5449 tree vec_temp;
5450
5451 gcc_assert (slp_reduc || new_phis.length () == 1);
5452
5453 /* See if the target wants to do the final (shift) reduction
5454 in a vector mode of smaller size and first reduce upper/lower
5455 halves against each other. */
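/* For instance, a target may prefer to reduce a V4DImode accumulator by
   first combining its two V2DImode halves and only then doing the final
   shift/extract steps on the narrower vector. */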
5456 enum machine_mode mode1 = mode;
5457 tree stype = TREE_TYPE (vectype);
5458 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5459 unsigned nunits1 = nunits;
5460 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5461 && new_phis.length () == 1)
5462 {
5463 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5464 /* For SLP reductions we have to make sure lanes match up, but
5465 since we're doing an individual-element final reduction, reducing
5466 the vector width here is even more important.
5467 ??? We can also separate lanes with permutes; for the common
5468 case of a power-of-two group size, odd/even extracts would work. */
5469 if (slp_reduc && nunits != nunits1)
5470 {
5471 nunits1 = least_common_multiple (nunits1, group_size);
5472 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5473 }
5474 }
5475 if (!slp_reduc
5476 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5477 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5478
5479 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5480 stype, nunits1);
5481 reduce_with_shift = have_whole_vector_shift (mode1);
5482 if (!VECTOR_MODE_P (mode1))
5483 reduce_with_shift = false;
5484 else
5485 {
5486 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5487 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5488 reduce_with_shift = false;
5489 }
5490
5491 /* First reduce the vector to the desired vector size we should
5492 do shift reduction on by combining upper and lower halves. */
5493 new_temp = new_phi_result;
5494 while (nunits > nunits1)
5495 {
5496 nunits /= 2;
5497 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5498 stype, nunits);
5499 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5500
5501 /* The target has to make sure we support lowpart/highpart
5502 extraction, either via direct vector extract or through
5503 an integer mode punning. */
5504 tree dst1, dst2;
5505 if (convert_optab_handler (vec_extract_optab,
5506 TYPE_MODE (TREE_TYPE (new_temp)),
5507 TYPE_MODE (vectype1))
5508 != CODE_FOR_nothing)
5509 {
5510 /* Extract sub-vectors directly once vec_extract becomes
5511 a conversion optab. */
5512 dst1 = make_ssa_name (vectype1);
5513 epilog_stmt
5514 = gimple_build_assign (dst1, BIT_FIELD_REF,
5515 build3 (BIT_FIELD_REF, vectype1,
5516 new_temp, TYPE_SIZE (vectype1),
5517 bitsize_int (0)));
5518 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5519 dst2 = make_ssa_name (vectype1);
5520 epilog_stmt
5521 = gimple_build_assign (dst2, BIT_FIELD_REF,
5522 build3 (BIT_FIELD_REF, vectype1,
5523 new_temp, TYPE_SIZE (vectype1),
5524 bitsize_int (bitsize)));
5525 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5526 }
5527 else
5528 {
5529 /* Extract via punning to appropriately sized integer mode
5530 vector. */
5531 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5532 tree etype = build_vector_type (eltype, 2);
5533 gcc_assert (convert_optab_handler (vec_extract_optab,
5534 TYPE_MODE (etype),
5535 TYPE_MODE (eltype))
5536 != CODE_FOR_nothing);
5537 tree tem = make_ssa_name (etype);
5538 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5539 build1 (VIEW_CONVERT_EXPR,
5540 etype, new_temp));
5541 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5542 new_temp = tem;
5543 tem = make_ssa_name (eltype);
5544 epilog_stmt
5545 = gimple_build_assign (tem, BIT_FIELD_REF,
5546 build3 (BIT_FIELD_REF, eltype,
5547 new_temp, TYPE_SIZE (eltype),
5548 bitsize_int (0)));
5549 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5550 dst1 = make_ssa_name (vectype1);
5551 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5552 build1 (VIEW_CONVERT_EXPR,
5553 vectype1, tem));
5554 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5555 tem = make_ssa_name (eltype);
5556 epilog_stmt
5557 = gimple_build_assign (tem, BIT_FIELD_REF,
5558 build3 (BIT_FIELD_REF, eltype,
5559 new_temp, TYPE_SIZE (eltype),
5560 bitsize_int (bitsize)));
5561 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5562 dst2 = make_ssa_name (vectype1);
5563 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5564 build1 (VIEW_CONVERT_EXPR,
5565 vectype1, tem));
5566 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5567 }
5568
5569 new_temp = make_ssa_name (vectype1);
5570 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5571 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5572 new_phis[0] = epilog_stmt;
5573 }
5574
5575 if (reduce_with_shift && !slp_reduc)
5576 {
5577 int element_bitsize = tree_to_uhwi (bitsize);
5578 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5579 for variable-length vectors and also requires direct target support
5580 for loop reductions. */
5581 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5582 int nelements = vec_size_in_bits / element_bitsize;
5583 vec_perm_builder sel;
5584 vec_perm_indices indices;
5585
5586 int elt_offset;
5587
5588 tree zero_vec = build_zero_cst (vectype1);
5589 /* Case 2: Create:
5590 for (offset = nelements/2; offset >= 1; offset/=2)
5591 {
5592 Create: va' = vec_shift <va, offset>
5593 Create: va = vop <va, va'>
5594 } */
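/* E.g. for a four-element PLUS reduction this computes, schematically:
     va = {a,b,c,d}
     va = va + vec_shift <va, 2>   -> { a+c, b+d, ., . }
     va = va + vec_shift <va, 1>   -> { a+b+c+d, ., ., . }
   and the scalar result is then extracted from element 0 below. */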
5595
5596 tree rhs;
5597
5598 if (dump_enabled_p ())
5599 dump_printf_loc (MSG_NOTE, vect_location,
5600 "Reduce using vector shifts\n");
5601
5602 gimple_seq stmts = NULL;
5603 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5604 for (elt_offset = nelements / 2;
5605 elt_offset >= 1;
5606 elt_offset /= 2)
5607 {
5608 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5609 indices.new_vector (sel, 2, nelements);
5610 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5611 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5612 new_temp, zero_vec, mask);
5613 new_temp = gimple_build (&stmts, code,
5614 vectype1, new_name, new_temp);
5615 }
5616 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5617
5618 /* 2.4 Extract the final scalar result. Create:
5619 s_out3 = extract_field <v_out2, bitpos> */
5620
5621 if (dump_enabled_p ())
5622 dump_printf_loc (MSG_NOTE, vect_location,
5623 "extract scalar result\n");
5624
5625 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5626 bitsize, bitsize_zero_node);
5627 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5628 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5629 gimple_assign_set_lhs (epilog_stmt, new_temp);
5630 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5631 scalar_results.safe_push (new_temp);
5632 }
5633 else
5634 {
5635 /* Case 3: Create:
5636 s = extract_field <v_out2, 0>
5637 for (offset = element_size;
5638 offset < vector_size;
5639 offset += element_size;)
5640 {
5641 Create: s' = extract_field <v_out2, offset>
5642 Create: s = op <s, s'> // For non SLP cases
5643 } */
5644
5645 if (dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "Reduce using scalar code.\n");
5648
5649 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5650 int element_bitsize = tree_to_uhwi (bitsize);
5651 tree compute_type = TREE_TYPE (vectype);
5652 gimple_seq stmts = NULL;
5653 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5654 {
5655 int bit_offset;
5656 if (gimple_code (new_phi) == GIMPLE_PHI)
5657 vec_temp = PHI_RESULT (new_phi);
5658 else
5659 vec_temp = gimple_assign_lhs (new_phi);
5660 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5661 vec_temp, bitsize, bitsize_zero_node);
5662
5663 /* In SLP we don't need to apply reduction operation, so we just
5664 collect s' values in SCALAR_RESULTS. */
5665 if (slp_reduc)
5666 scalar_results.safe_push (new_temp);
5667
5668 for (bit_offset = element_bitsize;
5669 bit_offset < vec_size_in_bits;
5670 bit_offset += element_bitsize)
5671 {
5672 tree bitpos = bitsize_int (bit_offset);
5673 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5674 compute_type, vec_temp,
5675 bitsize, bitpos);
5676 if (slp_reduc)
5677 {
5678 /* In SLP we don't need to apply reduction operation, so
5679 we just collect s' values in SCALAR_RESULTS. */
5680 new_temp = new_name;
5681 scalar_results.safe_push (new_name);
5682 }
5683 else
5684 new_temp = gimple_build (&stmts, code, compute_type,
5685 new_name, new_temp);
5686 }
5687 }
5688
5689 /* The only case where we need to reduce scalar results in SLP is
5690 unrolling. If the size of SCALAR_RESULTS is greater than
5691 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5692 REDUC_GROUP_SIZE. */
5693 if (slp_reduc)
5694 {
5695 tree res, first_res, new_res;
5696
5697 /* Reduce multiple scalar results in case of SLP unrolling. */
5698 for (j = group_size; scalar_results.iterate (j, &res);
5699 j++)
5700 {
5701 first_res = scalar_results[j % group_size];
5702 new_res = gimple_build (&stmts, code, compute_type,
5703 first_res, res);
5704 scalar_results[j % group_size] = new_res;
5705 }
5706 for (k = 0; k < group_size; k++)
5707 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5708 scalar_results[k]);
5709 }
5710 else
5711 {
5712 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5713 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5714 scalar_results.safe_push (new_temp);
5715 }
5716
5717 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5718 }
5719
5720 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5721 && induc_val)
5722 {
5723 /* Earlier we set the initial value to be a vector of induc_val
5724 values. Check the result and if it is induc_val then replace
5725 it with the original initial value, unless induc_val is
5726 the same as initial_def already. */
5727 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5728 induc_val);
5729
5730 tree tmp = make_ssa_name (new_scalar_dest);
5731 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5732 initial_def, new_temp);
5733 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5734 scalar_results[0] = tmp;
5735 }
5736 }
5737
5738 /* 2.5 Adjust the final result by the initial value of the reduction
5739 variable. (When such adjustment is not needed, then
5740 'adjustment_def' is zero). For example, if code is PLUS we create:
5741 new_temp = loop_exit_def + adjustment_def */
5742
5743 if (adjustment_def)
5744 {
5745 gcc_assert (!slp_reduc);
5746 gimple_seq stmts = NULL;
5747 if (nested_in_vect_loop)
5748 {
5749 new_phi = new_phis[0];
5750 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5751 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5752 new_temp = gimple_build (&stmts, code, vectype,
5753 PHI_RESULT (new_phi), adjustment_def);
5754 }
5755 else
5756 {
5757 new_temp = scalar_results[0];
5758 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5759 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5760 new_temp = gimple_build (&stmts, code, scalar_type,
5761 new_temp, adjustment_def);
5762 }
5763
5764 epilog_stmt = gimple_seq_last_stmt (stmts);
5765 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5766 if (nested_in_vect_loop)
5767 {
5768 if (!double_reduc)
5769 scalar_results.quick_push (new_temp);
5770 else
5771 scalar_results[0] = new_temp;
5772 }
5773 else
5774 scalar_results[0] = new_temp;
5775
5776 new_phis[0] = epilog_stmt;
5777 }
5778
5779 if (double_reduc)
5780 loop = loop->inner;
5781
5782 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5783 phis with new adjusted scalar results, i.e., replace use <s_out0>
5784 with use <s_out4>.
5785
5786 Transform:
5787 loop_exit:
5788 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5789 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5790 v_out2 = reduce <v_out1>
5791 s_out3 = extract_field <v_out2, 0>
5792 s_out4 = adjust_result <s_out3>
5793 use <s_out0>
5794 use <s_out0>
5795
5796 into:
5797
5798 loop_exit:
5799 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5800 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5801 v_out2 = reduce <v_out1>
5802 s_out3 = extract_field <v_out2, 0>
5803 s_out4 = adjust_result <s_out3>
5804 use <s_out4>
5805 use <s_out4> */
5806
5807
5808 /* In an SLP reduction chain we reduce vector results into one vector if
5809 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5810 LHS of the last stmt in the reduction chain, since we are looking for
5811 the loop exit phi node. */
5812 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5813 {
5814 stmt_vec_info dest_stmt_info
5815 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5816 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5817 group_size = 1;
5818 }
5819
5820 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5821 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5822 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5823 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5824 correspond to the first vector stmt, etc.
5825 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5826 if (group_size > new_phis.length ())
5827 gcc_assert (!(group_size % new_phis.length ()));
5828
5829 for (k = 0; k < group_size; k++)
5830 {
5831 if (slp_reduc)
5832 {
5833 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5834
5835 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5836 /* SLP statements can't participate in patterns. */
5837 gcc_assert (!orig_stmt_info);
5838 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5839 }
5840
5841 if (nested_in_vect_loop)
5842 {
5843 if (double_reduc)
5844 loop = outer_loop;
5845 else
5846 gcc_unreachable ();
5847 }
5848
5849 phis.create (3);
5850 /* Find the loop-closed-use at the loop exit of the original scalar
5851 result. (The reduction result is expected to have two immediate uses,
5852 one at the latch block, and one at the loop exit). For double
5853 reductions we are looking for exit phis of the outer loop. */
5854 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5855 {
5856 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5857 {
5858 if (!is_gimple_debug (USE_STMT (use_p)))
5859 phis.safe_push (USE_STMT (use_p));
5860 }
5861 else
5862 {
5863 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5864 {
5865 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5866
5867 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5868 {
5869 if (!flow_bb_inside_loop_p (loop,
5870 gimple_bb (USE_STMT (phi_use_p)))
5871 && !is_gimple_debug (USE_STMT (phi_use_p)))
5872 phis.safe_push (USE_STMT (phi_use_p));
5873 }
5874 }
5875 }
5876 }
5877
5878 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5879 {
5880 /* Replace the uses: */
5881 orig_name = PHI_RESULT (exit_phi);
5882 scalar_result = scalar_results[k];
5883 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5884 {
5885 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5886 SET_USE (use_p, scalar_result);
5887 update_stmt (use_stmt);
5888 }
5889 }
5890
5891 phis.release ();
5892 }
5893 }
5894
5895 /* Return a vector of type VECTYPE that is equal to the vector select
5896 operation "MASK ? VEC : IDENTITY". Insert the select statements
5897 before GSI. */
5898
5899 static tree
5900 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5901 tree vec, tree identity)
5902 {
5903 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5904 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5905 mask, vec, identity);
5906 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5907 return cond;
5908 }
5909
5910 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5911 order, starting with LHS. Insert the extraction statements before GSI and
5912 associate the new scalar SSA names with variable SCALAR_DEST.
5913 Return the SSA name for the result. */
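/* For a four-element VECTOR_RHS this amounts to
     ((((LHS CODE v[0]) CODE v[1]) CODE v[2]) CODE v[3]),
   with one BIT_FIELD_REF extraction per element. */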
5914
5915 static tree
5916 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5917 tree_code code, tree lhs, tree vector_rhs)
5918 {
5919 tree vectype = TREE_TYPE (vector_rhs);
5920 tree scalar_type = TREE_TYPE (vectype);
5921 tree bitsize = TYPE_SIZE (scalar_type);
5922 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5923 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5924
5925 for (unsigned HOST_WIDE_INT bit_offset = 0;
5926 bit_offset < vec_size_in_bits;
5927 bit_offset += element_bitsize)
5928 {
5929 tree bitpos = bitsize_int (bit_offset);
5930 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5931 bitsize, bitpos);
5932
5933 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5934 rhs = make_ssa_name (scalar_dest, stmt);
5935 gimple_assign_set_lhs (stmt, rhs);
5936 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5937
5938 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5939 tree new_name = make_ssa_name (scalar_dest, stmt);
5940 gimple_assign_set_lhs (stmt, new_name);
5941 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5942 lhs = new_name;
5943 }
5944 return lhs;
5945 }
5946
5947 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5948 type of the vector input. */
5949
5950 static internal_fn
5951 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5952 {
5953 internal_fn mask_reduc_fn;
5954
5955 switch (reduc_fn)
5956 {
5957 case IFN_FOLD_LEFT_PLUS:
5958 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5959 break;
5960
5961 default:
5962 return IFN_LAST;
5963 }
5964
5965 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5966 OPTIMIZE_FOR_SPEED))
5967 return mask_reduc_fn;
5968 return IFN_LAST;
5969 }
5970
5971 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5972 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5973 statement. CODE is the operation performed by STMT_INFO and OPS are
5974 its scalar operands. REDUC_INDEX is the index of the operand in
5975 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5976 implements in-order reduction, or IFN_LAST if we should open-code it.
5977 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5978 that should be used to control the operation in a fully-masked loop. */
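/* Schematically, an in-order reduction computes
     res = init;
     for each vector def v: for each element e of v (in order):
       res = res CODE e;
   either via REDUC_FN / MASK_REDUC_FN or, if neither is available, via the
   open-coded expansion in vect_expand_fold_left. */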
5979
5980 static bool
5981 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5982 stmt_vec_info stmt_info,
5983 gimple_stmt_iterator *gsi,
5984 gimple **vec_stmt, slp_tree slp_node,
5985 gimple *reduc_def_stmt,
5986 tree_code code, internal_fn reduc_fn,
5987 tree ops[3], tree vectype_in,
5988 int reduc_index, vec_loop_masks *masks)
5989 {
5990 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5991 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5992 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5993
5994 int ncopies;
5995 if (slp_node)
5996 ncopies = 1;
5997 else
5998 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5999
6000 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6001 gcc_assert (ncopies == 1);
6002 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6003
6004 if (slp_node)
6005 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6006 TYPE_VECTOR_SUBPARTS (vectype_in)));
6007
6008 tree op0 = ops[1 - reduc_index];
6009
6010 int group_size = 1;
6011 stmt_vec_info scalar_dest_def_info;
6012 auto_vec<tree> vec_oprnds0;
6013 if (slp_node)
6014 {
6015 auto_vec<vec<tree> > vec_defs (2);
6016 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6017 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6018 vec_defs[0].release ();
6019 vec_defs[1].release ();
6020 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6021 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6022 }
6023 else
6024 {
6025 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6026 op0, &vec_oprnds0);
6027 scalar_dest_def_info = stmt_info;
6028 }
6029
6030 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6031 tree scalar_type = TREE_TYPE (scalar_dest);
6032 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6033
6034 int vec_num = vec_oprnds0.length ();
6035 gcc_assert (vec_num == 1 || slp_node);
6036 tree vec_elem_type = TREE_TYPE (vectype_out);
6037 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6038
6039 tree vector_identity = NULL_TREE;
6040 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6041 vector_identity = build_zero_cst (vectype_out);
6042
6043 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6044 int i;
6045 tree def0;
6046 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6047 {
6048 gimple *new_stmt;
6049 tree mask = NULL_TREE;
6050 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6051 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6052
6053 /* Handle MINUS by adding the negative. */
6054 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6055 {
6056 tree negated = make_ssa_name (vectype_out);
6057 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6058 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6059 def0 = negated;
6060 }
6061
6062 if (mask && mask_reduc_fn == IFN_LAST)
6063 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6064 vector_identity);
6065
6066 /* On the first iteration the input is simply the scalar phi
6067 result, and for subsequent iterations it is the output of
6068 the preceding operation. */
6069 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6070 {
6071 if (mask && mask_reduc_fn != IFN_LAST)
6072 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6073 def0, mask);
6074 else
6075 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6076 def0);
6077 /* For chained SLP reductions the output of the previous reduction
6078 operation serves as the input of the next. For the final statement
6079 the output cannot be a temporary - we reuse the original
6080 scalar destination of the last statement. */
6081 if (i != vec_num - 1)
6082 {
6083 gimple_set_lhs (new_stmt, scalar_dest_var);
6084 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6085 gimple_set_lhs (new_stmt, reduc_var);
6086 }
6087 }
6088 else
6089 {
6090 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6091 reduc_var, def0);
6092 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6093 /* Remove the statement, so that we can use the same code paths
6094 as for statements that we've just created. */
6095 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6096 gsi_remove (&tmp_gsi, true);
6097 }
6098
6099 if (i == vec_num - 1)
6100 {
6101 gimple_set_lhs (new_stmt, scalar_dest);
6102 vect_finish_replace_stmt (loop_vinfo,
6103 scalar_dest_def_info,
6104 new_stmt);
6105 }
6106 else
6107 vect_finish_stmt_generation (loop_vinfo,
6108 scalar_dest_def_info,
6109 new_stmt, gsi);
6110
6111 if (slp_node)
6112 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6113 else
6114 {
6115 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6116 *vec_stmt = new_stmt;
6117 }
6118 }
6119
6120 return true;
6121 }
6122
6123 /* Function is_nonwrapping_integer_induction.
6124
6125 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6126 both increments and does not cause overflow. */
6127
6128 static bool
6129 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6130 {
6131 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6132 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6133 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6134 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6135 widest_int ni, max_loop_value, lhs_max;
6136 wi::overflow_type overflow = wi::OVF_NONE;
6137
6138 /* Make sure the loop is integer based. */
6139 if (TREE_CODE (base) != INTEGER_CST
6140 || TREE_CODE (step) != INTEGER_CST)
6141 return false;
6142
6143 /* Check that the max size of the loop will not wrap. */
6144
6145 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6146 return true;
6147
6148 if (! max_stmt_executions (loop, &ni))
6149 return false;
6150
6151 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6152 &overflow);
6153 if (overflow)
6154 return false;
6155
6156 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6157 TYPE_SIGN (lhs_type), &overflow);
6158 if (overflow)
6159 return false;
6160
6161 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6162 <= TYPE_PRECISION (lhs_type));
6163 }
6164
6165 /* Check if masking can be supported by inserting a conditional expression.
6166 CODE is the code for the operation. COND_FN is the conditional internal
6167 function, if it exists. VECTYPE_IN is the type of the vector input. */
6168 static bool
6169 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6170 tree vectype_in)
6171 {
6172 if (cond_fn != IFN_LAST
6173 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6174 OPTIMIZE_FOR_SPEED))
6175 return false;
6176
6177 switch (code)
6178 {
6179 case DOT_PROD_EXPR:
6180 case SAD_EXPR:
6181 return true;
6182
6183 default:
6184 return false;
6185 }
6186 }
6187
6188 /* Insert a conditional expression to enable masked vectorization. CODE is the
6189 code for the operation. VOP is the array of operands. MASK is the loop
6190 mask. GSI is a statement iterator used to place the new conditional
6191 expression. */
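/* For DOT_PROD_EXPR the inactive lanes of operand 1 are replaced by zero
   so that they contribute nothing to the accumulator; for SAD_EXPR they
   are replaced by operand 0 so that the absolute difference for those
   lanes is zero. */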
6192 static void
6193 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6194 gimple_stmt_iterator *gsi)
6195 {
6196 switch (code)
6197 {
6198 case DOT_PROD_EXPR:
6199 {
6200 tree vectype = TREE_TYPE (vop[1]);
6201 tree zero = build_zero_cst (vectype);
6202 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6203 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6204 mask, vop[1], zero);
6205 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6206 vop[1] = masked_op1;
6207 break;
6208 }
6209
6210 case SAD_EXPR:
6211 {
6212 tree vectype = TREE_TYPE (vop[1]);
6213 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6214 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6215 mask, vop[1], vop[0]);
6216 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6217 vop[1] = masked_op1;
6218 break;
6219 }
6220
6221 default:
6222 gcc_unreachable ();
6223 }
6224 }
6225
6226 /* Function vectorizable_reduction.
6227
6228 Check if STMT_INFO performs a reduction operation that can be vectorized.
6229 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6230 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6231 Return true if STMT_INFO is vectorizable in this way.
6232
6233 This function also handles reduction idioms (patterns) that have been
6234 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6235 may be of this form:
6236 X = pattern_expr (arg0, arg1, ..., X)
6237 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6238 sequence that had been detected and replaced by the pattern-stmt
6239 (STMT_INFO).
6240
6241 This function also handles reduction of condition expressions, for example:
6242 for (int i = 0; i < N; i++)
6243 if (a[i] < value)
6244 last = a[i];
6245 This is handled by vectorising the loop and creating an additional vector
6246 containing the loop indexes for which "a[i] < value" was true. In the
6247 function epilogue this is reduced to a single max value and then used to
6248 index into the vector of results.
6249
6250 In some cases of reduction patterns, the type of the reduction variable X is
6251 different than the type of the other arguments of STMT_INFO.
6252 In such cases, the vectype that is used when transforming STMT_INFO into
6253 a vector stmt is different than the vectype that is used to determine the
6254 vectorization factor, because it consists of a different number of elements
6255 than the actual number of elements that are being operated upon in parallel.
6256
6257 For example, consider an accumulation of shorts into an int accumulator.
6258 On some targets it's possible to vectorize this pattern operating on 8
6259 shorts at a time (hence, the vectype for purposes of determining the
6260 vectorization factor should be V8HI); on the other hand, the vectype that
6261 is used to create the vector form is actually V4SI (the type of the result).
6262
6263 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6264 indicates what is the actual level of parallelism (V8HI in the example), so
6265 that the right vectorization factor would be derived. This vectype
6266 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6267 be used to create the vectorized stmt. The right vectype for the vectorized
6268 stmt is obtained from the type of the result X:
6269 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6270
6271 This means that, contrary to "regular" reductions (or "regular" stmts in
6272 general), the following equation:
6273 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6274 does *NOT* necessarily hold for reduction patterns. */
6275
6276 bool
6277 vectorizable_reduction (loop_vec_info loop_vinfo,
6278 stmt_vec_info stmt_info, slp_tree slp_node,
6279 slp_instance slp_node_instance,
6280 stmt_vector_for_cost *cost_vec)
6281 {
6282 tree scalar_dest;
6283 tree vectype_in = NULL_TREE;
6284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6285 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6286 stmt_vec_info cond_stmt_vinfo = NULL;
6287 tree scalar_type;
6288 int i;
6289 int ncopies;
6290 bool single_defuse_cycle = false;
6291 bool nested_cycle = false;
6292 bool double_reduc = false;
6293 int vec_num;
6294 tree tem;
6295 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6296 tree cond_reduc_val = NULL_TREE;
6297
6298 /* Make sure it was already recognized as a reduction computation. */
6299 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6300 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6301 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6302 return false;
6303
6304 /* The stmt we store reduction analysis meta on. */
6305 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6306 reduc_info->is_reduc_info = true;
6307
6308 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6309 {
6310 if (is_a <gphi *> (stmt_info->stmt))
6311 {
6312 if (slp_node)
6313 {
6314 /* We eventually need to set a vector type on invariant
6315 arguments. */
6316 unsigned j;
6317 slp_tree child;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6319 if (!vect_maybe_update_slp_op_vectype
6320 (child, SLP_TREE_VECTYPE (slp_node)))
6321 {
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6324 "incompatible vector types for "
6325 "invariants\n");
6326 return false;
6327 }
6328 }
6329 /* Analysis for double-reduction is done on the outer
6330 loop PHI, nested cycles have no further restrictions. */
6331 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6332 }
6333 else
6334 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6335 return true;
6336 }
6337
6338 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6339 stmt_vec_info phi_info = stmt_info;
6340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6342 {
6343 if (!is_a <gphi *> (stmt_info->stmt))
6344 {
6345 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6346 return true;
6347 }
6348 if (slp_node)
6349 {
6350 slp_node_instance->reduc_phis = slp_node;
6351 /* ??? We're leaving slp_node to point to the PHIs; we only
6352 need it to get at the number of vector stmts, which wasn't
6353 yet initialized for the instance root. */
6354 }
6355 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6356 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6357 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6358 {
6359 use_operand_p use_p;
6360 gimple *use_stmt;
6361 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6362 &use_p, &use_stmt);
6363 gcc_assert (res);
6364 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6365 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6366 }
6367 }
6368
6369 /* PHIs should not participate in patterns. */
6370 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6371 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6372
6373 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6374 and compute the reduction chain length. Discover the real
6375 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6376 tree reduc_def
6377 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6378 loop_latch_edge
6379 (gimple_bb (reduc_def_phi)->loop_father));
6380 unsigned reduc_chain_length = 0;
6381 bool only_slp_reduc_chain = true;
6382 stmt_info = NULL;
6383 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6384 while (reduc_def != PHI_RESULT (reduc_def_phi))
6385 {
6386 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6387 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6388 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6389 {
6390 if (dump_enabled_p ())
6391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6392 "reduction chain broken by patterns.\n");
6393 return false;
6394 }
6395 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6396 only_slp_reduc_chain = false;
6397 /* ??? For epilogue generation live members of the chain need
6398 to point back to the PHI via their original stmt for
6399 info_for_reduction to work. */
6400 if (STMT_VINFO_LIVE_P (vdef))
6401 STMT_VINFO_REDUC_DEF (def) = phi_info;
6402 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6403 if (!assign)
6404 {
6405 if (dump_enabled_p ())
6406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6407 "reduction chain includes calls.\n");
6408 return false;
6409 }
6410 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6411 {
6412 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6413 TREE_TYPE (gimple_assign_rhs1 (assign))))
6414 {
6415 if (dump_enabled_p ())
6416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6417 "conversion in the reduction chain.\n");
6418 return false;
6419 }
6420 }
6421 else if (!stmt_info)
6422 /* First non-conversion stmt. */
6423 stmt_info = vdef;
6424 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6425 reduc_chain_length++;
6426 if (!stmt_info && slp_node)
6427 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6428 }
6429 /* PHIs should not participate in patterns. */
6430 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6431
6432 if (nested_in_vect_loop_p (loop, stmt_info))
6433 {
6434 loop = loop->inner;
6435 nested_cycle = true;
6436 }
6437
6438 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6439 element. */
6440 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6441 {
6442 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6443 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6444 }
6445 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6446 gcc_assert (slp_node
6447 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6448
6449 /* 1. Is vectorizable reduction? */
6450 /* Not supportable if the reduction variable is used in the loop, unless
6451 it's a reduction chain. */
6452 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6453 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6454 return false;
6455
6456 /* Reductions that are not used even in an enclosing outer-loop
6457 are expected to be "live" (used out of the loop). */
6458 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6459 && !STMT_VINFO_LIVE_P (stmt_info))
6460 return false;
6461
6462 /* 2. Has this been recognized as a reduction pattern?
6463
6464 Check if STMT represents a pattern that has been recognized
6465 in earlier analysis stages. For stmts that represent a pattern,
6466 the STMT_VINFO_RELATED_STMT field records the last stmt in
6467 the original sequence that constitutes the pattern. */
6468
6469 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6470 if (orig_stmt_info)
6471 {
6472 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6473 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6474 }
6475
6476 /* 3. Check the operands of the operation. The first operands are defined
6477 inside the loop body. The last operand is the reduction variable,
6478 which is defined by the loop-header-phi. */
6479
6480 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6481 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6482 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6483 enum tree_code code = gimple_assign_rhs_code (stmt);
6484 bool lane_reduc_code_p
6485 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
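  /* Illustrative sketch (not taken from this function): a lane-reducing
     reduction such as DOT_PROD_EXPR typically comes from source like

       short a[N], b[N]; int sum;
       for (i = 0; i < N; i++)
         sum += (int) a[i] * (int) b[i];

     where a single vector statement accumulates several narrow input
     lanes into fewer, wider accumulator lanes.  */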
6486 int op_type = TREE_CODE_LENGTH (code);
6487
6488 scalar_dest = gimple_assign_lhs (stmt);
6489 scalar_type = TREE_TYPE (scalar_dest);
6490 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6491 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6492 return false;
6493
6494 /* Do not try to vectorize bit-precision reductions. */
6495 if (!type_has_mode_precision_p (scalar_type))
6496 return false;
6497
6498 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6499 which means the only use of the PHI result may be in the lane-reducing operation. */
6500 if (lane_reduc_code_p
6501 && reduc_chain_length != 1
6502 && !only_slp_reduc_chain)
6503 {
6504 if (dump_enabled_p ())
6505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6506 "lane-reducing reduction with extra stmts.\n");
6507 return false;
6508 }
6509
6510 /* All uses but the last are expected to be defined in the loop.
6511 The last use is the reduction variable. In case of nested cycle this
6512 assumption is not true: we use reduc_index to record the index of the
6513 reduction variable. */
6514 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6515 /* We need to skip an extra operand for COND_EXPRs with embedded
6516 comparison. */
6517 unsigned opno_adjust = 0;
6518 if (code == COND_EXPR
6519 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6520 opno_adjust = 1;
6521 for (i = 0; i < op_type; i++)
6522 {
6523 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6524 if (i == 0 && code == COND_EXPR)
6525 continue;
6526
6527 stmt_vec_info def_stmt_info;
6528 enum vect_def_type dt;
6529 tree op;
6530 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6531 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6532 &def_stmt_info))
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "use not simple.\n");
6537 return false;
6538 }
6539 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6540 continue;
6541
6542 /* There should be only one cycle def in the stmt, the one
6543 leading to reduc_def. */
6544 if (VECTORIZABLE_CYCLE_DEF (dt))
6545 return false;
6546
6547 /* To properly compute ncopies we are interested in the widest
6548 non-reduction input type in case we're looking at a widening
6549 accumulation that we later handle in vect_transform_reduction. */
6550 if (lane_reduc_code_p
6551 && tem
6552 && (!vectype_in
6553 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6554 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6555 vectype_in = tem;
6556
6557 if (code == COND_EXPR)
6558 {
6559 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6560 if (dt == vect_constant_def)
6561 {
6562 cond_reduc_dt = dt;
6563 cond_reduc_val = op;
6564 }
6565 if (dt == vect_induction_def
6566 && def_stmt_info
6567 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6568 {
6569 cond_reduc_dt = dt;
6570 cond_stmt_vinfo = def_stmt_info;
6571 }
6572 }
6573 }
6574 if (!vectype_in)
6575 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6576 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6577
6578 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6579 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6580 /* If we have a condition reduction, see if we can simplify it further. */
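  /* Illustrative example of a condition reduction (variable names made up):

       for (i = 0; i < N; i++)
         if (a[i] < min_val)
           last = i;

     the scalar result is selected by a condition rather than accumulated,
     so the epilogue roughly has to recover the value from the lane of the
     last iteration for which the condition held.  */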
6581 if (v_reduc_type == COND_REDUCTION)
6582 {
6583 if (slp_node)
6584 return false;
6585
6586 /* Fail when the condition of the COND_EXPR uses the reduction value itself. */
6587 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6588 {
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6591 "condition depends on previous iteration\n");
6592 return false;
6593 }
6594
6595 if (reduc_chain_length == 1
6596 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6597 vectype_in, OPTIMIZE_FOR_SPEED))
6598 {
6599 if (dump_enabled_p ())
6600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6601 "optimizing condition reduction with"
6602 " FOLD_EXTRACT_LAST.\n");
6603 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6604 }
6605 else if (cond_reduc_dt == vect_induction_def)
6606 {
6607 tree base
6608 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6609 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6610
6611 gcc_assert (TREE_CODE (base) == INTEGER_CST
6612 && TREE_CODE (step) == INTEGER_CST);
6613 cond_reduc_val = NULL_TREE;
6614 enum tree_code cond_reduc_op_code = ERROR_MARK;
6615 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6616 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6617 ;
6618 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6619 above base; punt if base is the minimum value of the type for
6620 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6621 else if (tree_int_cst_sgn (step) == -1)
6622 {
6623 cond_reduc_op_code = MIN_EXPR;
6624 if (tree_int_cst_sgn (base) == -1)
6625 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6626 else if (tree_int_cst_lt (base,
6627 TYPE_MAX_VALUE (TREE_TYPE (base))))
6628 cond_reduc_val
6629 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6630 }
6631 else
6632 {
6633 cond_reduc_op_code = MAX_EXPR;
6634 if (tree_int_cst_sgn (base) == 1)
6635 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6636 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6637 base))
6638 cond_reduc_val
6639 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6640 }
6641 if (cond_reduc_val)
6642 {
6643 if (dump_enabled_p ())
6644 dump_printf_loc (MSG_NOTE, vect_location,
6645 "condition expression based on "
6646 "integer induction.\n");
6647 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6648 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6649 = cond_reduc_val;
6650 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6651 }
6652 }
6653 else if (cond_reduc_dt == vect_constant_def)
6654 {
6655 enum vect_def_type cond_initial_dt;
6656 tree cond_initial_val
6657 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6658
6659 gcc_assert (cond_reduc_val != NULL_TREE);
6660 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6661 if (cond_initial_dt == vect_constant_def
6662 && types_compatible_p (TREE_TYPE (cond_initial_val),
6663 TREE_TYPE (cond_reduc_val)))
6664 {
6665 tree e = fold_binary (LE_EXPR, boolean_type_node,
6666 cond_initial_val, cond_reduc_val);
6667 if (e && (integer_onep (e) || integer_zerop (e)))
6668 {
6669 if (dump_enabled_p ())
6670 dump_printf_loc (MSG_NOTE, vect_location,
6671 "condition expression based on "
6672 "compile time constant.\n");
6673 /* Record reduction code at analysis stage. */
6674 STMT_VINFO_REDUC_CODE (reduc_info)
6675 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6676 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6677 }
6678 }
6679 }
6680 }
6681
6682 if (STMT_VINFO_LIVE_P (phi_info))
6683 return false;
6684
6685 if (slp_node)
6686 ncopies = 1;
6687 else
6688 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6689
6690 gcc_assert (ncopies >= 1);
6691
6692 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6693
6694 if (nested_cycle)
6695 {
6696 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6697 == vect_double_reduction_def);
6698 double_reduc = true;
6699 }
6700
6701 /* 4.2. Check support for the epilog operation.
6702
6703 If STMT represents a reduction pattern, then the type of the
6704 reduction variable may be different than the type of the rest
6705 of the arguments. For example, consider the case of accumulation
6706 of shorts into an int accumulator; The original code:
6707 S1: int_a = (int) short_a;
6708 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6709
6710 was replaced with:
6711 STMT: int_acc = widen_sum <short_a, int_acc>
6712
6713 This means that:
6714 1. The tree-code that is used to create the vector operation in the
6715 epilog code (that reduces the partial results) is not the
6716 tree-code of STMT, but is rather the tree-code of the original
6717 stmt from the pattern that STMT is replacing. I.e, in the example
6718 above we want to use 'widen_sum' in the loop, but 'plus' in the
6719 epilog.
6720 2. The type (mode) we use to check available target support
6721 for the vector operation to be created in the *epilog*, is
6722 determined by the type of the reduction variable (in the example
6723 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6724 However the type (mode) we use to check available target support
6725 for the vector operation to be created *inside the loop*, is
6726 determined by the type of the other arguments to STMT (in the
6727 example we'd check this: optab_handler (widen_sum_optab,
6728 vect_short_mode)).
6729
6730 This is contrary to "regular" reductions, in which the types of all
6731 the arguments are the same as the type of the reduction variable.
6732 For "regular" reductions we can therefore use the same vector type
6733 (and also the same tree-code) when generating the epilog code and
6734 when generating the code inside the loop. */
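   As an illustrative sketch only (actual code generation is target
   dependent), the widen_sum example above roughly becomes

     loop (short element mode):  vec_acc = WIDEN_SUM <vec_short_a, vec_acc>
     epilog (int element mode):  int_res = REDUC_PLUS <vec_acc>

   i.e. the loop uses the pattern tree-code while the epilog reduces
   with the original PLUS.  */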
6735
6736 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6737 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6738
6739 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6740 if (reduction_type == TREE_CODE_REDUCTION)
6741 {
6742 /* Check whether it's ok to change the order of the computation.
6743 Generally, when vectorizing a reduction we change the order of the
6744 computation. This may change the behavior of the program in some
6745 cases, so we need to check that this is ok. One exception is when
6746 vectorizing an outer-loop: the inner-loop is executed sequentially,
6747 and therefore vectorizing reductions in the inner-loop during
6748 outer-loop vectorization is safe. */
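  /* For example (illustrative): a float accumulation like

       for (i = 0; i < n; i++)
         s += a[i];

     compiled without -ffast-math must preserve the original evaluation
     order, so it can only be vectorized as an in-order
     (FOLD_LEFT_REDUCTION) sequence; reassociating the additions could
     change the rounded result.  */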
6749 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6750 {
6751 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6752 is not directly used in stmt. */
6753 if (!only_slp_reduc_chain
6754 && reduc_chain_length != 1)
6755 {
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 "in-order reduction chain without SLP.\n");
6759 return false;
6760 }
6761 STMT_VINFO_REDUC_TYPE (reduc_info)
6762 = reduction_type = FOLD_LEFT_REDUCTION;
6763 }
6764 else if (!commutative_tree_code (orig_code)
6765 || !associative_tree_code (orig_code))
6766 {
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "reduction: not commutative/associative");
6770 return false;
6771 }
6772 }
6773
6774 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6775 && ncopies > 1)
6776 {
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "multiple types in double reduction or condition "
6780 "reduction or fold-left reduction.\n");
6781 return false;
6782 }
6783
6784 internal_fn reduc_fn = IFN_LAST;
6785 if (reduction_type == TREE_CODE_REDUCTION
6786 || reduction_type == FOLD_LEFT_REDUCTION
6787 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6788 || reduction_type == CONST_COND_REDUCTION)
6789 {
6790 if (reduction_type == FOLD_LEFT_REDUCTION
6791 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6792 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6793 {
6794 if (reduc_fn != IFN_LAST
6795 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6796 OPTIMIZE_FOR_SPEED))
6797 {
6798 if (dump_enabled_p ())
6799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6800 "reduc op not supported by target.\n");
6801
6802 reduc_fn = IFN_LAST;
6803 }
6804 }
6805 else
6806 {
6807 if (!nested_cycle || double_reduc)
6808 {
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "no reduc code for scalar code.\n");
6812
6813 return false;
6814 }
6815 }
6816 }
6817 else if (reduction_type == COND_REDUCTION)
6818 {
6819 int scalar_precision
6820 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6821 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6822 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6823 nunits_out);
6824
6825 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6826 OPTIMIZE_FOR_SPEED))
6827 reduc_fn = IFN_REDUC_MAX;
6828 }
6829 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6830
6831 if (reduction_type != EXTRACT_LAST_REDUCTION
6832 && (!nested_cycle || double_reduc)
6833 && reduc_fn == IFN_LAST
6834 && !nunits_out.is_constant ())
6835 {
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838 "missing target support for reduction on"
6839 " variable-length vectors.\n");
6840 return false;
6841 }
6842
6843 /* For SLP reductions, see if there is a neutral value we can use. */
6844 tree neutral_op = NULL_TREE;
6845 if (slp_node)
6846 neutral_op = neutral_op_for_slp_reduction
6847 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6848 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
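  /* The neutral value is the identity element of the reduction operation,
     e.g. 0 for addition, 1 for multiplication and all-ones for bitwise
     AND (illustrative examples; see neutral_op_for_slp_reduction for the
     exact mapping).  */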
6849
6850 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6851 {
6852 /* We can't support in-order reductions of code such as this:
6853
6854 for (int i = 0; i < n1; ++i)
6855 for (int j = 0; j < n2; ++j)
6856 l += a[j];
6857
6858 since GCC effectively transforms the loop when vectorizing:
6859
6860 for (int i = 0; i < n1 / VF; ++i)
6861 for (int j = 0; j < n2; ++j)
6862 for (int k = 0; k < VF; ++k)
6863 l += a[j];
6864
6865 which is a reassociation of the original operation. */
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "in-order double reduction not supported.\n");
6869
6870 return false;
6871 }
6872
6873 if (reduction_type == FOLD_LEFT_REDUCTION
6874 && slp_node
6875 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6876 {
6877 /* We cannot use in-order reductions in this case because there is
6878 an implicit reassociation of the operations involved. */
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "in-order unchained SLP reductions not supported.\n");
6882 return false;
6883 }
6884
6885 /* For double reductions, and for SLP reductions with a neutral value,
6886 we construct a variable-length initial vector by loading a vector
6887 full of the neutral value and then shift-and-inserting the start
6888 values into the low-numbered elements. */
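  /* Illustrative sketch for an add reduction with start value 'init':

       vec_init = { 0, 0, ..., 0 }                  (splat of the neutral value)
       vec_init = VEC_SHL_INSERT <vec_init, init>   ('init' lands in a low lane)

     so a single lane carries the start value and every other lane
     contributes the neutral element.  */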
6889 if ((double_reduc || neutral_op)
6890 && !nunits_out.is_constant ()
6891 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6892 vectype_out, OPTIMIZE_FOR_SPEED))
6893 {
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 "reduction on variable-length vectors requires"
6897 " target support for a vector-shift-and-insert"
6898 " operation.\n");
6899 return false;
6900 }
6901
6902 /* Check extra constraints for variable-length unchained SLP reductions. */
6903 if (STMT_SLP_TYPE (stmt_info)
6904 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6905 && !nunits_out.is_constant ())
6906 {
6907 /* We checked above that we could build the initial vector when
6908 there's a neutral element value. Check here for the case in
6909 which each SLP statement has its own initial value and in which
6910 that value needs to be repeated for every instance of the
6911 statement within the initial vector. */
6912 unsigned int group_size = SLP_TREE_LANES (slp_node);
6913 if (!neutral_op
6914 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6915 TREE_TYPE (vectype_out)))
6916 {
6917 if (dump_enabled_p ())
6918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6919 "unsupported form of SLP reduction for"
6920 " variable-length vectors: cannot build"
6921 " initial vector.\n");
6922 return false;
6923 }
6924 /* The epilogue code relies on the number of elements being a multiple
6925 of the group size. The duplicate-and-interleave approach to setting
6926 up the initial vector does too. */
6927 if (!multiple_p (nunits_out, group_size))
6928 {
6929 if (dump_enabled_p ())
6930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6931 "unsupported form of SLP reduction for"
6932 " variable-length vectors: the vector size"
6933 " is not a multiple of the number of results.\n");
6934 return false;
6935 }
6936 }
6937
6938 if (reduction_type == COND_REDUCTION)
6939 {
6940 widest_int ni;
6941
6942 if (! max_loop_iterations (loop, &ni))
6943 {
6944 if (dump_enabled_p ())
6945 dump_printf_loc (MSG_NOTE, vect_location,
6946 "loop count not known, cannot create cond "
6947 "reduction.\n");
6948 return false;
6949 }
6950 /* Convert backedges to iterations. */
6951 ni += 1;
6952
6953 /* The additional index will be the same type as the condition. Check
6954 that the loop iteration count fits into this type less one (the
6955 zero slot is reserved for the case of no matches). */
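  /* E.g. (illustrative) for a 16-bit reduction type the unsigned index
     type holds values up to 65535, so loops with an iteration count of
     65535 or more are rejected below.  */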
6956 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6957 if (wi::geu_p (ni, wi::to_widest (max_index)))
6958 {
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_NOTE, vect_location,
6961 "loop size is greater than data size.\n");
6962 return false;
6963 }
6964 }
6965
6966 /* In case the vectorization factor (VF) is bigger than the number
6967 of elements that we can fit in a vectype (nunits), we have to generate
6968 more than one vector stmt, i.e. we need to "unroll" the
6969 vector stmt by a factor VF/nunits. For more details see documentation
6970 in vectorizable_operation. */
6971
6972 /* If the reduction is used in an outer loop we need to generate
6973 VF intermediate results, like so (e.g. for ncopies=2):
6974 r0 = phi (init, r0)
6975 r1 = phi (init, r1)
6976 r0 = x0 + r0;
6977 r1 = x1 + r1;
6978 (i.e. we generate VF results in 2 registers).
6979 In this case we have a separate def-use cycle for each copy, and therefore
6980 for each copy we get the vector def for the reduction variable from the
6981 respective phi node created for this copy.
6982
6983 Otherwise (the reduction is unused in the loop nest), we can combine
6984 together intermediate results, like so (e.g. for ncopies=2):
6985 r = phi (init, r)
6986 r = x0 + r;
6987 r = x1 + r;
6988 (i.e. we generate VF/2 results in a single register).
6989 In this case for each copy we get the vector def for the reduction variable
6990 from the vectorized reduction operation generated in the previous iteration.
6991
6992 This only works when we see both the reduction PHI and its only consumer
6993 in vectorizable_reduction and there are no intermediate stmts
6994 participating. */
6995 if (ncopies > 1
6996 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6997 && reduc_chain_length == 1)
6998 single_defuse_cycle = true;
6999
7000 if (single_defuse_cycle || lane_reduc_code_p)
7001 {
7002 gcc_assert (code != COND_EXPR);
7003
7004 /* 4. Supportable by target? */
7005 bool ok = true;
7006
7007 /* 4.1. check support for the operation in the loop */
7008 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7009 if (!optab)
7010 {
7011 if (dump_enabled_p ())
7012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7013 "no optab.\n");
7014 ok = false;
7015 }
7016
7017 machine_mode vec_mode = TYPE_MODE (vectype_in);
7018 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7019 {
7020 if (dump_enabled_p ())
7021 dump_printf (MSG_NOTE, "op not supported by target.\n");
7022 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7023 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7024 ok = false;
7025 else
7026 if (dump_enabled_p ())
7027 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7028 }
7029
7030 /* Worthwhile without SIMD support? */
7031 if (ok
7032 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7033 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7034 {
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7037 "not worthwhile without SIMD support.\n");
7038 ok = false;
7039 }
7040
7041 /* lane-reducing operations have to go through vect_transform_reduction.
7042 For the other cases try without the single cycle optimization. */
7043 if (!ok)
7044 {
7045 if (lane_reduc_code_p)
7046 return false;
7047 else
7048 single_defuse_cycle = false;
7049 }
7050 }
7051 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7052
7053 /* If the reduction stmt is one of the patterns that have lane
7054 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7055 if ((ncopies > 1 && ! single_defuse_cycle)
7056 && lane_reduc_code_p)
7057 {
7058 if (dump_enabled_p ())
7059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7060 "multi def-use cycle not possible for lane-reducing "
7061 "reduction operation\n");
7062 return false;
7063 }
7064
7065 if (slp_node
7066 && !(!single_defuse_cycle
7067 && code != DOT_PROD_EXPR
7068 && code != WIDEN_SUM_EXPR
7069 && code != SAD_EXPR
7070 && reduction_type != FOLD_LEFT_REDUCTION))
7071 for (i = 0; i < op_type; i++)
7072 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7073 {
7074 if (dump_enabled_p ())
7075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7076 "incompatible vector types for invariants\n");
7077 return false;
7078 }
7079
7080 if (slp_node)
7081 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7082 else
7083 vec_num = 1;
7084
7085 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7086 reduction_type, ncopies, cost_vec);
7087 if (dump_enabled_p ()
7088 && reduction_type == FOLD_LEFT_REDUCTION)
7089 dump_printf_loc (MSG_NOTE, vect_location,
7090 "using an in-order (fold-left) reduction.\n");
7091 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7092 /* All reductions except single defuse-cycle optimized, lane-reducing and
7093 fold-left ones go through their own vectorizable_* routines. */
7094 if (!single_defuse_cycle
7095 && code != DOT_PROD_EXPR
7096 && code != WIDEN_SUM_EXPR
7097 && code != SAD_EXPR
7098 && reduction_type != FOLD_LEFT_REDUCTION)
7099 {
7100 stmt_vec_info tem
7101 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7102 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7103 {
7104 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7105 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7106 }
7107 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7108 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7109 }
7110 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7111 {
7112 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7113 internal_fn cond_fn = get_conditional_internal_fn (code);
7114
7115 if (reduction_type != FOLD_LEFT_REDUCTION
7116 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7117 && (cond_fn == IFN_LAST
7118 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7119 OPTIMIZE_FOR_SPEED)))
7120 {
7121 if (dump_enabled_p ())
7122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7123 "can't operate on partial vectors because"
7124 " no conditional operation is available.\n");
7125 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7126 }
7127 else if (reduction_type == FOLD_LEFT_REDUCTION
7128 && reduc_fn == IFN_LAST
7129 && !expand_vec_cond_expr_p (vectype_in,
7130 truth_type_for (vectype_in),
7131 SSA_NAME))
7132 {
7133 if (dump_enabled_p ())
7134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7135 "can't operate on partial vectors because"
7136 " no conditional operation is available.\n");
7137 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7138 }
7139 else
7140 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7141 vectype_in, NULL);
7142 }
7143 return true;
7144 }
7145
7146 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7147 value. */
7148
7149 bool
7150 vect_transform_reduction (loop_vec_info loop_vinfo,
7151 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7152 gimple **vec_stmt, slp_tree slp_node)
7153 {
7154 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7155 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7156 int i;
7157 int ncopies;
7158 int vec_num;
7159
7160 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7161 gcc_assert (reduc_info->is_reduc_info);
7162
7163 if (nested_in_vect_loop_p (loop, stmt_info))
7164 {
7165 loop = loop->inner;
7166 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7167 }
7168
7169 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7170 enum tree_code code = gimple_assign_rhs_code (stmt);
7171 int op_type = TREE_CODE_LENGTH (code);
7172
7173 /* Flatten RHS. */
7174 tree ops[3];
7175 switch (get_gimple_rhs_class (code))
7176 {
7177 case GIMPLE_TERNARY_RHS:
7178 ops[2] = gimple_assign_rhs3 (stmt);
7179 /* Fall thru. */
7180 case GIMPLE_BINARY_RHS:
7181 ops[0] = gimple_assign_rhs1 (stmt);
7182 ops[1] = gimple_assign_rhs2 (stmt);
7183 break;
7184 default:
7185 gcc_unreachable ();
7186 }
7187
7188 /* All uses but the last are expected to be defined in the loop.
7189 The last use is the reduction variable. In case of nested cycle this
7190 assumption is not true: we use reduc_index to record the index of the
7191 reduction variable. */
7192 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7193 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7194 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7195 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7196
7197 if (slp_node)
7198 {
7199 ncopies = 1;
7200 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7201 }
7202 else
7203 {
7204 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7205 vec_num = 1;
7206 }
7207
7208 internal_fn cond_fn = get_conditional_internal_fn (code);
7209 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7210 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
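  /* Illustrative sketch of the two masking strategies used below for a
     fully-masked loop:

       via conditional IFN (e.g. a PLUS_EXPR reduction):
         acc = IFN_COND_ADD (mask, acc, x, acc)
       via VEC_COND_EXPR (e.g. DOT_PROD_EXPR or SAD_EXPR):
         x' = mask ? x : 0;  acc = DOT_PROD <x', y, acc>

     in both cases masked-off lanes leave the accumulator unchanged.  */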
7211
7212 /* Transform. */
7213 tree new_temp = NULL_TREE;
7214 auto_vec<tree> vec_oprnds0;
7215 auto_vec<tree> vec_oprnds1;
7216 auto_vec<tree> vec_oprnds2;
7217 tree def0;
7218
7219 if (dump_enabled_p ())
7220 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7221
7222 /* FORNOW: Multiple types are not supported for condition. */
7223 if (code == COND_EXPR)
7224 gcc_assert (ncopies == 1);
7225
7226 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7227
7228 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7229 if (reduction_type == FOLD_LEFT_REDUCTION)
7230 {
7231 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7232 return vectorize_fold_left_reduction
7233 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7234 reduc_fn, ops, vectype_in, reduc_index, masks);
7235 }
7236
7237 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7238 gcc_assert (single_defuse_cycle
7239 || code == DOT_PROD_EXPR
7240 || code == WIDEN_SUM_EXPR
7241 || code == SAD_EXPR);
7242
7243 /* Create the destination vector */
7244 tree scalar_dest = gimple_assign_lhs (stmt);
7245 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7246
7247 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7248 single_defuse_cycle && reduc_index == 0
7249 ? NULL_TREE : ops[0], &vec_oprnds0,
7250 single_defuse_cycle && reduc_index == 1
7251 ? NULL_TREE : ops[1], &vec_oprnds1,
7252 op_type == ternary_op
7253 && !(single_defuse_cycle && reduc_index == 2)
7254 ? ops[2] : NULL_TREE, &vec_oprnds2);
7255 if (single_defuse_cycle)
7256 {
7257 gcc_assert (!slp_node);
7258 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7259 ops[reduc_index],
7260 reduc_index == 0 ? &vec_oprnds0
7261 : (reduc_index == 1 ? &vec_oprnds1
7262 : &vec_oprnds2));
7263 }
7264
7265 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7266 {
7267 gimple *new_stmt;
7268 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7269 if (masked_loop_p && !mask_by_cond_expr)
7270 {
7271 /* Make sure that the reduction accumulator is vop[0]. */
7272 if (reduc_index == 1)
7273 {
7274 gcc_assert (commutative_tree_code (code));
7275 std::swap (vop[0], vop[1]);
7276 }
7277 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7278 vectype_in, i);
7279 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7280 vop[0], vop[1], vop[0]);
7281 new_temp = make_ssa_name (vec_dest, call);
7282 gimple_call_set_lhs (call, new_temp);
7283 gimple_call_set_nothrow (call, true);
7284 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7285 new_stmt = call;
7286 }
7287 else
7288 {
7289 if (op_type == ternary_op)
7290 vop[2] = vec_oprnds2[i];
7291
7292 if (masked_loop_p && mask_by_cond_expr)
7293 {
7294 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7295 vectype_in, i);
7296 build_vect_cond_expr (code, vop, mask, gsi);
7297 }
7298
7299 new_stmt = gimple_build_assign (vec_dest, code,
7300 vop[0], vop[1], vop[2]);
7301 new_temp = make_ssa_name (vec_dest, new_stmt);
7302 gimple_assign_set_lhs (new_stmt, new_temp);
7303 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7304 }
7305
7306 if (slp_node)
7307 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7308 else if (single_defuse_cycle
7309 && i < ncopies - 1)
7310 {
7311 if (reduc_index == 0)
7312 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7313 else if (reduc_index == 1)
7314 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7315 else if (reduc_index == 2)
7316 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7317 }
7318 else
7319 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7320 }
7321
7322 if (!slp_node)
7323 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7324
7325 return true;
7326 }
7327
7328 /* Transform phase of a cycle PHI. */
7329
7330 bool
7331 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7332 stmt_vec_info stmt_info, gimple **vec_stmt,
7333 slp_tree slp_node, slp_instance slp_node_instance)
7334 {
7335 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7336 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7337 int i;
7338 int ncopies;
7339 int j;
7340 bool nested_cycle = false;
7341 int vec_num;
7342
7343 if (nested_in_vect_loop_p (loop, stmt_info))
7344 {
7345 loop = loop->inner;
7346 nested_cycle = true;
7347 }
7348
7349 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7350 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7351 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7352 gcc_assert (reduc_info->is_reduc_info);
7353
7354 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7355 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7356 /* Leave the scalar phi in place. */
7357 return true;
7358
7359 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7360 /* For a nested cycle vectype_in is not filled in above; use the PHI's vectype. */
7361 if (!vectype_in)
7362 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7363 gcc_assert (vectype_in);
7364
7365 if (slp_node)
7366 {
7367 /* The size vect_schedule_slp_instance computes is off for us. */
7368 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7369 * SLP_TREE_LANES (slp_node), vectype_in);
7370 ncopies = 1;
7371 }
7372 else
7373 {
7374 vec_num = 1;
7375 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7376 }
7377
7378 /* Check whether we should use a single PHI node and accumulate
7379 vectors to one before the backedge. */
7380 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7381 ncopies = 1;
7382
7383 /* Create the destination vector */
7384 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7385 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7386 vectype_out);
7387
7388 /* Get the loop-entry arguments. */
7389 tree vec_initial_def;
7390 auto_vec<tree> vec_initial_defs;
7391 if (slp_node)
7392 {
7393 vec_initial_defs.reserve (vec_num);
7394 if (nested_cycle)
7395 {
7396 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7397 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7398 &vec_initial_defs);
7399 }
7400 else
7401 {
7402 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7403 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7404 tree neutral_op
7405 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7406 STMT_VINFO_REDUC_CODE (reduc_info),
7407 first != NULL);
7408 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7409 &vec_initial_defs, vec_num,
7410 first != NULL, neutral_op);
7411 }
7412 }
7413 else
7414 {
7415 /* Get at the scalar def before the loop, that defines the initial
7416 value of the reduction variable. */
7417 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7418 loop_preheader_edge (loop));
7419 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7420 and we can't use zero for induc_val, use initial_def. Similarly
7421 for REDUC_MIN and initial_def larger than the base. */
7422 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7423 {
7424 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7425 if (TREE_CODE (initial_def) == INTEGER_CST
7426 && !integer_zerop (induc_val)
7427 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7428 && tree_int_cst_lt (initial_def, induc_val))
7429 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7430 && tree_int_cst_lt (induc_val, initial_def))))
7431 {
7432 induc_val = initial_def;
7433 /* Communicate we used the initial_def to epilogue
7434 generation. */
7435 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7436 }
7437 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7438 vec_initial_defs.create (ncopies);
7439 for (i = 0; i < ncopies; ++i)
7440 vec_initial_defs.quick_push (vec_initial_def);
7441 }
7442 else if (nested_cycle)
7443 {
7444 /* Do not use an adjustment def as that case is not supported
7445 correctly if ncopies is not one. */
7446 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7447 ncopies, initial_def,
7448 &vec_initial_defs);
7449 }
7450 else
7451 {
7452 tree adjustment_def = NULL_TREE;
7453 tree *adjustment_defp = &adjustment_def;
7454 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7455 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7456 adjustment_defp = NULL;
7457 vec_initial_def
7458 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7459 initial_def, adjustment_defp);
7460 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7461 vec_initial_defs.create (ncopies);
7462 for (i = 0; i < ncopies; ++i)
7463 vec_initial_defs.quick_push (vec_initial_def);
7464 }
7465 }
7466
7467 /* Generate the reduction PHIs upfront. */
7468 for (i = 0; i < vec_num; i++)
7469 {
7470 tree vec_init_def = vec_initial_defs[i];
7471 for (j = 0; j < ncopies; j++)
7472 {
7473 /* Create the reduction-phi that defines the reduction
7474 operand. */
7475 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7476
7477 /* Set the loop-entry arg of the reduction-phi. */
7478 if (j != 0 && nested_cycle)
7479 vec_init_def = vec_initial_defs[j];
7480 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7481 UNKNOWN_LOCATION);
7482
7483 /* The loop-latch arg is set in epilogue processing. */
7484
7485 if (slp_node)
7486 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7487 else
7488 {
7489 if (j == 0)
7490 *vec_stmt = new_phi;
7491 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7492 }
7493 }
7494 }
7495
7496 return true;
7497 }
7498
7499 /* Vectorizes LC PHIs. */
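/* A loop-closed (LC) PHI is a single-argument PHI in a block just
   outside the loop, e.g. (illustrative GIMPLE)

     x_4 = PHI <x_3(loop-exit-edge)>

   Vectorizing it amounts to creating a matching single-argument vector
   PHI for each vector def of its argument.  */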
7500
7501 bool
7502 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7503 stmt_vec_info stmt_info, gimple **vec_stmt,
7504 slp_tree slp_node)
7505 {
7506 if (!loop_vinfo
7507 || !is_a <gphi *> (stmt_info->stmt)
7508 || gimple_phi_num_args (stmt_info->stmt) != 1)
7509 return false;
7510
7511 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7512 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7513 return false;
7514
7515 if (!vec_stmt) /* transformation not required. */
7516 {
7517 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7518 return true;
7519 }
7520
7521 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7522 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7523 basic_block bb = gimple_bb (stmt_info->stmt);
7524 edge e = single_pred_edge (bb);
7525 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7526 auto_vec<tree> vec_oprnds;
7527 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7528 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7529 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7530 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7531 {
7532 /* Create the vectorized LC PHI node. */
7533 gphi *new_phi = create_phi_node (vec_dest, bb);
7534 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7535 if (slp_node)
7536 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7537 else
7538 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7539 }
7540 if (!slp_node)
7541 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7542
7543 return true;
7544 }
7545
7546 /* Vectorizes PHIs. */
7547
7548 bool
7549 vectorizable_phi (vec_info *,
7550 stmt_vec_info stmt_info, gimple **vec_stmt,
7551 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7552 {
7553 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7554 return false;
7555
7556 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7557 return false;
7558
7559 tree vectype = SLP_TREE_VECTYPE (slp_node);
7560
7561 if (!vec_stmt) /* transformation not required. */
7562 {
7563 slp_tree child;
7564 unsigned i;
7565 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7566 if (!child)
7567 {
7568 if (dump_enabled_p ())
7569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570 "PHI node with unvectorized backedge def\n");
7571 return false;
7572 }
7573 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7574 {
7575 if (dump_enabled_p ())
7576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7577 "incompatible vector types for invariants\n");
7578 return false;
7579 }
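      /* Cost the PHI as one vector_stmt per emitted vector copy in the
         loop body.  */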
7580 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7581 vector_stmt, stmt_info, vectype, 0, vect_body);
7582 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7583 return true;
7584 }
7585
7586 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7587 basic_block bb = gimple_bb (stmt_info->stmt);
7588 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7589 auto_vec<gphi *> new_phis;
7590 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7591 {
7592 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7593
7594 /* Skip not yet vectorized defs. */
7595 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7596 && SLP_TREE_VEC_STMTS (child).is_empty ())
7597 continue;
7598
7599 auto_vec<tree> vec_oprnds;
7600 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7601 if (!new_phis.exists ())
7602 {
7603 new_phis.create (vec_oprnds.length ());
7604 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7605 {
7606 /* Create the vectorized PHI node. */
7607 new_phis.quick_push (create_phi_node (vec_dest, bb));
7608 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7609 }
7610 }
7611 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7612 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7613 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7614 }
7615 /* We should have at least one already vectorized child. */
7616 gcc_assert (new_phis.exists ());
7617
7618 return true;
7619 }
7620
7621
7622 /* Function vect_min_worthwhile_factor.
7623
7624 For a loop where we could vectorize the operation indicated by CODE,
7625 return the minimum vectorization factor that makes it worthwhile
7626 to use generic vectors. */
7627 static unsigned int
7628 vect_min_worthwhile_factor (enum tree_code code)
7629 {
7630 switch (code)
7631 {
7632 case PLUS_EXPR:
7633 case MINUS_EXPR:
7634 case NEGATE_EXPR:
7635 return 4;
7636
7637 case BIT_AND_EXPR:
7638 case BIT_IOR_EXPR:
7639 case BIT_XOR_EXPR:
7640 case BIT_NOT_EXPR:
7641 return 2;
7642
7643 default:
7644 return INT_MAX;
7645 }
7646 }
7647
7648 /* Return true if VINFO indicates we are doing loop vectorization and if
7649 it is worth decomposing CODE operations into scalar operations for
7650 that loop's vectorization factor. */
7651
7652 bool
7653 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7654 {
7655 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7656 unsigned HOST_WIDE_INT value;
7657 return (loop_vinfo
7658 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7659 && value >= vect_min_worthwhile_factor (code));
7660 }
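/* For example (illustrative): with a constant vectorization factor of 4
   a PLUS_EXPR is considered worth decomposing into word-mode scalar
   operations even without real SIMD support, while bitwise operations
   already pay off at a factor of 2.  */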
7661
7662 /* Function vectorizable_induction
7663
7664 Check if STMT_INFO performs an induction computation that can be vectorized.
7665 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7666 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7667 Return true if STMT_INFO is vectorizable in this way. */
7668
7669 bool
7670 vectorizable_induction (loop_vec_info loop_vinfo,
7671 stmt_vec_info stmt_info,
7672 gimple **vec_stmt, slp_tree slp_node,
7673 stmt_vector_for_cost *cost_vec)
7674 {
7675 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7676 unsigned ncopies;
7677 bool nested_in_vect_loop = false;
7678 class loop *iv_loop;
7679 tree vec_def;
7680 edge pe = loop_preheader_edge (loop);
7681 basic_block new_bb;
7682 tree new_vec, vec_init, vec_step, t;
7683 tree new_name;
7684 gimple *new_stmt;
7685 gphi *induction_phi;
7686 tree induc_def, vec_dest;
7687 tree init_expr, step_expr;
7688 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7689 unsigned i;
7690 tree expr;
7691 gimple_stmt_iterator si;
7692
7693 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7694 if (!phi)
7695 return false;
7696
7697 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7698 return false;
7699
7700 /* Make sure it was recognized as induction computation. */
7701 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7702 return false;
7703
7704 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7705 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7706
7707 if (slp_node)
7708 ncopies = 1;
7709 else
7710 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7711 gcc_assert (ncopies >= 1);
7712
7713 /* FORNOW. These restrictions should be relaxed. */
7714 if (nested_in_vect_loop_p (loop, stmt_info))
7715 {
7716 imm_use_iterator imm_iter;
7717 use_operand_p use_p;
7718 gimple *exit_phi;
7719 edge latch_e;
7720 tree loop_arg;
7721
7722 if (ncopies > 1)
7723 {
7724 if (dump_enabled_p ())
7725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7726 "multiple types in nested loop.\n");
7727 return false;
7728 }
7729
7730 exit_phi = NULL;
7731 latch_e = loop_latch_edge (loop->inner);
7732 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7733 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7734 {
7735 gimple *use_stmt = USE_STMT (use_p);
7736 if (is_gimple_debug (use_stmt))
7737 continue;
7738
7739 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7740 {
7741 exit_phi = use_stmt;
7742 break;
7743 }
7744 }
7745 if (exit_phi)
7746 {
7747 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7748 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7749 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7750 {
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7753 "inner-loop induction only used outside "
7754 "of the outer vectorized loop.\n");
7755 return false;
7756 }
7757 }
7758
7759 nested_in_vect_loop = true;
7760 iv_loop = loop->inner;
7761 }
7762 else
7763 iv_loop = loop;
7764 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7765
7766 if (slp_node && !nunits.is_constant ())
7767 {
7768 /* The current SLP code creates the step value element-by-element. */
7769 if (dump_enabled_p ())
7770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7771 "SLP induction not supported for variable-length"
7772 " vectors.\n");
7773 return false;
7774 }
7775
7776 if (!vec_stmt) /* transformation not required. */
7777 {
7778 unsigned inside_cost = 0, prologue_cost = 0;
7779 if (slp_node)
7780 {
7781 /* We eventually need to set a vector type on invariant
7782 arguments. */
7783 unsigned j;
7784 slp_tree child;
7785 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7786 if (!vect_maybe_update_slp_op_vectype
7787 (child, SLP_TREE_VECTYPE (slp_node)))
7788 {
7789 if (dump_enabled_p ())
7790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7791 "incompatible vector types for "
7792 "invariants\n");
7793 return false;
7794 }
7795 /* loop cost for vec_loop. */
7796 inside_cost
7797 = record_stmt_cost (cost_vec,
7798 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7799 vector_stmt, stmt_info, 0, vect_body);
7800 /* prologue cost for vec_init (if not nested) and step. */
7801 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
7802 scalar_to_vec,
7803 stmt_info, 0, vect_prologue);
7804 }
7805 else /* if (!slp_node) */
7806 {
7807 /* loop cost for vec_loop. */
7808 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
7809 stmt_info, 0, vect_body);
7810 /* prologue cost for vec_init and vec_step. */
7811 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
7812 stmt_info, 0, vect_prologue);
7813 }
7814 if (dump_enabled_p ())
7815 dump_printf_loc (MSG_NOTE, vect_location,
7816 "vect_model_induction_cost: inside_cost = %d, "
7817 "prologue_cost = %d .\n", inside_cost,
7818 prologue_cost);
7819
7820 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7821 DUMP_VECT_SCOPE ("vectorizable_induction");
7822 return true;
7823 }
7824
7825 /* Transform. */
7826
7827 /* Compute a vector variable, initialized with the first VF values of
7828 the induction variable. E.g., for an iv with IV_PHI='X' and
7829 evolution S, for a vector of 4 units, we want to compute:
7830 [X, X + S, X + 2*S, X + 3*S]. */
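  /* Minimal illustrative example: for an IV with start value X and
     step S = 3, a VF of 4 gives the initial vector
       [X, X + 3, X + 6, X + 9]
     and a loop-body update that adds [12, 12, 12, 12] (i.e. VF * S)
     each vector iteration.  */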
7831
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7834
7835 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7836 gcc_assert (step_expr != NULL_TREE);
7837 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7838
7839 pe = loop_preheader_edge (iv_loop);
7840 /* Find the first insertion point in the BB. */
7841 basic_block bb = gimple_bb (phi);
7842 si = gsi_after_labels (bb);
7843
7844 /* For SLP induction we have to generate several IVs as for example
7845 with group size 3 we need
7846 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
7847 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
7848 if (slp_node)
7849 {
7850 /* Enforced above. */
7851 unsigned int const_nunits = nunits.to_constant ();
7852
7853 /* The initial values are vectorized, but any lanes > group_size
7854 need adjustment. */
7855 slp_tree init_node
7856 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
7857
7858 /* Gather steps. Since we do not vectorize inductions as
7859 cycles we have to reconstruct the step from SCEV data. */
7860 unsigned group_size = SLP_TREE_LANES (slp_node);
7861 tree *steps = XALLOCAVEC (tree, group_size);
7862 tree *inits = XALLOCAVEC (tree, group_size);
7863 stmt_vec_info phi_info;
7864 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
7865 {
7866 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
7867 if (!init_node)
7868 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
7869 pe->dest_idx);
7870 }
7871
7872 /* Now generate the IVs. */
7873 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7874 gcc_assert ((const_nunits * nvects) % group_size == 0);
7875 unsigned nivs;
7876 if (nested_in_vect_loop)
7877 nivs = nvects;
7878 else
7879 {
7880 /* Compute the number of distinct IVs we need. First reduce
7881 group_size if it is a multiple of const_nunits so we get
7882 one IV for a group_size of 4 but const_nunits 2. */
7883 unsigned group_sizep = group_size;
7884 if (group_sizep % const_nunits == 0)
7885 group_sizep = group_sizep / const_nunits;
7886 nivs = least_common_multiple (group_sizep,
7887 const_nunits) / const_nunits;
7888 }
7889 tree stept = TREE_TYPE (step_vectype);
7890 tree lupdate_mul = NULL_TREE;
7891 if (!nested_in_vect_loop)
7892 {
7893 /* The number of iterations covered in one vector iteration. */
7894 unsigned lup_mul = (nvects * const_nunits) / group_size;
7895 lupdate_mul
7896 = build_vector_from_val (step_vectype,
7897 SCALAR_FLOAT_TYPE_P (stept)
7898 ? build_real_from_wide (stept, lup_mul,
7899 UNSIGNED)
7900 : build_int_cstu (stept, lup_mul));
7901 }
7902 tree peel_mul = NULL_TREE;
7903 gimple_seq init_stmts = NULL;
7904 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
7905 {
7906 if (SCALAR_FLOAT_TYPE_P (stept))
7907 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
7908 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7909 else
7910 peel_mul = gimple_convert (&init_stmts, stept,
7911 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7912 peel_mul = gimple_build_vector_from_val (&init_stmts,
7913 step_vectype, peel_mul);
7914 }
7915 unsigned ivn;
7916 auto_vec<tree> vec_steps;
7917 for (ivn = 0; ivn < nivs; ++ivn)
7918 {
7919 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
7920 tree_vector_builder init_elts (vectype, const_nunits, 1);
7921 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
7922 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7923 {
7924 /* The scalar steps of the IVs. */
7925 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
7926 step_elts.quick_push (elt);
7927 if (!init_node)
7928 {
7929 /* The scalar inits of the IVs if not vectorized. */
7930 elt = inits[(ivn*const_nunits + eltn) % group_size];
7931 init_elts.quick_push (elt);
7932 }
7933 /* The number of steps to add to the initial values. */
7934 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
7935 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
7936 ? build_real_from_wide (stept,
7937 mul_elt, UNSIGNED)
7938 : build_int_cstu (stept, mul_elt));
7939 }
7940 vec_step = gimple_build_vector (&init_stmts, &step_elts);
7941 vec_step = gimple_convert (&init_stmts, step_vectype, vec_step);
7942 vec_steps.safe_push (vec_step);
7943 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
7944 if (peel_mul)
7945 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
7946 step_mul, peel_mul);
7947 if (!init_node)
7948 vec_init = gimple_build_vector (&init_stmts, &init_elts);
7949
7950 /* Create the induction-phi that defines the induction-operand. */
7951 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
7952 "vec_iv_");
7953 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7954 induc_def = PHI_RESULT (induction_phi);
7955
7956 /* Create the iv update inside the loop */
7957 tree up = vec_step;
7958 if (lupdate_mul)
7959 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
7960 vec_step, lupdate_mul);
7961 gimple_seq stmts = NULL;
7962 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7963 vec_def = gimple_build (&stmts,
7964 PLUS_EXPR, step_vectype, vec_def, up);
7965 vec_def = gimple_convert (&stmts, vectype, vec_def);
7966 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7967 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7968 UNKNOWN_LOCATION);
7969
7970 if (init_node)
7971 vec_init = vect_get_slp_vect_def (init_node, ivn);
7972 if (!nested_in_vect_loop
7973 && !integer_zerop (step_mul))
7974 {
7975 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
7976 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
7977 vec_step, step_mul);
7978 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
7979 vec_def, up);
7980 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
7981 }
7982
7983 /* Set the arguments of the phi node: */
7984 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7985
7986 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7987 }
7988 if (!nested_in_vect_loop)
7989 {
7990 /* Fill up to the number of vectors we need for the whole group. */
7991 nivs = least_common_multiple (group_size,
7992 const_nunits) / const_nunits;
7993 for (; ivn < nivs; ++ivn)
7994 SLP_TREE_VEC_STMTS (slp_node)
7995 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7996 }
7997
7998 /* Re-use IVs when we can. We are generating further vector
7999 stmts by adding VF' * stride to the IVs generated above. */
8000 if (ivn < nvects)
8001 {
8002 unsigned vfp
8003 = least_common_multiple (group_size, const_nunits) / group_size;
8004 tree lupdate_mul
8005 = build_vector_from_val (step_vectype,
8006 SCALAR_FLOAT_TYPE_P (stept)
8007 ? build_real_from_wide (stept,
8008 vfp, UNSIGNED)
8009 : build_int_cstu (stept, vfp));
8010 for (; ivn < nvects; ++ivn)
8011 {
8012 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8013 tree def = gimple_get_lhs (iv);
8014 if (ivn < 2*nivs)
8015 vec_steps[ivn - nivs]
8016 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8017 vec_steps[ivn - nivs], lupdate_mul);
8018 gimple_seq stmts = NULL;
8019 def = gimple_convert (&stmts, step_vectype, def);
8020 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8021 def, vec_steps[ivn % nivs]);
8022 def = gimple_convert (&stmts, vectype, def);
8023 if (gimple_code (iv) == GIMPLE_PHI)
8024 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8025 else
8026 {
8027 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8028 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8029 }
8030 SLP_TREE_VEC_STMTS (slp_node)
8031 .quick_push (SSA_NAME_DEF_STMT (def));
8032 }
8033 }
8034
8035 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8036 gcc_assert (!new_bb);
8037
8038 return true;
8039 }
8040
8041 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8042 loop_preheader_edge (iv_loop));
8043
8044 gimple_seq stmts = NULL;
8045 if (!nested_in_vect_loop)
8046 {
8047 /* Convert the initial value to the IV update type. */
8048 tree new_type = TREE_TYPE (step_expr);
8049 init_expr = gimple_convert (&stmts, new_type, init_expr);
8050
8051 /* If we are using the loop mask to "peel" for alignment then we need
8052 to adjust the start value here. */
8053 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8054 if (skip_niters != NULL_TREE)
8055 {
8056 if (FLOAT_TYPE_P (vectype))
8057 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8058 skip_niters);
8059 else
8060 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8061 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8062 skip_niters, step_expr);
8063 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8064 init_expr, skip_step);
8065 }
8066 }
8067
8068 if (stmts)
8069 {
8070 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8071 gcc_assert (!new_bb);
8072 }
8073
8074 /* Create the vector that holds the initial_value of the induction. */
8075 if (nested_in_vect_loop)
8076 {
8077 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8078 been created during vectorization of previous stmts. We obtain it
8079 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8080 auto_vec<tree> vec_inits;
8081 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8082 init_expr, &vec_inits);
8083 vec_init = vec_inits[0];
8084 /* If the initial value is not of proper type, convert it. */
8085 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8086 {
8087 new_stmt
8088 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8089 vect_simple_var,
8090 "vec_iv_"),
8091 VIEW_CONVERT_EXPR,
8092 build1 (VIEW_CONVERT_EXPR, vectype,
8093 vec_init));
8094 vec_init = gimple_assign_lhs (new_stmt);
8095 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8096 new_stmt);
8097 gcc_assert (!new_bb);
8098 }
8099 }
8100 else
8101 {
8102 /* iv_loop is the loop to be vectorized. Create:
8103 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8104 stmts = NULL;
8105 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8106
8107 unsigned HOST_WIDE_INT const_nunits;
8108 if (nunits.is_constant (&const_nunits))
8109 {
8110 tree_vector_builder elts (step_vectype, const_nunits, 1);
8111 elts.quick_push (new_name);
8112 for (i = 1; i < const_nunits; i++)
8113 {
8114 /* Create: new_name_i = new_name + step_expr */
8115 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8116 new_name, step_expr);
8117 elts.quick_push (new_name);
8118 }
8119 /* Create a vector from [new_name_0, new_name_1, ...,
8120 new_name_nunits-1] */
8121 vec_init = gimple_build_vector (&stmts, &elts);
8122 }
8123 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8124 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8125 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8126 new_name, step_expr);
8127 else
8128 {
8129 /* Build:
8130 [base, base, base, ...]
8131 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
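/* E.g. (purely illustrative) with base 1.0, step 0.5 and four elements this
   yields { 1.0, 1.0, 1.0, 1.0 } + { 0.0, 1.0, 2.0, 3.0 } * { 0.5, ... }
   == { 1.0, 1.5, 2.0, 2.5 }.  */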
8132 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8133 gcc_assert (flag_associative_math);
8134 tree index = build_index_vector (step_vectype, 0, 1);
8135 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8136 new_name);
8137 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8138 step_expr);
8139 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8140 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8141 vec_init, step_vec);
8142 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8143 vec_init, base_vec);
8144 }
8145 vec_init = gimple_convert (&stmts, vectype, vec_init);
8146
8147 if (stmts)
8148 {
8149 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8150 gcc_assert (!new_bb);
8151 }
8152 }
8153
8154
8155 /* Create the vector that holds the step of the induction. */
8156 if (nested_in_vect_loop)
8157 /* iv_loop is nested in the loop to be vectorized. Generate:
8158 vec_step = [S, S, S, S] */
8159 new_name = step_expr;
8160 else
8161 {
8162 /* iv_loop is the loop to be vectorized. Generate:
8163 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8164 gimple_seq seq = NULL;
8165 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8166 {
8167 expr = build_int_cst (integer_type_node, vf);
8168 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8169 }
8170 else
8171 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8172 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8173 expr, step_expr);
8174 if (seq)
8175 {
8176 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8177 gcc_assert (!new_bb);
8178 }
8179 }
8180
8181 t = unshare_expr (new_name);
8182 gcc_assert (CONSTANT_CLASS_P (new_name)
8183 || TREE_CODE (new_name) == SSA_NAME);
8184 new_vec = build_vector_from_val (step_vectype, t);
8185 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8186 new_vec, step_vectype, NULL);
8187
8188
8189 /* Create the following def-use cycle:
8190 loop prolog:
8191 vec_init = ...
8192 vec_step = ...
8193 loop:
8194 vec_iv = PHI <vec_init, vec_loop>
8195 ...
8196 STMT
8197 ...
8198 vec_loop = vec_iv + vec_step; */
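/* For example (assuming ncopies == 1 and a four-element vector, so
   VF == nunits == 4), a scalar IV with initial value X and step S gives
   vec_init == { X, X+S, X+2*S, X+3*S } and vec_step == { 4*S, 4*S, 4*S, 4*S },
   so each vector iteration advances every lane by four scalar iterations.  */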
8199
8200 /* Create the induction-phi that defines the induction-operand. */
8201 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8202 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8203 induc_def = PHI_RESULT (induction_phi);
8204
8205 /* Create the iv update inside the loop */
8206 stmts = NULL;
8207 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8208 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8209 vec_def = gimple_convert (&stmts, vectype, vec_def);
8210 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8211 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8212
8213 /* Set the arguments of the phi node: */
8214 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8215 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8216 UNKNOWN_LOCATION);
8217
8218 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8219 *vec_stmt = induction_phi;
8220
8221 /* In case that vectorization factor (VF) is bigger than the number
8222 of elements that we can fit in a vectype (nunits), we have to generate
8223 more than one vector stmt - i.e - we need to "unroll" the
8224 vector stmt by a factor VF/nunits. For more details see documentation
8225 in vectorizable_operation. */
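/* For example (hypothetical numbers), with VF == 8 and nunits == 4 we need
   ncopies == 2 vector stmts per scalar stmt; the second copy is obtained from
   the first by adding { 4*S, 4*S, 4*S, 4*S }, i.e. nunits * S per lane.  */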
8226
8227 if (ncopies > 1)
8228 {
8229 gimple_seq seq = NULL;
8230 /* FORNOW. This restriction should be relaxed. */
8231 gcc_assert (!nested_in_vect_loop);
8232
8233 /* Create the vector that holds the step of the induction. */
8234 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8235 {
8236 expr = build_int_cst (integer_type_node, nunits);
8237 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8238 }
8239 else
8240 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8241 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8242 expr, step_expr);
8243 if (seq)
8244 {
8245 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8246 gcc_assert (!new_bb);
8247 }
8248
8249 t = unshare_expr (new_name);
8250 gcc_assert (CONSTANT_CLASS_P (new_name)
8251 || TREE_CODE (new_name) == SSA_NAME);
8252 new_vec = build_vector_from_val (step_vectype, t);
8253 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8254 new_vec, step_vectype, NULL);
8255
8256 vec_def = induc_def;
8257 for (i = 1; i < ncopies; i++)
8258 {
8259 /* vec_i = vec_prev + vec_step */
8260 gimple_seq stmts = NULL;
8261 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8262 vec_def = gimple_build (&stmts,
8263 PLUS_EXPR, step_vectype, vec_def, vec_step);
8264 vec_def = gimple_convert (&stmts, vectype, vec_def);
8265
8266 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8267 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8268 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8269 }
8270 }
8271
8272 if (dump_enabled_p ())
8273 dump_printf_loc (MSG_NOTE, vect_location,
8274 "transform induction: created def-use cycle: %G%G",
8275 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8276
8277 return true;
8278 }
8279
8280 /* Function vectorizable_live_operation.
8281
8282 STMT_INFO computes a value that is used outside the loop. Check if
8283 it can be supported. */
8284
8285 bool
8286 vectorizable_live_operation (vec_info *vinfo,
8287 stmt_vec_info stmt_info,
8288 gimple_stmt_iterator *gsi,
8289 slp_tree slp_node, slp_instance slp_node_instance,
8290 int slp_index, bool vec_stmt_p,
8291 stmt_vector_for_cost *cost_vec)
8292 {
8293 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8294 imm_use_iterator imm_iter;
8295 tree lhs, lhs_type, bitsize, vec_bitsize;
8296 tree vectype = (slp_node
8297 ? SLP_TREE_VECTYPE (slp_node)
8298 : STMT_VINFO_VECTYPE (stmt_info));
8299 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8300 int ncopies;
8301 gimple *use_stmt;
8302 auto_vec<tree> vec_oprnds;
8303 int vec_entry = 0;
8304 poly_uint64 vec_index = 0;
8305
8306 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8307
8308 /* If a stmt of a reduction is live, vectorize it via
8309 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8310 validity so just trigger the transform here. */
8311 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8312 {
8313 if (!vec_stmt_p)
8314 return true;
8315 if (slp_node)
8316 {
8317 /* For reduction chains the meta-info is attached to
8318 the group leader. */
8319 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8320 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8321 /* For SLP reductions we vectorize the epilogue for
8322 all involved stmts together. */
8323 else if (slp_index != 0)
8324 return true;
8325 else
8326 /* For SLP reductions the meta-info is attached to
8327 the representative. */
8328 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8329 }
8330 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8331 gcc_assert (reduc_info->is_reduc_info);
8332 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8333 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8334 return true;
8335 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8336 slp_node_instance);
8337 return true;
8338 }
8339
8340 /* If STMT is not relevant and it is a simple assignment and its inputs are
8341 invariant then it can remain in place, unvectorized. The original last
8342 scalar value that it computes will be used. */
8343 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8344 {
8345 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8346 if (dump_enabled_p ())
8347 dump_printf_loc (MSG_NOTE, vect_location,
8348 "statement is simple and uses invariant. Leaving in "
8349 "place.\n");
8350 return true;
8351 }
8352
8353 if (slp_node)
8354 ncopies = 1;
8355 else
8356 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8357
8358 if (slp_node)
8359 {
8360 gcc_assert (slp_index >= 0);
8361
8362 /* Get the last occurrence of the scalar index from the concatenation of
8363 all the slp vectors. Calculate which slp vector it is and the index
8364 within. */
8365 int num_scalar = SLP_TREE_LANES (slp_node);
8366 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8367 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
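/* For example (hypothetical numbers), with 3 scalar lanes spread over
   2 vectors of 4 lanes each, slp_index 1 gives pos = 2*4 - 3 + 1 = 6,
   i.e. vec_entry 1 and vec_index 2 below.  */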
8368
8369 /* Calculate which vector contains the result, and which lane of
8370 that vector we need. */
8371 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8372 {
8373 if (dump_enabled_p ())
8374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8375 "Cannot determine which vector holds the"
8376 " final result.\n");
8377 return false;
8378 }
8379 }
8380
8381 if (!vec_stmt_p)
8382 {
8383 /* No transformation required. */
8384 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8385 {
8386 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8387 OPTIMIZE_FOR_SPEED))
8388 {
8389 if (dump_enabled_p ())
8390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8391 "can't operate on partial vectors "
8392 "because the target doesn't support extract "
8393 "last reduction.\n");
8394 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8395 }
8396 else if (slp_node)
8397 {
8398 if (dump_enabled_p ())
8399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8400 "can't operate on partial vectors "
8401 "because an SLP statement is live after "
8402 "the loop.\n");
8403 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8404 }
8405 else if (ncopies > 1)
8406 {
8407 if (dump_enabled_p ())
8408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8409 "can't operate on partial vectors "
8410 "because ncopies is greater than 1.\n");
8411 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8412 }
8413 else
8414 {
8415 gcc_assert (ncopies == 1 && !slp_node);
8416 vect_record_loop_mask (loop_vinfo,
8417 &LOOP_VINFO_MASKS (loop_vinfo),
8418 1, vectype, NULL);
8419 }
8420 }
8421 /* ??? Enable for loop costing as well. */
8422 if (!loop_vinfo)
8423 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8424 0, vect_epilogue);
8425 return true;
8426 }
8427
8428 /* Use the lhs of the original scalar statement. */
8429 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8430 if (dump_enabled_p ())
8431 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8432 "stmt %G", stmt);
8433
8434 lhs = gimple_get_lhs (stmt);
8435 lhs_type = TREE_TYPE (lhs);
8436
8437 bitsize = vector_element_bits_tree (vectype);
8438 vec_bitsize = TYPE_SIZE (vectype);
8439
8440 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8441 tree vec_lhs, bitstart;
8442 gimple *vec_stmt;
8443 if (slp_node)
8444 {
8445 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8446
8447 /* Get the correct slp vectorized stmt. */
8448 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8449 vec_lhs = gimple_get_lhs (vec_stmt);
8450
8451 /* Get entry to use. */
8452 bitstart = bitsize_int (vec_index);
8453 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8454 }
8455 else
8456 {
8457 /* For multiple copies, get the last copy. */
8458 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8459 vec_lhs = gimple_get_lhs (vec_stmt);
8460
8461 /* Get the last lane in the vector. */
8462 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
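/* E.g. for a hypothetical 4 x 32-bit vector this selects bit offset
   128 - 32 == 96, i.e. the last lane.  */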
8463 }
8464
8465 if (loop_vinfo)
8466 {
8467 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8468 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8469 loop;
8470 BB:
8471 # lhs' = PHI <lhs>
8472 ==>
8473 loop;
8474 BB:
8475 # vec_lhs' = PHI <vec_lhs>
8476 new_tree = lane_extract <vec_lhs', ...>;
8477 lhs' = new_tree; */
8478
8479 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8480 basic_block exit_bb = single_exit (loop)->dest;
8481 gcc_assert (single_pred_p (exit_bb));
8482
8483 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8484 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8485 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8486
8487 gimple_seq stmts = NULL;
8488 tree new_tree;
8489 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8490 {
8491 /* Emit:
8492
8493 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8494
8495 where VEC_LHS is the vectorized live-out result and MASK is
8496 the loop mask for the final iteration. */
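/* For instance, if the final loop mask had three active lanes,
   { -1, -1, -1, 0 }, EXTRACT_LAST would return the element in lane 2 of
   VEC_LHS, the last lane whose mask bit is set.  */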
8497 gcc_assert (ncopies == 1 && !slp_node);
8498 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8499 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8500 1, vectype, 0);
8501 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8502 mask, vec_lhs_phi);
8503
8504 /* Convert the extracted vector element to the scalar type. */
8505 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8506 }
8507 else
8508 {
8509 tree bftype = TREE_TYPE (vectype);
8510 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8511 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8512 new_tree = build3 (BIT_FIELD_REF, bftype,
8513 vec_lhs_phi, bitsize, bitstart);
8514 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8515 &stmts, true, NULL_TREE);
8516 }
8517
8518 if (stmts)
8519 {
8520 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8521 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8522
8523 /* Remove existing phi from lhs and create one copy from new_tree. */
8524 tree lhs_phi = NULL_TREE;
8525 gimple_stmt_iterator gsi;
8526 for (gsi = gsi_start_phis (exit_bb);
8527 !gsi_end_p (gsi); gsi_next (&gsi))
8528 {
8529 gimple *phi = gsi_stmt (gsi);
8530 if (gimple_phi_arg_def (phi, 0) == lhs)
8531 {
8532 remove_phi_node (&gsi, false);
8533 lhs_phi = gimple_phi_result (phi);
8534 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8535 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8536 break;
8537 }
8538 }
8539 }
8540
8541 /* Replace use of lhs with newly computed result. If the use stmt is a
8542 single arg PHI, just replace all uses of PHI result. It's necessary
8543 because lcssa PHI defining lhs may be before newly inserted stmt. */
8544 use_operand_p use_p;
8545 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8546 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8547 && !is_gimple_debug (use_stmt))
8548 {
8549 if (gimple_code (use_stmt) == GIMPLE_PHI
8550 && gimple_phi_num_args (use_stmt) == 1)
8551 {
8552 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8553 }
8554 else
8555 {
8556 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8557 SET_USE (use_p, new_tree);
8558 }
8559 update_stmt (use_stmt);
8560 }
8561 }
8562 else
8563 {
8564 /* For basic-block vectorization simply insert the lane-extraction. */
8565 tree bftype = TREE_TYPE (vectype);
8566 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8567 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8568 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8569 vec_lhs, bitsize, bitstart);
8570 gimple_seq stmts = NULL;
8571 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8572 &stmts, true, NULL_TREE);
8573 if (TREE_CODE (new_tree) == SSA_NAME
8574 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8575 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8576 if (is_a <gphi *> (vec_stmt))
8577 {
8578 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8579 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8580 }
8581 else
8582 {
8583 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8584 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8585 }
8586
8587 /* Replace use of lhs with newly computed result. If the use stmt is a
8588 single arg PHI, just replace all uses of PHI result. It's necessary
8589 because lcssa PHI defining lhs may be before newly inserted stmt. */
8590 use_operand_p use_p;
8591 stmt_vec_info use_stmt_info;
8592 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8593 if (!is_gimple_debug (use_stmt)
8594 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8595 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8596 {
8597 /* ??? This can happen when the live lane ends up being
8598 used in a vector construction code-generated by an
8599 external SLP node (and code-generation for that already
8600 happened). See gcc.dg/vect/bb-slp-47.c.
8601 Doing this is what would happen if that vector CTOR
8602 were not code-generated yet so it is not too bad.
8603 ??? In fact we'd likely want to avoid this situation
8604 in the first place. */
8605 if (TREE_CODE (new_tree) == SSA_NAME
8606 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8607 && gimple_code (use_stmt) != GIMPLE_PHI
8608 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8609 use_stmt))
8610 {
8611 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8612 gcc_assert (code == CONSTRUCTOR
8613 || code == VIEW_CONVERT_EXPR
8614 || CONVERT_EXPR_CODE_P (code));
8615 if (dump_enabled_p ())
8616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8617 "Using original scalar computation for "
8618 "live lane because use preceeds vector "
8619 "def\n");
8620 continue;
8621 }
8622 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8623 SET_USE (use_p, new_tree);
8624 update_stmt (use_stmt);
8625 }
8626 }
8627
8628 return true;
8629 }
8630
8631 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8632
8633 static void
8634 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8635 {
8636 ssa_op_iter op_iter;
8637 imm_use_iterator imm_iter;
8638 def_operand_p def_p;
8639 gimple *ustmt;
8640
8641 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8642 {
8643 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8644 {
8645 basic_block bb;
8646
8647 if (!is_gimple_debug (ustmt))
8648 continue;
8649
8650 bb = gimple_bb (ustmt);
8651
8652 if (!flow_bb_inside_loop_p (loop, bb))
8653 {
8654 if (gimple_debug_bind_p (ustmt))
8655 {
8656 if (dump_enabled_p ())
8657 dump_printf_loc (MSG_NOTE, vect_location,
8658 "killing debug use\n");
8659
8660 gimple_debug_bind_reset_value (ustmt);
8661 update_stmt (ustmt);
8662 }
8663 else
8664 gcc_unreachable ();
8665 }
8666 }
8667 }
8668 }
8669
8670 /* Given loop represented by LOOP_VINFO, return true if computation of
8671 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8672 otherwise. */
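/* For example, if NITERSM1 is the maximum value of its unsigned type, then
   NITERSM1 + 1 wraps to zero, i.e. the computation overflows and we must
   return false.  */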
8673
8674 static bool
8675 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8676 {
8677 /* Constant case. */
8678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8679 {
8680 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8681 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8682
8683 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8684 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8685 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8686 return true;
8687 }
8688
8689 widest_int max;
8690 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8691 /* Check the upper bound of loop niters. */
8692 if (get_max_loop_iterations (loop, &max))
8693 {
8694 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8695 signop sgn = TYPE_SIGN (type);
8696 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8697 if (max < type_max)
8698 return true;
8699 }
8700 return false;
8701 }
8702
8703 /* Return a mask type with half the number of elements as OLD_TYPE,
8704 given that it should have mode NEW_MODE. */
8705
8706 tree
8707 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8708 {
8709 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8710 return build_truth_vector_type_for_mode (nunits, new_mode);
8711 }
8712
8713 /* Return a mask type with twice as many elements as OLD_TYPE,
8714 given that it should have mode NEW_MODE. */
8715
8716 tree
8717 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8718 {
8719 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8720 return build_truth_vector_type_for_mode (nunits, new_mode);
8721 }
8722
8723 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8724 contain a sequence of NVECTORS masks that each control a vector of type
8725 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8726 these vector masks with the vector version of SCALAR_MASK. */
8727
8728 void
8729 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8730 unsigned int nvectors, tree vectype, tree scalar_mask)
8731 {
8732 gcc_assert (nvectors != 0);
8733 if (masks->length () < nvectors)
8734 masks->safe_grow_cleared (nvectors, true);
8735 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8736 /* The number of scalars per iteration and the number of vectors are
8737 both compile-time constants. */
8738 unsigned int nscalars_per_iter
8739 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8740 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8741
8742 if (scalar_mask)
8743 {
8744 scalar_cond_masked_key cond (scalar_mask, nvectors);
8745 loop_vinfo->scalar_cond_masked_set.add (cond);
8746 }
8747
8748 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8749 {
8750 rgm->max_nscalars_per_iter = nscalars_per_iter;
8751 rgm->type = truth_type_for (vectype);
8752 rgm->factor = 1;
8753 }
8754 }
8755
8756 /* Given a complete set of masks MASKS, extract mask number INDEX
8757 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8758 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8759
8760 See the comment above vec_loop_masks for more details about the mask
8761 arrangement. */
8762
8763 tree
8764 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8765 unsigned int nvectors, tree vectype, unsigned int index)
8766 {
8767 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8768 tree mask_type = rgm->type;
8769
8770 /* Populate the rgroup's mask array, if this is the first time we've
8771 used it. */
8772 if (rgm->controls.is_empty ())
8773 {
8774 rgm->controls.safe_grow_cleared (nvectors, true);
8775 for (unsigned int i = 0; i < nvectors; ++i)
8776 {
8777 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8778 /* Provide a dummy definition until the real one is available. */
8779 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8780 rgm->controls[i] = mask;
8781 }
8782 }
8783
8784 tree mask = rgm->controls[index];
8785 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8786 TYPE_VECTOR_SUBPARTS (vectype)))
8787 {
8788 /* A loop mask for data type X can be reused for data type Y
8789 if X has N times more elements than Y and if Y's elements
8790 are N times bigger than X's. In this case each sequence
8791 of N elements in the loop mask will be all-zero or all-one.
8792 We can then view-convert the mask so that each sequence of
8793 N elements is replaced by a single element. */
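/* For example (hypothetical modes), an 8-element mask computed for
   8 x HImode data can control 4 x SImode data: every pair of mask elements
   is known to be all-zero or all-one, so the 8-element mask view-converts
   to the required 4-element mask.  */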
8794 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8795 TYPE_VECTOR_SUBPARTS (vectype)));
8796 gimple_seq seq = NULL;
8797 mask_type = truth_type_for (vectype);
8798 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8799 if (seq)
8800 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8801 }
8802 return mask;
8803 }
8804
8805 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8806 lengths for controlling an operation on VECTYPE. The operation splits
8807 each element of VECTYPE into FACTOR separate subelements, measuring the
8808 length as a number of these subelements. */
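/* For instance (a hypothetical case), a V4SI operation that is emulated with
   byte-sized subelements would use FACTOR == 4, so a length of 8 subelements
   covers the first two SImode elements.  */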
8809
8810 void
8811 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8812 unsigned int nvectors, tree vectype, unsigned int factor)
8813 {
8814 gcc_assert (nvectors != 0);
8815 if (lens->length () < nvectors)
8816 lens->safe_grow_cleared (nvectors, true);
8817 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8818
8819 /* The number of scalars per iteration, the bytes occupied per scalar and
8820 the number of vectors are all compile-time constants. */
8821 unsigned int nscalars_per_iter
8822 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8823 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8824
8825 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8826 {
8827 /* For now, we only support cases in which all loads and stores fall back
8828 to VnQI or none do. */
8829 gcc_assert (!rgl->max_nscalars_per_iter
8830 || (rgl->factor == 1 && factor == 1)
8831 || (rgl->max_nscalars_per_iter * rgl->factor
8832 == nscalars_per_iter * factor));
8833 rgl->max_nscalars_per_iter = nscalars_per_iter;
8834 rgl->type = vectype;
8835 rgl->factor = factor;
8836 }
8837 }
8838
8839 /* Given a complete set of lengths LENS, extract length number INDEX for an
8840 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8841
8842 tree
8843 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8844 unsigned int nvectors, unsigned int index)
8845 {
8846 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8847
8848 /* Populate the rgroup's len array, if this is the first time we've
8849 used it. */
8850 if (rgl->controls.is_empty ())
8851 {
8852 rgl->controls.safe_grow_cleared (nvectors, true);
8853 for (unsigned int i = 0; i < nvectors; ++i)
8854 {
8855 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8856 gcc_assert (len_type != NULL_TREE);
8857 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8858
8859 /* Provide a dummy definition until the real one is available. */
8860 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8861 rgl->controls[i] = len;
8862 }
8863 }
8864
8865 return rgl->controls[index];
8866 }
8867
8868 /* Scale profiling counters by estimation for LOOP which is vectorized
8869 by factor VF. */
8870
8871 static void
8872 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8873 {
8874 edge preheader = loop_preheader_edge (loop);
8875 /* Reduce loop iterations by the vectorization factor. */
8876 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8877 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8878
8879 if (freq_h.nonzero_p ())
8880 {
8881 profile_probability p;
8882
8883 /* Avoid dropping loop body profile counter to 0 because of zero count
8884 in loop's preheader. */
8885 if (!(freq_e == profile_count::zero ()))
8886 freq_e = freq_e.force_nonzero ();
8887 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8888 scale_loop_frequencies (loop, p);
8889 }
8890
8891 edge exit_e = single_exit (loop);
8892 exit_e->probability = profile_probability::always ()
8893 .apply_scale (1, new_est_niter + 1);
8894
8895 edge exit_l = single_pred_edge (loop->latch);
8896 profile_probability prob = exit_l->probability;
8897 exit_l->probability = exit_e->probability.invert ();
8898 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8899 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8900 }
8901
8902 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
8903 latch edge values originally defined by it. */
8904
8905 static void
8906 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8907 stmt_vec_info def_stmt_info)
8908 {
8909 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8910 if (!def || TREE_CODE (def) != SSA_NAME)
8911 return;
8912 stmt_vec_info phi_info;
8913 imm_use_iterator iter;
8914 use_operand_p use_p;
8915 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8916 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8917 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8918 && (phi_info = loop_vinfo->lookup_stmt (phi))
8919 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8920 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8921 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8922 {
8923 loop_p loop = gimple_bb (phi)->loop_father;
8924 edge e = loop_latch_edge (loop);
8925 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8926 {
8927 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8928 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8929 gcc_assert (phi_defs.length () == latch_defs.length ());
8930 for (unsigned i = 0; i < phi_defs.length (); ++i)
8931 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8932 gimple_get_lhs (latch_defs[i]), e,
8933 gimple_phi_arg_location (phi, e->dest_idx));
8934 }
8935 }
8936 }
8937
8938 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8939 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8940 stmt_vec_info. */
8941
8942 static void
8943 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8944 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8945 {
8946 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8947 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8948
8949 if (dump_enabled_p ())
8950 dump_printf_loc (MSG_NOTE, vect_location,
8951 "------>vectorizing statement: %G", stmt_info->stmt);
8952
8953 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8954 vect_loop_kill_debug_uses (loop, stmt_info);
8955
8956 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8957 && !STMT_VINFO_LIVE_P (stmt_info))
8958 return;
8959
8960 if (STMT_VINFO_VECTYPE (stmt_info))
8961 {
8962 poly_uint64 nunits
8963 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8964 if (!STMT_SLP_TYPE (stmt_info)
8965 && maybe_ne (nunits, vf)
8966 && dump_enabled_p ())
8967 /* For SLP VF is set according to unrolling factor, and not
8968 to vector size, hence for SLP this print is not valid. */
8969 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8970 }
8971
8972 /* Pure SLP statements have already been vectorized. We still need
8973 to apply loop vectorization to hybrid SLP statements. */
8974 if (PURE_SLP_STMT (stmt_info))
8975 return;
8976
8977 if (dump_enabled_p ())
8978 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8979
8980 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8981 *seen_store = stmt_info;
8982 }
8983
8984 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8985 in the hash_map with its corresponding values. */
8986
8987 static tree
8988 find_in_mapping (tree t, void *context)
8989 {
8990 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8991
8992 tree *value = mapping->get (t);
8993 return value ? *value : t;
8994 }
8995
8996 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8997 original loop that has now been vectorized.
8998
8999 The inits of the data_references need to be advanced with the number of
9000 iterations of the main loop. This has been computed in vect_do_peeling and
9001 is stored in parameter ADVANCE. We first restore the data_references
9002 initial offset with the values recored in ORIG_DRS_INIT.
9003
9004 Since the loop_vec_info of this EPILOGUE was constructed for the original
9005 loop, its stmt_vec_infos all point to the original statements. These need
9006 to be updated to point to their corresponding copies as well as the SSA_NAMES
9007 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9008
9009 The data_references' connections also need to be updated. Their
9010 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9011 stmt_vec_infos, their statements need to point to their corresponding copy,
9012 if they are gather loads or scatter stores then their reference needs to be
9013 updated to point to its corresponding copy and finally we set
9014 'base_misaligned' to false as we have already peeled for alignment in the
9015 prologue of the main loop. */
9016
9017 static void
9018 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9019 {
9020 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9021 auto_vec<gimple *> stmt_worklist;
9022 hash_map<tree,tree> mapping;
9023 gimple *orig_stmt, *new_stmt;
9024 gimple_stmt_iterator epilogue_gsi;
9025 gphi_iterator epilogue_phi_gsi;
9026 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9027 basic_block *epilogue_bbs = get_loop_body (epilogue);
9028 unsigned i;
9029
9030 free (LOOP_VINFO_BBS (epilogue_vinfo));
9031 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9032
9033 /* Advance data_reference's with the number of iterations of the previous
9034 loop and its prologue. */
9035 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9036
9037
9038 /* The EPILOGUE loop is a copy of the original loop so they share the same
9039 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9040 point to the copied statements. We also create a mapping of all LHS' in
9041 the original loop and all the LHS' in the EPILOGUE and create worklists to
9042 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9043 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9044 {
9045 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9046 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9047 {
9048 new_stmt = epilogue_phi_gsi.phi ();
9049
9050 gcc_assert (gimple_uid (new_stmt) > 0);
9051 stmt_vinfo
9052 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9053
9054 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9055 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9056
9057 mapping.put (gimple_phi_result (orig_stmt),
9058 gimple_phi_result (new_stmt));
9059 /* PHI nodes cannot have patterns or related statements. */
9060 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9061 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9062 }
9063
9064 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9065 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9066 {
9067 new_stmt = gsi_stmt (epilogue_gsi);
9068 if (is_gimple_debug (new_stmt))
9069 continue;
9070
9071 gcc_assert (gimple_uid (new_stmt) > 0);
9072 stmt_vinfo
9073 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9074
9075 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9076 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9077
9078 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9079 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9080
9081 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9082 {
9083 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9084 for (gimple_stmt_iterator gsi = gsi_start (seq);
9085 !gsi_end_p (gsi); gsi_next (&gsi))
9086 stmt_worklist.safe_push (gsi_stmt (gsi));
9087 }
9088
9089 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9090 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9091 {
9092 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9093 stmt_worklist.safe_push (stmt);
9094 /* Set BB such that the assert in
9095 'get_initial_def_for_reduction' is able to determine that
9096 the BB of the related stmt is inside this loop. */
9097 gimple_set_bb (stmt,
9098 gimple_bb (new_stmt));
9099 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9100 gcc_assert (related_vinfo == NULL
9101 || related_vinfo == stmt_vinfo);
9102 }
9103 }
9104 }
9105
9106 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9107 using the original main loop and thus need to be updated to refer to the
9108 cloned variables used in the epilogue. */
9109 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9110 {
9111 gimple *stmt = stmt_worklist[i];
9112 tree *new_op;
9113
9114 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9115 {
9116 tree op = gimple_op (stmt, j);
9117 if ((new_op = mapping.get(op)))
9118 gimple_set_op (stmt, j, *new_op);
9119 else
9120 {
9121 /* PR92429: The last argument of simplify_replace_tree disables
9122 folding when replacing arguments. This is required as
9123 otherwise you might end up with different statements than the
9124 ones analyzed in vect_loop_analyze, leading to different
9125 vectorization. */
9126 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9127 &find_in_mapping, &mapping, false);
9128 gimple_set_op (stmt, j, op);
9129 }
9130 }
9131 }
9132
9133 struct data_reference *dr;
9134 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9135 FOR_EACH_VEC_ELT (datarefs, i, dr)
9136 {
9137 orig_stmt = DR_STMT (dr);
9138 gcc_assert (gimple_uid (orig_stmt) > 0);
9139 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9140 /* Data references for gather loads and scatter stores do not use the
9141 updated offset we set using ADVANCE. Instead we have to make sure the
9142 reference in the data references points to the corresponding copy of
9143 the original in the epilogue. */
9144 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9145 == VMAT_GATHER_SCATTER)
9146 {
9147 DR_REF (dr)
9148 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9149 &find_in_mapping, &mapping);
9150 DR_BASE_ADDRESS (dr)
9151 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9152 &find_in_mapping, &mapping);
9153 }
9154 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9155 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9156 /* The vector size of the epilogue is smaller than that of the main loop
9157 so the alignment is either the same or lower. This means the dr will
9158 by definition be aligned. */
9159 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9160 }
9161
9162 epilogue_vinfo->shared->datarefs_copy.release ();
9163 epilogue_vinfo->shared->save_datarefs ();
9164 }
9165
9166 /* Function vect_transform_loop.
9167
9168 The analysis phase has determined that the loop is vectorizable.
9169 Vectorize the loop - create vectorized stmts to replace the scalar
9170 stmts in the loop, and update the loop exit condition.
9171 Returns scalar epilogue loop if any. */
9172
9173 class loop *
9174 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9175 {
9176 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9177 class loop *epilogue = NULL;
9178 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9179 int nbbs = loop->num_nodes;
9180 int i;
9181 tree niters_vector = NULL_TREE;
9182 tree step_vector = NULL_TREE;
9183 tree niters_vector_mult_vf = NULL_TREE;
9184 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9185 unsigned int lowest_vf = constant_lower_bound (vf);
9186 gimple *stmt;
9187 bool check_profitability = false;
9188 unsigned int th;
9189
9190 DUMP_VECT_SCOPE ("vec_transform_loop");
9191
9192 loop_vinfo->shared->check_datarefs ();
9193
9194 /* Use the more conservative vectorization threshold. If the number
9195 of iterations is constant assume the cost check has been performed
9196 by our caller. If the threshold makes all loops profitable that
9197 run at least the (estimated) vectorization factor number of times
9198 checking is pointless, too. */
9199 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9200 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9201 {
9202 if (dump_enabled_p ())
9203 dump_printf_loc (MSG_NOTE, vect_location,
9204 "Profitability threshold is %d loop iterations.\n",
9205 th);
9206 check_profitability = true;
9207 }
9208
9209 /* Make sure there exists a single-predecessor exit bb. Do this before
9210 versioning. */
9211 edge e = single_exit (loop);
9212 if (! single_pred_p (e->dest))
9213 {
9214 split_loop_exit_edge (e, true);
9215 if (dump_enabled_p ())
9216 dump_printf (MSG_NOTE, "split exit edge\n");
9217 }
9218
9219 /* Version the loop first, if required, so the profitability check
9220 comes first. */
9221
9222 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9223 {
9224 class loop *sloop
9225 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9226 sloop->force_vectorize = false;
9227 check_profitability = false;
9228 }
9229
9230 /* Make sure there exists a single-predecessor exit bb also on the
9231 scalar loop copy. Do this after versioning but before peeling
9232 so CFG structure is fine for both scalar and if-converted loop
9233 to make slpeel_duplicate_current_defs_from_edges face matched
9234 loop closed PHI nodes on the exit. */
9235 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9236 {
9237 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9238 if (! single_pred_p (e->dest))
9239 {
9240 split_loop_exit_edge (e, true);
9241 if (dump_enabled_p ())
9242 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9243 }
9244 }
9245
9246 tree niters = vect_build_loop_niters (loop_vinfo);
9247 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9248 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9249 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9250 tree advance;
9251 drs_init_vec orig_drs_init;
9252
9253 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9254 &step_vector, &niters_vector_mult_vf, th,
9255 check_profitability, niters_no_overflow,
9256 &advance);
9257
9258 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9259 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9260 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9261 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9262
9263 if (niters_vector == NULL_TREE)
9264 {
9265 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9266 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9267 && known_eq (lowest_vf, vf))
9268 {
9269 niters_vector
9270 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9271 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9272 step_vector = build_one_cst (TREE_TYPE (niters));
9273 }
9274 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9275 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9276 &step_vector, niters_no_overflow);
9277 else
9278 /* vect_do_peeling subtracted the number of peeled prologue
9279 iterations from LOOP_VINFO_NITERS. */
9280 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9281 &niters_vector, &step_vector,
9282 niters_no_overflow);
9283 }
9284
9285 /* 1) Make sure the loop header has exactly two entries
9286 2) Make sure we have a preheader basic block. */
9287
9288 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9289
9290 split_edge (loop_preheader_edge (loop));
9291
9292 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9293 /* This will deal with any possible peeling. */
9294 vect_prepare_for_masked_peels (loop_vinfo);
9295
9296 /* Schedule the SLP instances first, then handle loop vectorization
9297 below. */
9298 if (!loop_vinfo->slp_instances.is_empty ())
9299 {
9300 DUMP_VECT_SCOPE ("scheduling SLP instances");
9301 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9302 }
9303
9304 /* FORNOW: the vectorizer supports only loops whose body consists
9305 of one basic block (header + empty latch). When the vectorizer
9306 supports more involved loop forms, the order in which the BBs are
9307 traversed will need to be reconsidered. */
9308
9309 for (i = 0; i < nbbs; i++)
9310 {
9311 basic_block bb = bbs[i];
9312 stmt_vec_info stmt_info;
9313
9314 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9315 gsi_next (&si))
9316 {
9317 gphi *phi = si.phi ();
9318 if (dump_enabled_p ())
9319 dump_printf_loc (MSG_NOTE, vect_location,
9320 "------>vectorizing phi: %G", phi);
9321 stmt_info = loop_vinfo->lookup_stmt (phi);
9322 if (!stmt_info)
9323 continue;
9324
9325 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9326 vect_loop_kill_debug_uses (loop, stmt_info);
9327
9328 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9329 && !STMT_VINFO_LIVE_P (stmt_info))
9330 continue;
9331
9332 if (STMT_VINFO_VECTYPE (stmt_info)
9333 && (maybe_ne
9334 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9335 && dump_enabled_p ())
9336 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9337
9338 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9339 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9340 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9342 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9343 && ! PURE_SLP_STMT (stmt_info))
9344 {
9345 if (dump_enabled_p ())
9346 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9347 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9348 }
9349 }
9350
9351 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9352 gsi_next (&si))
9353 {
9354 gphi *phi = si.phi ();
9355 stmt_info = loop_vinfo->lookup_stmt (phi);
9356 if (!stmt_info)
9357 continue;
9358
9359 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9360 && !STMT_VINFO_LIVE_P (stmt_info))
9361 continue;
9362
9363 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9364 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9365 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9366 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9367 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9368 && ! PURE_SLP_STMT (stmt_info))
9369 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9370 }
9371
9372 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9373 !gsi_end_p (si);)
9374 {
9375 stmt = gsi_stmt (si);
9376 /* During vectorization remove existing clobber stmts. */
9377 if (gimple_clobber_p (stmt))
9378 {
9379 unlink_stmt_vdef (stmt);
9380 gsi_remove (&si, true);
9381 release_defs (stmt);
9382 }
9383 else
9384 {
9385 /* Ignore vector stmts created in the outer loop. */
9386 stmt_info = loop_vinfo->lookup_stmt (stmt);
9387
9388 /* vector stmts created in the outer-loop during vectorization of
9389 stmts in an inner-loop may not have a stmt_info, and do not
9390 need to be vectorized. */
9391 stmt_vec_info seen_store = NULL;
9392 if (stmt_info)
9393 {
9394 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9395 {
9396 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9397 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9398 !gsi_end_p (subsi); gsi_next (&subsi))
9399 {
9400 stmt_vec_info pat_stmt_info
9401 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9402 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9403 &si, &seen_store);
9404 }
9405 stmt_vec_info pat_stmt_info
9406 = STMT_VINFO_RELATED_STMT (stmt_info);
9407 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9408 &seen_store);
9409 maybe_set_vectorized_backedge_value (loop_vinfo,
9410 pat_stmt_info);
9411 }
9412 else
9413 {
9414 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9415 &seen_store);
9416 maybe_set_vectorized_backedge_value (loop_vinfo,
9417 stmt_info);
9418 }
9419 }
9420 gsi_next (&si);
9421 if (seen_store)
9422 {
9423 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9424 /* Interleaving. The vectorization of the
9425 interleaving chain was completed - free
9426 all the stores in the chain. */
9427 vect_remove_stores (loop_vinfo,
9428 DR_GROUP_FIRST_ELEMENT (seen_store));
9429 else
9430 /* Free the attached stmt_vec_info and remove the stmt. */
9431 loop_vinfo->remove_stmt (stmt_info);
9432 }
9433 }
9434 }
9435
9436 /* Stub out scalar statements that must not survive vectorization.
9437 Doing this here helps with grouped statements, or statements that
9438 are involved in patterns. */
9439 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9440 !gsi_end_p (gsi); gsi_next (&gsi))
9441 {
9442 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9443 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9444 {
9445 tree lhs = gimple_get_lhs (call);
9446 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9447 {
9448 tree zero = build_zero_cst (TREE_TYPE (lhs));
9449 gimple *new_stmt = gimple_build_assign (lhs, zero);
9450 gsi_replace (&gsi, new_stmt, true);
9451 }
9452 }
9453 }
9454 } /* BBs in loop */
9455
9456 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9457 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9458 if (integer_onep (step_vector))
9459 niters_no_overflow = true;
9460 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9461 niters_vector_mult_vf, !niters_no_overflow);
9462
9463 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9464 scale_profile_for_vect_loop (loop, assumed_vf);
9465
9466 /* True if the final iteration might not handle a full vector's
9467 worth of scalar iterations. */
9468 bool final_iter_may_be_partial
9469 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9470 /* The minimum number of iterations performed by the epilogue. This
9471 is 1 when peeling for gaps because we always need a final scalar
9472 iteration. */
9473 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9474 /* +1 to convert latch counts to loop iteration counts,
9475 -min_epilogue_iters to remove iterations that cannot be performed
9476 by the vector code. */
9477 int bias_for_lowest = 1 - min_epilogue_iters;
9478 int bias_for_assumed = bias_for_lowest;
9479 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9480 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9481 {
9482 /* When the amount of peeling is known at compile time, the first
9483 iteration will have exactly alignment_npeels active elements.
9484 In the worst case it will have at least one. */
9485 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9486 bias_for_lowest += lowest_vf - min_first_active;
9487 bias_for_assumed += assumed_vf - min_first_active;
9488 }
9489 /* In these calculations the "- 1" converts loop iteration counts
9490 back to latch counts. */
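/* For example (hypothetical numbers), with lowest_vf == 4, no peeling for
   gaps or alignment and a known upper bound of 11 latch iterations
   (12 loop iterations), the vector loop gets
   udiv_floor (11 + 1, 4) - 1 == 2 latch iterations.  */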
9491 if (loop->any_upper_bound)
9492 loop->nb_iterations_upper_bound
9493 = (final_iter_may_be_partial
9494 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9495 lowest_vf) - 1
9496 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9497 lowest_vf) - 1);
9498 if (loop->any_likely_upper_bound)
9499 loop->nb_iterations_likely_upper_bound
9500 = (final_iter_may_be_partial
9501 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9502 + bias_for_lowest, lowest_vf) - 1
9503 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9504 + bias_for_lowest, lowest_vf) - 1);
9505 if (loop->any_estimate)
9506 loop->nb_iterations_estimate
9507 = (final_iter_may_be_partial
9508 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9509 assumed_vf) - 1
9510 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9511 assumed_vf) - 1);
9512
9513 if (dump_enabled_p ())
9514 {
9515 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9516 {
9517 dump_printf_loc (MSG_NOTE, vect_location,
9518 "LOOP VECTORIZED\n");
9519 if (loop->inner)
9520 dump_printf_loc (MSG_NOTE, vect_location,
9521 "OUTER LOOP VECTORIZED\n");
9522 dump_printf (MSG_NOTE, "\n");
9523 }
9524 else
9525 dump_printf_loc (MSG_NOTE, vect_location,
9526 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9527 GET_MODE_NAME (loop_vinfo->vector_mode));
9528 }
9529
9530 /* Loops vectorized with a variable factor won't benefit from
9531 unrolling/peeling. */
9532 if (!vf.is_constant ())
9533 {
9534 loop->unroll = 1;
9535 if (dump_enabled_p ())
9536 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9537 " variable-length vectorization factor\n");
9538 }
9539 /* Free SLP instances here because otherwise stmt reference counting
9540 won't work. */
9541 slp_instance instance;
9542 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9543 vect_free_slp_instance (instance);
9544 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9545 /* Clear the safelen field since its value is invalid after vectorization,
9546 as the vectorized loop can have loop-carried dependencies. */
9547 loop->safelen = 0;
9548
9549 if (epilogue)
9550 {
9551 update_epilogue_loop_vinfo (epilogue, advance);
9552
9553 epilogue->simduid = loop->simduid;
9554 epilogue->force_vectorize = loop->force_vectorize;
9555 epilogue->dont_vectorize = false;
9556 }
9557
9558 return epilogue;
9559 }
9560
9561 /* The code below tries to perform a simple optimization - reverting
9562 if-conversion for masked stores: if the mask of a store is zero, do not
9563 perform the store and, if possible, do not compute the stored values either.
9564 For example,
9565 for (i=0; i<n; i++)
9566 if (c[i])
9567 {
9568 p1[i] += 1;
9569 p2[i] = p3[i] +2;
9570 }
9571 this transformation will produce the following semi-hammock:
9572
9573 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9574 {
9575 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9576 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9577 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9578 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9579 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9580 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9581 }
9582 */
9583
9584 void
9585 optimize_mask_stores (class loop *loop)
9586 {
9587 basic_block *bbs = get_loop_body (loop);
9588 unsigned nbbs = loop->num_nodes;
9589 unsigned i;
9590 basic_block bb;
9591 class loop *bb_loop;
9592 gimple_stmt_iterator gsi;
9593 gimple *stmt;
9594 auto_vec<gimple *> worklist;
9595 auto_purge_vect_location sentinel;
9596
9597 vect_location = find_loop_location (loop);
9598 /* Pick up all masked stores in loop if any. */
9599 for (i = 0; i < nbbs; i++)
9600 {
9601 bb = bbs[i];
9602 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9603 gsi_next (&gsi))
9604 {
9605 stmt = gsi_stmt (gsi);
9606 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9607 worklist.safe_push (stmt);
9608 }
9609 }
9610
9611 free (bbs);
9612 if (worklist.is_empty ())
9613 return;
9614
9615 /* Loop has masked stores. */
9616 while (!worklist.is_empty ())
9617 {
9618 gimple *last, *last_store;
9619 edge e, efalse;
9620 tree mask;
9621 basic_block store_bb, join_bb;
9622 gimple_stmt_iterator gsi_to;
9623 tree vdef, new_vdef;
9624 gphi *phi;
9625 tree vectype;
9626 tree zero;
9627
9628 last = worklist.pop ();
9629 mask = gimple_call_arg (last, 2);
9630 bb = gimple_bb (last);
9631 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9632 to the same loop as if_bb. This loop may differ from LOOP when a
9633 two-level loop nest is vectorized and the mask_store belongs to the
9634 inner one. */
9635 e = split_block (bb, last);
9636 bb_loop = bb->loop_father;
9637 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9638 join_bb = e->dest;
9639 store_bb = create_empty_bb (bb);
9640 add_bb_to_loop (store_bb, bb_loop);
9641 e->flags = EDGE_TRUE_VALUE;
9642 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9643 /* Put STORE_BB to likely part. */
9644 efalse->probability = profile_probability::unlikely ();
9645 store_bb->count = efalse->count ();
9646 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9647 if (dom_info_available_p (CDI_DOMINATORS))
9648 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9649 if (dump_enabled_p ())
9650 dump_printf_loc (MSG_NOTE, vect_location,
9651 "Create new block %d to sink mask stores.",
9652 store_bb->index);
9653 /* Create vector comparison with boolean result. */
9654 vectype = TREE_TYPE (mask);
9655 zero = build_zero_cst (vectype);
9656 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9657 gsi = gsi_last_bb (bb);
9658 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9659 /* Create a new PHI node for the vdef of the last masked store:
9660 .MEM_2 = VDEF <.MEM_1>
9661 will be converted to
9662 .MEM_3 = VDEF <.MEM_1>
9663 and a new PHI node will be created in the join bb:
9664 .MEM_2 = PHI <.MEM_1, .MEM_3>
9665 */
9666 vdef = gimple_vdef (last);
9667 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9668 gimple_set_vdef (last, new_vdef);
9669 phi = create_phi_node (vdef, join_bb);
9670 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9671
9672 /* Put all masked stores with the same mask to STORE_BB if possible. */
9673 while (true)
9674 {
9675 gimple_stmt_iterator gsi_from;
9676 gimple *stmt1 = NULL;
9677
9678 /* Move masked store to STORE_BB. */
9679 last_store = last;
9680 gsi = gsi_for_stmt (last);
9681 gsi_from = gsi;
9682 /* Shift GSI to the previous stmt for further traversal. */
9683 gsi_prev (&gsi);
9684 gsi_to = gsi_start_bb (store_bb);
9685 gsi_move_before (&gsi_from, &gsi_to);
9686 /* Set GSI_TO to the start of the now non-empty block. */
9687 gsi_to = gsi_start_bb (store_bb);
9688 if (dump_enabled_p ())
9689 dump_printf_loc (MSG_NOTE, vect_location,
9690 "Move stmt to created bb\n%G", last);
9691 /* Move all stored value producers if possible. */
9692 while (!gsi_end_p (gsi))
9693 {
9694 tree lhs;
9695 imm_use_iterator imm_iter;
9696 use_operand_p use_p;
9697 bool res;
9698
9699 /* Skip debug statements. */
9700 if (is_gimple_debug (gsi_stmt (gsi)))
9701 {
9702 gsi_prev (&gsi);
9703 continue;
9704 }
9705 stmt1 = gsi_stmt (gsi);
9706 /* Do not consider statements that write to memory or have a
9707 volatile operand. */
9708 if (gimple_vdef (stmt1)
9709 || gimple_has_volatile_ops (stmt1))
9710 break;
9711 gsi_from = gsi;
9712 gsi_prev (&gsi);
9713 lhs = gimple_get_lhs (stmt1);
9714 if (!lhs)
9715 break;
9716
9717 /* LHS of vectorized stmt must be SSA_NAME. */
9718 if (TREE_CODE (lhs) != SSA_NAME)
9719 break;
9720
9721 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9722 {
9723 /* Remove dead scalar statement. */
9724 if (has_zero_uses (lhs))
9725 {
9726 gsi_remove (&gsi_from, true);
9727 continue;
9728 }
9729 }
9730
9731 /* Check that LHS does not have uses outside of STORE_BB. */
9732 res = true;
9733 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9734 {
9735 gimple *use_stmt;
9736 use_stmt = USE_STMT (use_p);
9737 if (is_gimple_debug (use_stmt))
9738 continue;
9739 if (gimple_bb (use_stmt) != store_bb)
9740 {
9741 res = false;
9742 break;
9743 }
9744 }
9745 if (!res)
9746 break;
9747
9748 if (gimple_vuse (stmt1)
9749 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9750 break;
9751
9752 /* Can move STMT1 to STORE_BB. */
9753 if (dump_enabled_p ())
9754 dump_printf_loc (MSG_NOTE, vect_location,
9755 "Move stmt to created bb\n%G", stmt1);
9756 gsi_move_before (&gsi_from, &gsi_to);
9757 /* Shift GSI_TO for further insertion. */
9758 gsi_prev (&gsi_to);
9759 }
9760 /* Put other masked stores with the same mask to STORE_BB. */
9761 if (worklist.is_empty ()
9762 || gimple_call_arg (worklist.last (), 2) != mask
9763 || worklist.last () != stmt1)
9764 break;
9765 last = worklist.pop ();
9766 }
9767 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9768 }
9769 }
9770
9771 /* Decide whether it is possible to use a zero-based induction variable
9772 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9773 the value that the induction variable must be able to hold in order
9774 to ensure that the rgroups eventually have no active vector elements.
9775 Return -1 otherwise. */
9776
9777 widest_int
9778 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9779 {
9780 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9781 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9782 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9783
9784 /* Calculate the value that the induction variable must be able
9785 to hit in order to ensure that we end the loop with an all-false mask.
9786 This involves adding the maximum number of inactive trailing scalar
9787 iterations. */
9788 widest_int iv_limit = -1;
9789 if (max_loop_iterations (loop, &iv_limit))
9790 {
9791 if (niters_skip)
9792 {
9793 /* Add the maximum number of skipped iterations to the
9794 maximum iteration count. */
9795 if (TREE_CODE (niters_skip) == INTEGER_CST)
9796 iv_limit += wi::to_widest (niters_skip);
9797 else
9798 iv_limit += max_vf - 1;
9799 }
9800 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9801 /* Make a conservatively-correct assumption. */
9802 iv_limit += max_vf - 1;
9803
9804 /* IV_LIMIT is the maximum number of latch iterations, which is also
9805 the maximum in-range IV value. Round this value down to the previous
9806 vector alignment boundary and then add an extra full iteration. */
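/* (A hedged worked example: with a constant VF of 4, so that
   known_alignment (vf) == 4 and max_vf == 4, an IV_LIMIT of 11 latch
   iterations is rounded to (11 & -4) + 4 == 12.)  */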
9807 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9808 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9809 }
9810 return iv_limit;
9811 }
9812
9813 /* For the given rgroup_controls RGC, check whether an induction variable
9814 would ever hit a value that produces a set of all-false masks or zero
9815 lengths before wrapping around. Return true if it is possible to wrap
9816 around before hitting the desired value, otherwise return false. */
9817
9818 bool
9819 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9820 {
9821 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9822
9823 if (iv_limit == -1)
9824 return true;
9825
9826 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9827 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9828 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9829
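/* (A hedged example: with IV_LIMIT == 1000 and NITEMS == 4 the IV must be
   able to reach 4000, which needs 12 bits, so a 16-bit COMPARE_TYPE cannot
   wrap here while an 8-bit one could.)  */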
9830 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9831 return true;
9832
9833 return false;
9834 }