1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
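     As an informal illustration (a sketch of the idea rather than a quote of
     the actual implementation), the support check for the V8HI addition in
     the example loop above boils down to:

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         return false;   // no target support; the stmt cannot be vectorized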
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
221 return false;
222
223 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
224 && STMT_VINFO_RELATED_STMT (stmt_info))
225 {
226 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
227 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228
229 /* If a pattern statement has def stmts, analyze them too. */
230 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
231 !gsi_end_p (si); gsi_next (&si))
232 {
233 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "==> examining pattern def stmt: %G",
237 def_stmt_info->stmt);
238 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
239 vf, mask_producers))
240 return false;
241 }
242
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location,
245 "==> examining pattern statement: %G",
246 stmt_info->stmt);
247 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
248 return false;
249 }
250
251 return true;
252 }
253
254 /* Function vect_determine_vectorization_factor
255
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
261
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
266
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
271 }
272
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
276 }
277 */
278
279 static bool
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
291 auto_vec<stmt_vec_info> mask_producers;
292
293 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294
295 for (i = 0; i < nbbs; i++)
296 {
297 basic_block bb = bbs[i];
298
299 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
300 gsi_next (&si))
301 {
302 phi = si.phi ();
303 stmt_info = loop_vinfo->lookup_stmt (phi);
304 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
306 phi);
307
308 gcc_assert (stmt_info);
309
310 if (STMT_VINFO_RELEVANT_P (stmt_info)
311 || STMT_VINFO_LIVE_P (stmt_info))
312 {
313 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
314 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315
316 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "get vectype for scalar type: %T\n",
319 scalar_type);
320
321 vectype = get_vectype_for_scalar_type (scalar_type);
322 if (!vectype)
323 {
324 if (dump_enabled_p ())
325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 return false;
330 }
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
332
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
336
337 if (dump_enabled_p ())
338 {
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
342 }
343
344 vect_update_max_nunits (&vectorization_factor, vectype);
345 }
346 }
347
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
350 {
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
353 &mask_producers))
354 return false;
355 }
356 }
357
358 /* TODO: Analyze cost. Decide if worth while to vectorize. */
359 if (dump_enabled_p ())
360 {
361 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
362 dump_dec (MSG_NOTE, vectorization_factor);
363 dump_printf (MSG_NOTE, "\n");
364 }
365
366 if (known_le (vectorization_factor, 1U))
367 {
368 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: unsupported data-type\n");
371 return false;
372 }
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374
375 for (i = 0; i < mask_producers.length (); i++)
376 {
377 stmt_info = mask_producers[i];
378 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
379 if (!mask_type)
380 return false;
381 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
382 }
383
384 return true;
385 }
386
387
388 /* Function vect_is_simple_iv_evolution.
389
390 FORNOW: A simple evolution of an induction variable in the loop is
391 considered a polynomial evolution. */
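/* An illustrative example (informal, not tied to a particular caller): for a
   counter that advances as "i += 4" on every iteration, scev describes the
   access function as the chrec {0, +, 4}_loop, so *INIT becomes 0 and *STEP
   becomes 4.  A step that is itself a chrec (a polynomial of degree >= 2) is
   rejected below.  */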
392
393 static bool
394 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
395 tree * step)
396 {
397 tree init_expr;
398 tree step_expr;
399 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
400 basic_block bb;
401
402 /* When there is no evolution in this loop, the evolution function
403 is not "simple". */
404 if (evolution_part == NULL_TREE)
405 return false;
406
407 /* When the evolution is a polynomial of degree >= 2
408 the evolution function is not "simple". */
409 if (tree_is_chrec (evolution_part))
410 return false;
411
412 step_expr = evolution_part;
413 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
414
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
417 step_expr, init_expr);
418
419 *init = init_expr;
420 *step = step_expr;
421
422 if (TREE_CODE (step_expr) != INTEGER_CST
423 && (TREE_CODE (step_expr) != SSA_NAME
424 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
425 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
426 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
427 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
428 || !flag_associative_math)))
429 && (TREE_CODE (step_expr) != REAL_CST
430 || !flag_associative_math))
431 {
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
434 "step unknown.\n");
435 return false;
436 }
437
438 return true;
439 }
440
441 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
442 what we are assuming is a double reduction. For example, given
443 a structure like this:
444
445 outer1:
446 x_1 = PHI <x_4(outer2), ...>;
447 ...
448
449 inner:
450 x_2 = PHI <x_1(outer1), ...>;
451 ...
452 x_3 = ...;
453 ...
454
455 outer2:
456 x_4 = PHI <x_3(inner)>;
457 ...
458
459 outer loop analysis would treat x_1 as a double reduction phi and
460 this function would then return true for x_2. */
461
462 static bool
463 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
464 {
465 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
466 use_operand_p use_p;
467 ssa_op_iter op_iter;
468 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
469 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
470 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
471 return true;
472 return false;
473 }
474
475 /* Function vect_analyze_scalar_cycles_1.
476
477 Examine the cross iteration def-use cycles of scalar variables
478 in LOOP. LOOP_VINFO represents the loop that is now being
479 considered for vectorization (can be LOOP, or an outer-loop
480 enclosing LOOP). */
481
482 static void
483 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
484 {
485 basic_block bb = loop->header;
486 tree init, step;
487 auto_vec<stmt_vec_info, 64> worklist;
488 gphi_iterator gsi;
489 bool double_reduc;
490
491 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
492
493 /* First - identify all inductions. Reduction detection assumes that all the
494 inductions have been identified, therefore, this order must not be
495 changed. */
496 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
497 {
498 gphi *phi = gsi.phi ();
499 tree access_fn = NULL;
500 tree def = PHI_RESULT (phi);
501 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
502
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
505
506 /* Skip virtual phi's. The data dependences that are associated with
507 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
508 if (virtual_operand_p (def))
509 continue;
510
511 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
512
513 /* Analyze the evolution function. */
514 access_fn = analyze_scalar_evolution (loop, def);
515 if (access_fn)
516 {
517 STRIP_NOPS (access_fn);
518 if (dump_enabled_p ())
519 dump_printf_loc (MSG_NOTE, vect_location,
520 "Access function of PHI: %T\n", access_fn);
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
525 }
526
527 if (!access_fn
528 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
529 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
530 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
531 && TREE_CODE (step) != INTEGER_CST))
532 {
533 worklist.safe_push (stmt_vinfo);
534 continue;
535 }
536
537 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
538 != NULL_TREE);
539 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540
541 if (dump_enabled_p ())
542 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
543 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
544 }
545
546
547 /* Second - identify all reductions and nested cycles. */
548 while (worklist.length () > 0)
549 {
550 stmt_vec_info stmt_vinfo = worklist.pop ();
551 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
552 tree def = PHI_RESULT (phi);
553
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
556
557 gcc_assert (!virtual_operand_p (def)
558 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559
560 stmt_vec_info reduc_stmt_info
561 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
562 &double_reduc, false);
563 if (reduc_stmt_info)
564 {
565 if (double_reduc)
566 {
567 if (dump_enabled_p ())
568 dump_printf_loc (MSG_NOTE, vect_location,
569 "Detected double reduction.\n");
570
571 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
572 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
573 = vect_double_reduction_def;
574 }
575 else
576 {
577 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
578 {
579 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location,
581 "Detected vectorizable nested cycle.\n");
582
583 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
584 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
585 }
586 else
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_NOTE, vect_location,
590 "Detected reduction.\n");
591
592 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
593 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
594 /* Store the reduction cycles for possible vectorization in
595 loop-aware SLP if it was not detected as reduction
596 chain. */
597 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
598 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
599 (reduc_stmt_info);
600 }
601 }
602 }
603 else
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Unknown def-use cycle pattern.\n");
607 }
608 }
609
610
611 /* Function vect_analyze_scalar_cycles.
612
613 Examine the cross iteration def-use cycles of scalar variables, by
614 analyzing the loop-header PHIs of scalar variables. Classify each
615 cycle as one of the following: invariant, induction, reduction, unknown.
616 We do that for the loop represented by LOOP_VINFO, and also for its
617 inner-loop, if it exists.
618 Examples for scalar cycles:
619
620 Example1: reduction:
621
622 loop1:
623 for (i=0; i<N; i++)
624 sum += a[i];
625
626 Example2: induction:
627
628 loop2:
629 for (i=0; i<N; i++)
630 a[i] = i; */
631
632 static void
633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 {
635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636
637 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638
639 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
640 Reductions in such inner-loop therefore have different properties than
641 the reductions in the nest that gets vectorized:
642 1. When vectorized, they are executed in the same order as in the original
643 scalar loop, so we can't change the order of computation when
644 vectorizing them.
645 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
646 current checks are too strict. */
647
648 if (loop->inner)
649 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
650 }
651
652 /* Transfer group and reduction information from STMT_INFO to its
653 pattern stmt. */
654
655 static void
656 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
657 {
658 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
659 stmt_vec_info stmtp;
660 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
661 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
662 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
663 do
664 {
665 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
666 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
667 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
668 if (stmt_info)
669 REDUC_GROUP_NEXT_ELEMENT (stmtp)
670 = STMT_VINFO_RELATED_STMT (stmt_info);
671 }
672 while (stmt_info);
673 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
674 }
675
676 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677
678 static void
679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 {
681 stmt_vec_info first;
682 unsigned i;
683
684 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
685 if (STMT_VINFO_IN_PATTERN_P (first))
686 {
687 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
688 while (next)
689 {
690 if (! STMT_VINFO_IN_PATTERN_P (next))
691 break;
692 next = REDUC_GROUP_NEXT_ELEMENT (next);
693 }
694 /* If not all stmts in the chain are patterns, try to handle
695 the chain without patterns. */
696 if (! next)
697 {
698 vect_fixup_reduc_chain (first);
699 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
700 = STMT_VINFO_RELATED_STMT (first);
701 }
702 }
703 }
704
705 /* Function vect_get_loop_niters.
706
707 Determine how many iterations the loop is executed and place it
708 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
709 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
710 niter information holds in ASSUMPTIONS.
711
712 Return the loop exit condition. */
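/* Informal example: for "do { ... } while (++i < n)" entered with i == 0 and
   n > 0, the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is n.  */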
713
714
715 static gcond *
716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
717 tree *number_of_iterations, tree *number_of_iterationsm1)
718 {
719 edge exit = single_exit (loop);
720 struct tree_niter_desc niter_desc;
721 tree niter_assumptions, niter, may_be_zero;
722 gcond *cond = get_loop_exit_condition (loop);
723
724 *assumptions = boolean_true_node;
725 *number_of_iterationsm1 = chrec_dont_know;
726 *number_of_iterations = chrec_dont_know;
727 DUMP_VECT_SCOPE ("get_loop_niters");
728
729 if (!exit)
730 return cond;
731
732 niter = chrec_dont_know;
733 may_be_zero = NULL_TREE;
734 niter_assumptions = boolean_true_node;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const struct loop *const loop = (const struct loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 slp_unrolling_factor (1),
826 single_scalar_iteration_cost (0),
827 vectorizable (false),
828 can_fully_mask_p (true),
829 fully_masked_p (false),
830 peeling_for_gaps (false),
831 peeling_for_niter (false),
832 operands_swapped (false),
833 no_data_dependencies (false),
834 has_mask_store (false),
835 scalar_loop (NULL),
836 orig_loop_info (NULL)
837 {
838 /* CHECKME: We want to visit all BBs before their successors (except for
839 latch blocks, for which this assertion wouldn't hold). In the simple
840 case of the loop forms we allow, a dfs order of the BBs would be the same
841 as reversed postorder traversal, so we are safe. */
842
843 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
844 bbs, loop->num_nodes, loop);
845 gcc_assert (nbbs == loop->num_nodes);
846
847 for (unsigned int i = 0; i < nbbs; i++)
848 {
849 basic_block bb = bbs[i];
850 gimple_stmt_iterator si;
851
852 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
853 {
854 gimple *phi = gsi_stmt (si);
855 gimple_set_uid (phi, 0);
856 add_stmt (phi);
857 }
858
859 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
860 {
861 gimple *stmt = gsi_stmt (si);
862 gimple_set_uid (stmt, 0);
863 add_stmt (stmt);
864 }
865 }
866 }
867
868 /* Free all levels of MASKS. */
869
870 void
871 release_vec_loop_masks (vec_loop_masks *masks)
872 {
873 rgroup_masks *rgm;
874 unsigned int i;
875 FOR_EACH_VEC_ELT (*masks, i, rgm)
876 rgm->masks.release ();
877 masks->release ();
878 }
879
880 /* Free all memory used by the _loop_vec_info, as well as all the
881 stmt_vec_info structs of all the stmts in the loop. */
882
883 _loop_vec_info::~_loop_vec_info ()
884 {
885 int nbbs;
886 gimple_stmt_iterator si;
887 int j;
888
889 nbbs = loop->num_nodes;
890 for (j = 0; j < nbbs; j++)
891 {
892 basic_block bb = bbs[j];
893 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
894 {
895 gimple *stmt = gsi_stmt (si);
896
897 /* We may have broken canonical form by moving a constant
898 into RHS1 of a commutative op. Fix such occurrences. */
899 if (operands_swapped && is_gimple_assign (stmt))
900 {
901 enum tree_code code = gimple_assign_rhs_code (stmt);
902
903 if ((code == PLUS_EXPR
904 || code == POINTER_PLUS_EXPR
905 || code == MULT_EXPR)
906 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
907 swap_ssa_operands (stmt,
908 gimple_assign_rhs1_ptr (stmt),
909 gimple_assign_rhs2_ptr (stmt));
910 else if (code == COND_EXPR
911 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
912 {
913 tree cond_expr = gimple_assign_rhs1 (stmt);
914 enum tree_code cond_code = TREE_CODE (cond_expr);
915
916 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
917 {
918 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
919 0));
920 cond_code = invert_tree_comparison (cond_code,
921 honor_nans);
922 if (cond_code != ERROR_MARK)
923 {
924 TREE_SET_CODE (cond_expr, cond_code);
925 swap_ssa_operands (stmt,
926 gimple_assign_rhs2_ptr (stmt),
927 gimple_assign_rhs3_ptr (stmt));
928 }
929 }
930 }
931 }
932 gsi_next (&si);
933 }
934 }
935
936 free (bbs);
937
938 release_vec_loop_masks (&masks);
939 delete ivexpr_map;
940
941 loop->aux = NULL;
942 }
943
944 /* Return an invariant or register for EXPR and emit necessary
945 computations in the LOOP_VINFO loop preheader. */
946
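/* Informal example: if two callers both ask for the (hypothetical) expression
   "n_ * 4", the first call gimplifies it to a fresh SSA name computed on the
   preheader edge, and the second call returns that cached SSA name instead of
   emitting the computation again.  */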
947 tree
948 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 {
950 if (is_gimple_reg (expr)
951 || is_gimple_min_invariant (expr))
952 return expr;
953
954 if (! loop_vinfo->ivexpr_map)
955 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
956 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
957 if (! cached)
958 {
959 gimple_seq stmts = NULL;
960 cached = force_gimple_operand (unshare_expr (expr),
961 &stmts, true, NULL_TREE);
962 if (stmts)
963 {
964 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
965 gsi_insert_seq_on_edge_immediate (e, stmts);
966 }
967 }
968 return cached;
969 }
970
971 /* Return true if we can use CMP_TYPE as the comparison type to produce
972 all masks required to mask LOOP_VINFO. */
973
974 static bool
975 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 {
977 rgroup_masks *rgm;
978 unsigned int i;
979 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
980 if (rgm->mask_type != NULL_TREE
981 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
982 cmp_type, rgm->mask_type,
983 OPTIMIZE_FOR_SPEED))
984 return false;
985 return true;
986 }
987
988 /* Calculate the maximum number of scalars per iteration for every
989 rgroup in LOOP_VINFO. */
990
991 static unsigned int
992 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 {
994 unsigned int res = 1;
995 unsigned int i;
996 rgroup_masks *rgm;
997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
998 res = MAX (res, rgm->max_nscalars_per_iter);
999 return res;
1000 }
1001
1002 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1003 whether we can actually generate the masks required. Return true if so,
1004 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1005
1006 static bool
1007 vect_verify_full_masking (loop_vec_info loop_vinfo)
1008 {
1009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1010 unsigned int min_ni_width;
1011
1012 /* Use a normal loop if there are no statements that need masking.
1013 This only happens in rare degenerate cases: it means that the loop
1014 has no loads, no stores, and no live-out values. */
1015 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1016 return false;
1017
1018 /* Get the maximum number of iterations that is representable
1019 in the counter type. */
1020 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022
1023 /* Get a more refined estimate for the number of iterations. */
1024 widest_int max_back_edges;
1025 if (max_loop_iterations (loop, &max_back_edges))
1026 max_ni = wi::smin (max_ni, max_back_edges + 1);
1027
1028 /* Account for rgroup masks, in which each bit is replicated N times. */
1029 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1030
1031 /* Work out how many bits we need to represent the limit. */
1032 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
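  /* Illustrative only (made-up numbers): if the loop could run for up to
     2^32 iterations and the widest rgroup handles two scalars per iteration,
     max_ni is 2^33 and min_ni_width is 34, so a 32-bit comparison type will
     be skipped by the search below.  */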
1033
1034 /* Find a scalar mode for which WHILE_ULT is supported. */
1035 opt_scalar_int_mode cmp_mode_iter;
1036 tree cmp_type = NULL_TREE;
1037 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1038 {
1039 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1040 if (cmp_bits >= min_ni_width
1041 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1042 {
1043 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1044 if (this_type
1045 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1046 {
1047 /* Although we could stop as soon as we find a valid mode,
1048 it's often better to continue until we hit Pmode, since the
1049 operands to the WHILE are more likely to be reusable in
1050 address calculations. */
1051 cmp_type = this_type;
1052 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1053 break;
1054 }
1055 }
1056 }
1057
1058 if (!cmp_type)
1059 return false;
1060
1061 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1062 return true;
1063 }
1064
1065 /* Calculate the cost of one scalar iteration of the loop. */
1066 static void
1067 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1068 {
1069 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1070 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1071 int nbbs = loop->num_nodes, factor;
1072 int innerloop_iters, i;
1073
1074 /* Gather costs for statements in the scalar loop. */
1075
1076 /* FORNOW. */
1077 innerloop_iters = 1;
1078 if (loop->inner)
1079 innerloop_iters = 50; /* FIXME */
1080
1081 for (i = 0; i < nbbs; i++)
1082 {
1083 gimple_stmt_iterator si;
1084 basic_block bb = bbs[i];
1085
1086 if (bb->loop_father == loop->inner)
1087 factor = innerloop_iters;
1088 else
1089 factor = 1;
1090
1091 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1092 {
1093 gimple *stmt = gsi_stmt (si);
1094 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1095
1096 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1097 continue;
1098
1099 /* Skip stmts that are not vectorized inside the loop. */
1100 if (stmt_info
1101 && !STMT_VINFO_RELEVANT_P (stmt_info)
1102 && (!STMT_VINFO_LIVE_P (stmt_info)
1103 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1104 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1105 continue;
1106
1107 vect_cost_for_stmt kind;
1108 if (STMT_VINFO_DATA_REF (stmt_info))
1109 {
1110 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1111 kind = scalar_load;
1112 else
1113 kind = scalar_store;
1114 }
1115 else
1116 kind = scalar_stmt;
1117
1118 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1119 factor, kind, stmt_info, 0, vect_prologue);
1120 }
1121 }
1122
1123 /* Now accumulate cost. */
1124 void *target_cost_data = init_cost (loop);
1125 stmt_info_for_cost *si;
1126 int j;
1127 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1128 j, si)
1129 (void) add_stmt_cost (target_cost_data, si->count,
1130 si->kind, si->stmt_info, si->misalign,
1131 vect_body);
1132 unsigned dummy, body_cost = 0;
1133 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1134 destroy_cost_data (target_cost_data);
1135 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1136 }
1137
1138
1139 /* Function vect_analyze_loop_form_1.
1140
1141 Verify that certain CFG restrictions hold, including:
1142 - the loop has a pre-header
1143 - the loop has a single entry and exit
1144 - the loop exit condition is simple enough
1145 - the number of iterations can be analyzed, i.e., a countable loop. The
1146 niter could be analyzed under some assumptions. */
1147
1148 bool
1149 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1150 tree *assumptions, tree *number_of_iterationsm1,
1151 tree *number_of_iterations, gcond **inner_loop_cond)
1152 {
1153 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1154
1155 /* Different restrictions apply when we are considering an inner-most loop,
1156 vs. an outer (nested) loop.
1157 (FORNOW. May want to relax some of these restrictions in the future). */
1158
1159 if (!loop->inner)
1160 {
1161 /* Inner-most loop. We currently require that the number of BBs is
1162 exactly 2 (the header and latch). Vectorizable inner-most loops
1163 look like this:
1164
1165 (pre-header)
1166 |
1167 header <--------+
1168 | | |
1169 | +--> latch --+
1170 |
1171 (exit-bb) */
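      /* Informally, a simple counted loop such as
           for (i = 0; i < n; i++) a[i] = b[i] + c[i];
         typically reaches this point in exactly that shape: one header block
         holding the work and the exit test, plus an empty latch.  */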
1172
1173 if (loop->num_nodes != 2)
1174 {
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 "not vectorized: control flow in loop.\n");
1178 return false;
1179 }
1180
1181 if (empty_block_p (loop->header))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "not vectorized: empty loop.\n");
1186 return false;
1187 }
1188 }
1189 else
1190 {
1191 struct loop *innerloop = loop->inner;
1192 edge entryedge;
1193
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1197
1198 (pre-header)
1199 |
1200 header <---+
1201 | |
1202 inner-loop |
1203 | |
1204 tail ------+
1205 |
1206 (exit-bb)
1207
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1210
1211 if ((loop->inner)->inner || (loop->inner)->next)
1212 {
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "not vectorized: multiple nested loops.\n");
1216 return false;
1217 }
1218
1219 if (loop->num_nodes != 5)
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "not vectorized: control flow in loop.\n");
1224 return false;
1225 }
1226
1227 entryedge = loop_preheader_edge (innerloop);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1231 {
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 return false;
1236 }
1237
1238 /* Analyze the inner-loop. */
1239 tree inner_niterm1, inner_niter, inner_assumptions;
1240 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1241 &inner_assumptions, &inner_niterm1,
1242 &inner_niter, NULL)
1243 /* Don't support analyzing niter under assumptions for inner
1244 loop. */
1245 || !integer_onep (inner_assumptions))
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: Bad inner loop.\n");
1250 return false;
1251 }
1252
1253 if (!expr_invariant_in_loop_p (loop, inner_niter))
1254 {
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257 "not vectorized: inner-loop count not"
1258 " invariant.\n");
1259 return false;
1260 }
1261
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_NOTE, vect_location,
1264 "Considering outer-loop vectorization.\n");
1265 }
1266
1267 if (!single_exit (loop)
1268 || EDGE_COUNT (loop->header->preds) != 2)
1269 {
1270 if (dump_enabled_p ())
1271 {
1272 if (!single_exit (loop))
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274 "not vectorized: multiple exits.\n");
1275 else if (EDGE_COUNT (loop->header->preds) != 2)
1276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277 "not vectorized: too many incoming edges.\n");
1278 }
1279 return false;
1280 }
1281
1282 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1283 that the loop is represented as a do-while (with a proper if-guard
1284 before the loop if needed), where the loop header contains all the
1285 executable statements, and the latch is empty. */
1286 if (!empty_block_p (loop->latch)
1287 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1288 {
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: latch block not empty.\n");
1292 return false;
1293 }
1294
1295 /* Make sure the exit is not abnormal. */
1296 edge e = single_exit (loop);
1297 if (e->flags & EDGE_ABNORMAL)
1298 {
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "not vectorized: abnormal loop exit edge.\n");
1302 return false;
1303 }
1304
1305 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1306 number_of_iterationsm1);
1307 if (!*loop_cond)
1308 {
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "not vectorized: complicated exit condition.\n");
1312 return false;
1313 }
1314
1315 if (integer_zerop (*assumptions)
1316 || !*number_of_iterations
1317 || chrec_contains_undetermined (*number_of_iterations))
1318 {
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321 "not vectorized: number of iterations cannot be "
1322 "computed.\n");
1323 return false;
1324 }
1325
1326 if (integer_zerop (*number_of_iterations))
1327 {
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "not vectorized: number of iterations = 0.\n");
1331 return false;
1332 }
1333
1334 return true;
1335 }
1336
1337 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1338
1339 loop_vec_info
1340 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1341 {
1342 tree assumptions, number_of_iterations, number_of_iterationsm1;
1343 gcond *loop_cond, *inner_loop_cond = NULL;
1344
1345 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1346 &assumptions, &number_of_iterationsm1,
1347 &number_of_iterations, &inner_loop_cond))
1348 return NULL;
1349
1350 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1351 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1352 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1353 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1354 if (!integer_onep (assumptions))
1355 {
1356 /* We consider to vectorize this loop by versioning it under
1357 some assumptions. In order to do this, we need to clear
1358 existing information computed by scev and niter analyzer. */
1359 scev_reset_htab ();
1360 free_numbers_of_iterations_estimates (loop);
1361 /* Also set flag for this loop so that following scev and niter
1362 analysis are done under the assumptions. */
1363 loop_constraint_set (loop, LOOP_C_FINITE);
1364 /* Also record the assumptions for versioning. */
1365 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1366 }
1367
1368 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1369 {
1370 if (dump_enabled_p ())
1371 {
1372 dump_printf_loc (MSG_NOTE, vect_location,
1373 "Symbolic number of iterations is ");
1374 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1375 dump_printf (MSG_NOTE, "\n");
1376 }
1377 }
1378
1379 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1380 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1381 if (inner_loop_cond)
1382 {
1383 stmt_vec_info inner_loop_cond_info
1384 = loop_vinfo->lookup_stmt (inner_loop_cond);
1385 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1386 }
1387
1388 gcc_assert (!loop->aux);
1389 loop->aux = loop_vinfo;
1390 return loop_vinfo;
1391 }
1392
1393
1394
1395 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1396 statements, update the vectorization factor. */
1397
1398 static void
1399 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1400 {
1401 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1403 int nbbs = loop->num_nodes;
1404 poly_uint64 vectorization_factor;
1405 int i;
1406
1407 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1408
1409 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1410 gcc_assert (known_ne (vectorization_factor, 0U));
1411
1412 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1413 vectorization factor of the loop is the unrolling factor required by
1414 the SLP instances. If that unrolling factor is 1, we say that we
1415 perform pure SLP on the loop - cross-iteration parallelism is not
1416 exploited. */
1417 bool only_slp_in_loop = true;
1418 for (i = 0; i < nbbs; i++)
1419 {
1420 basic_block bb = bbs[i];
1421 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1422 gsi_next (&si))
1423 {
1424 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1425 stmt_info = vect_stmt_to_vectorize (stmt_info);
1426 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1427 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1428 && !PURE_SLP_STMT (stmt_info))
1429 /* STMT needs both SLP and loop-based vectorization. */
1430 only_slp_in_loop = false;
1431 }
1432 }
1433
1434 if (only_slp_in_loop)
1435 {
1436 dump_printf_loc (MSG_NOTE, vect_location,
1437 "Loop contains only SLP stmts\n");
1438 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1439 }
1440 else
1441 {
1442 dump_printf_loc (MSG_NOTE, vect_location,
1443 "Loop contains SLP and non-SLP stmts\n");
1444 /* Both the vectorization factor and unroll factor have the form
1445 current_vector_size * X for some rational X, so they must have
1446 a common multiple. */
1447 vectorization_factor
1448 = force_common_multiple (vectorization_factor,
1449 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
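      /* For example (hypothetical numbers): a loop-based vectorization factor
         of 4 combined with an SLP unrolling factor of 6 results in a combined
         factor of 12, their least common multiple.  */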
1450 }
1451
1452 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1453 if (dump_enabled_p ())
1454 {
1455 dump_printf_loc (MSG_NOTE, vect_location,
1456 "Updating vectorization factor to ");
1457 dump_dec (MSG_NOTE, vectorization_factor);
1458 dump_printf (MSG_NOTE, ".\n");
1459 }
1460 }
1461
1462 /* Return true if STMT_INFO describes a double reduction phi and if
1463 the other phi in the reduction is also relevant for vectorization.
1464 This rejects cases such as:
1465
1466 outer1:
1467 x_1 = PHI <x_3(outer2), ...>;
1468 ...
1469
1470 inner:
1471 x_2 = ...;
1472 ...
1473
1474 outer2:
1475 x_3 = PHI <x_2(inner)>;
1476
1477 if nothing in x_2 or elsewhere makes x_1 relevant. */
1478
1479 static bool
1480 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1481 {
1482 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1483 return false;
1484
1485 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1486 }
1487
1488 /* Function vect_analyze_loop_operations.
1489
1490 Scan the loop stmts and make sure they are all vectorizable. */
1491
1492 static bool
1493 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1494 {
1495 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1496 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1497 int nbbs = loop->num_nodes;
1498 int i;
1499 stmt_vec_info stmt_info;
1500 bool need_to_vectorize = false;
1501 bool ok;
1502
1503 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1504
1505 stmt_vector_for_cost cost_vec;
1506 cost_vec.create (2);
1507
1508 for (i = 0; i < nbbs; i++)
1509 {
1510 basic_block bb = bbs[i];
1511
1512 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1513 gsi_next (&si))
1514 {
1515 gphi *phi = si.phi ();
1516 ok = true;
1517
1518 stmt_info = loop_vinfo->lookup_stmt (phi);
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1521 if (virtual_operand_p (gimple_phi_result (phi)))
1522 continue;
1523
1524 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1525 (i.e., a phi in the tail of the outer-loop). */
1526 if (! is_loop_header_bb_p (bb))
1527 {
1528 /* FORNOW: we currently don't support the case that these phis
1529 are not used in the outer loop (unless it is a double reduction,
1530 i.e., this phi is vect_reduction_def), because this case
1531 requires us to actually do something here. */
1532 if (STMT_VINFO_LIVE_P (stmt_info)
1533 && !vect_active_double_reduction_p (stmt_info))
1534 {
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "Unsupported loop-closed phi in "
1538 "outer-loop.\n");
1539 return false;
1540 }
1541
1542 /* If PHI is used in the outer loop, we check that its operand
1543 is defined in the inner loop. */
1544 if (STMT_VINFO_RELEVANT_P (stmt_info))
1545 {
1546 tree phi_op;
1547
1548 if (gimple_phi_num_args (phi) != 1)
1549 return false;
1550
1551 phi_op = PHI_ARG_DEF (phi, 0);
1552 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1553 if (!op_def_info)
1554 return false;
1555
1556 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1557 && (STMT_VINFO_RELEVANT (op_def_info)
1558 != vect_used_in_outer_by_reduction))
1559 return false;
1560 }
1561
1562 continue;
1563 }
1564
1565 gcc_assert (stmt_info);
1566
1567 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1568 || STMT_VINFO_LIVE_P (stmt_info))
1569 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1570 {
1571 /* A scalar-dependence cycle that we don't support. */
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 "not vectorized: scalar dependence cycle.\n");
1575 return false;
1576 }
1577
1578 if (STMT_VINFO_RELEVANT_P (stmt_info))
1579 {
1580 need_to_vectorize = true;
1581 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1582 && ! PURE_SLP_STMT (stmt_info))
1583 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1584 &cost_vec);
1585 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1586 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1587 && ! PURE_SLP_STMT (stmt_info))
1588 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1589 &cost_vec);
1590 }
1591
1592 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1593 if (ok
1594 && STMT_VINFO_LIVE_P (stmt_info)
1595 && !PURE_SLP_STMT (stmt_info))
1596 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1597 &cost_vec);
1598
1599 if (!ok)
1600 {
1601 if (dump_enabled_p ())
1602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1603 "not vectorized: relevant phi not "
1604 "supported: %G", phi);
1605 return false;
1606 }
1607 }
1608
1609 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1610 gsi_next (&si))
1611 {
1612 gimple *stmt = gsi_stmt (si);
1613 if (!gimple_clobber_p (stmt)
1614 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1615 &need_to_vectorize,
1616 NULL, NULL, &cost_vec))
1617 return false;
1618 }
1619 } /* bbs */
1620
1621 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1622 cost_vec.release ();
1623
1624 /* All operations in the loop are either irrelevant (deal with loop
1625 control, or dead), or only used outside the loop and can be moved
1626 out of the loop (e.g. invariants, inductions). The loop can be
1627 optimized away by scalar optimizations. We're better off not
1628 touching this loop. */
1629 if (!need_to_vectorize)
1630 {
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_NOTE, vect_location,
1633 "All the computation can be taken out of the loop.\n");
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "not vectorized: redundant loop. no profit to "
1637 "vectorize.\n");
1638 return false;
1639 }
1640
1641 return true;
1642 }
1643
1644 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1645 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1646 definitely no, or -1 if it's worth retrying. */
1647
1648 static int
1649 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1650 {
1651 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1652 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1653
1654 /* Only fully-masked loops can have iteration counts less than the
1655 vectorization factor. */
1656 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1657 {
1658 HOST_WIDE_INT max_niter;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1661 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1662 else
1663 max_niter = max_stmt_executions_int (loop);
1664
1665 if (max_niter != -1
1666 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1667 {
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "not vectorized: iteration count smaller than "
1671 "vectorization factor.\n");
1672 return 0;
1673 }
1674 }
1675
1676 int min_profitable_iters, min_profitable_estimate;
1677 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1678 &min_profitable_estimate);
1679
1680 if (min_profitable_iters < 0)
1681 {
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1684 "not vectorized: vectorization not profitable.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687 "not vectorized: vector version will never be "
1688 "profitable.\n");
1689 return -1;
1690 }
1691
1692 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1693 * assumed_vf);
1694
1695 /* Use the cost model only if it is more conservative than user specified
1696 threshold. */
1697 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1698 min_profitable_iters);
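  /* Worked example (made-up numbers): with --param min-vect-loop-bound=2,
     assumed_vf == 4 and min_profitable_iters == 11, the threshold th becomes
     MAX (2 * 4, 11) == 11 iterations.  */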
1699
1700 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1701
1702 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1703 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1704 {
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: vectorization not profitable.\n");
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE, vect_location,
1710 "not vectorized: iteration count smaller than user "
1711 "specified loop bound parameter or minimum profitable "
1712 "iterations (whichever is more conservative).\n");
1713 return 0;
1714 }
1715
1716 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1717 if (estimated_niter == -1)
1718 estimated_niter = likely_max_stmt_executions_int (loop);
1719 if (estimated_niter != -1
1720 && ((unsigned HOST_WIDE_INT) estimated_niter
1721 < MAX (th, (unsigned) min_profitable_estimate)))
1722 {
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 "not vectorized: estimated iteration count too "
1726 "small.\n");
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "not vectorized: estimated iteration count smaller "
1730 "than specified loop bound parameter or minimum "
1731 "profitable iterations (whichever is more "
1732 "conservative).\n");
1733 return -1;
1734 }
1735
1736 return 1;
1737 }
1738
1739 static bool
1740 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1741 vec<data_reference_p> *datarefs,
1742 unsigned int *n_stmts)
1743 {
1744 *n_stmts = 0;
1745 for (unsigned i = 0; i < loop->num_nodes; i++)
1746 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1747 !gsi_end_p (gsi); gsi_next (&gsi))
1748 {
1749 gimple *stmt = gsi_stmt (gsi);
1750 if (is_gimple_debug (stmt))
1751 continue;
1752 ++(*n_stmts);
1753 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1754 {
1755 if (is_gimple_call (stmt) && loop->safelen)
1756 {
1757 tree fndecl = gimple_call_fndecl (stmt), op;
1758 if (fndecl != NULL_TREE)
1759 {
1760 cgraph_node *node = cgraph_node::get (fndecl);
1761 if (node != NULL && node->simd_clones != NULL)
1762 {
1763 unsigned int j, n = gimple_call_num_args (stmt);
1764 for (j = 0; j < n; j++)
1765 {
1766 op = gimple_call_arg (stmt, j);
1767 if (DECL_P (op)
1768 || (REFERENCE_CLASS_P (op)
1769 && get_base_address (op)))
1770 break;
1771 }
1772 op = gimple_call_lhs (stmt);
1773 /* Ignore #pragma omp declare simd functions
1774 if they don't have data references in the
1775 call stmt itself. */
1776 if (j == n
1777 && !(op
1778 && (DECL_P (op)
1779 || (REFERENCE_CLASS_P (op)
1780 && get_base_address (op)))))
1781 continue;
1782 }
1783 }
1784 }
1785 return false;
1786 }
1787 /* If dependence analysis will give up due to the limit on the
1788 number of datarefs, stop here and fail fatally. */
1789 if (datarefs->length ()
1790 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1791 return false;
1792 }
1793 return true;
1794 }
1795
1796 /* Function vect_analyze_loop_2.
1797
1798 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1799 for it. The different analyses will record information in the
1800 loop_vec_info struct. */
1801 static bool
1802 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1803 {
1804 bool ok;
1805 int res;
1806 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1807 poly_uint64 min_vf = 2;
1808
1809 /* The first group of checks is independent of the vector size. */
1810 fatal = true;
1811
1812 /* Find all data references in the loop (which correspond to vdefs/vuses)
1813 and analyze their evolution in the loop. */
1814
1815 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1816
1817 /* Gather the data references and count stmts in the loop. */
1818 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1819 {
1820 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1821 &LOOP_VINFO_DATAREFS (loop_vinfo),
1822 n_stmts))
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "not vectorized: loop contains function "
1827 "calls or data references that cannot "
1828 "be analyzed\n");
1829 return false;
1830 }
1831 loop_vinfo->shared->save_datarefs ();
1832 }
1833 else
1834 loop_vinfo->shared->check_datarefs ();
1835
1836 /* Analyze the data references and also adjust the minimal
1837 vectorization factor according to the loads and stores. */
1838
1839 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1840 if (!ok)
1841 {
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data references.\n");
1845 return false;
1846 }
1847
1848 /* Classify all cross-iteration scalar data-flow cycles.
1849 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1850 vect_analyze_scalar_cycles (loop_vinfo);
1851
1852 vect_pattern_recog (loop_vinfo);
1853
1854 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1855
1856 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1857 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1858
1859 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1860 if (!ok)
1861 {
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "bad data access.\n");
1865 return false;
1866 }
1867
1868 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1869
1870 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1871 if (!ok)
1872 {
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "unexpected pattern.\n");
1876 return false;
1877 }
1878
1879 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
1880 fatal = false;
1881
1882 /* Analyze data dependences between the data-refs in the loop
1883 and adjust the maximum vectorization factor according to
1884 the dependences.
1885 FORNOW: fail at the first data dependence that we encounter. */
1886
1887 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1888 if (!ok
1889 || (max_vf != MAX_VECTORIZATION_FACTOR
1890 && maybe_lt (max_vf, min_vf)))
1891 {
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894 "bad data dependence.\n");
1895 return false;
1896 }
1897 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1898
1899 ok = vect_determine_vectorization_factor (loop_vinfo);
1900 if (!ok)
1901 {
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "can't determine vectorization factor.\n");
1905 return false;
1906 }
1907 if (max_vf != MAX_VECTORIZATION_FACTOR
1908 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1909 {
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 "bad data dependence.\n");
1913 return false;
1914 }
1915
1916 /* Compute the scalar iteration cost. */
1917 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1918
1919 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1920 unsigned th;
1921
1922 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1923 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1924 if (!ok)
1925 return false;
1926
1927 /* If there are any SLP instances mark them as pure_slp. */
1928 bool slp = vect_make_slp_decision (loop_vinfo);
1929 if (slp)
1930 {
1931 /* Find stmts that need to be both vectorized and SLPed. */
1932 vect_detect_hybrid_slp (loop_vinfo);
1933
1934 /* Update the vectorization factor based on the SLP decision. */
1935 vect_update_vf_for_slp (loop_vinfo);
1936 }
1937
1938 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1939
1940 /* We don't expect to have to roll back to anything other than an empty
1941 set of rgroups. */
1942 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1943
1944 /* This is the point where we can re-start analysis with SLP forced off. */
1945 start_over:
1946
1947 /* Now the vectorization factor is final. */
1948 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1949 gcc_assert (known_ne (vectorization_factor, 0U));
1950
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1952 {
1953 dump_printf_loc (MSG_NOTE, vect_location,
1954 "vectorization_factor = ");
1955 dump_dec (MSG_NOTE, vectorization_factor);
1956 dump_printf (MSG_NOTE, ", niters = %wd\n",
1957 LOOP_VINFO_INT_NITERS (loop_vinfo));
1958 }
1959
1960 HOST_WIDE_INT max_niter
1961 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1962
1963 /* Analyze the alignment of the data-refs in the loop.
1964 Fail if a data reference is found that cannot be vectorized. */
1965
1966 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1967 if (!ok)
1968 {
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 "bad data alignment.\n");
1972 return false;
1973 }
1974
1975 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976 It is important to call pruning after vect_analyze_data_ref_accesses,
1977 since we use grouping information gathered by interleaving analysis. */
1978 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1979 if (!ok)
1980 return false;
1981
1982 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1983 vectorization. */
1984 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1985 {
1986 /* This pass will decide on using loop versioning and/or loop peeling in
1987 order to enhance the alignment of data references in the loop. */
1988 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1989 if (!ok)
1990 {
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1993 "bad data alignment.\n");
1994 return false;
1995 }
1996 }
1997
1998 if (slp)
1999 {
2000 /* Analyze operations in the SLP instances. Note this may
2001 remove unsupported SLP instances which makes the above
2002 SLP kind detection invalid. */
2003 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2004 vect_slp_analyze_operations (loop_vinfo);
2005 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2006 goto again;
2007 }
2008
2009 /* Scan all the remaining operations in the loop that are not subject
2010 to SLP and make sure they are vectorizable. */
2011 ok = vect_analyze_loop_operations (loop_vinfo);
2012 if (!ok)
2013 {
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2016 "bad operation or unsupported loop bound.\n");
2017 return false;
2018 }
2019
2020 /* Decide whether to use a fully-masked loop for this vectorization
2021 factor. */
2022 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2023 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2024 && vect_verify_full_masking (loop_vinfo));
2025 if (dump_enabled_p ())
2026 {
2027 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2028 dump_printf_loc (MSG_NOTE, vect_location,
2029 "using a fully-masked loop.\n");
2030 else
2031 dump_printf_loc (MSG_NOTE, vect_location,
2032 "not using a fully-masked loop.\n");
2033 }
2034
2035 /* If epilog loop is required because of data accesses with gaps,
2036 one additional iteration needs to be peeled. Check if there are
2037 enough iterations for vectorization. */
2038 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2039 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2040 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2041 {
2042 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2043 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2044
2045 if (known_lt (wi::to_widest (scalar_niters), vf))
2046 {
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "loop has no enough iterations to support"
2050 " peeling for gaps.\n");
2051 return false;
2052 }
2053 }
2054
2055 /* Check that the costings of the loop make vectorizing worthwhile. */
2056 res = vect_analyze_loop_costing (loop_vinfo);
2057 if (res < 0)
2058 goto again;
2059 if (!res)
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "Loop costings not worthwhile.\n");
2064 return false;
2065 }
2066
2067 /* Decide whether we need to create an epilogue loop to handle
2068 remaining scalar iterations. */
2069 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2070
2071 unsigned HOST_WIDE_INT const_vf;
2072 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2073 /* The main loop handles all iterations. */
2074 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2075 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2076 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2077 {
2078 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2079 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2080 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2081 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2082 }
2083 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2084 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2085 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2086 < (unsigned) exact_log2 (const_vf))
2087 /* In case of versioning, check if the maximum number of
2088 iterations is greater than th. If they are identical,
2089 the epilogue is unnecessary. */
2090 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2091 || ((unsigned HOST_WIDE_INT) max_niter
2092 > (th / const_vf) * const_vf))))
2093 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2094
2095 /* If an epilogue loop is required make sure we can create one. */
2096 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2097 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2098 {
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2101 if (!vect_can_advance_ivs_p (loop_vinfo)
2102 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2103 single_exit (LOOP_VINFO_LOOP
2104 (loop_vinfo))))
2105 {
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "not vectorized: can't create required "
2109 "epilog loop\n");
2110 goto again;
2111 }
2112 }
2113
2114 /* During peeling, we need to check whether the number of loop iterations
2115 is enough for both the peeled prolog loop and the vector loop. This
2116 check can be merged with the threshold check of loop versioning, so
2117 increase the threshold for this case if necessary. */
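  /* E.g. with a known misalignment peel of 3 iterations, a non-fully-masked
     VF of 8 and peeling for gaps, the check below ends up requiring at
     least 3 + 8 + 1 iterations before the vector path is taken.  */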
2118 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2119 {
2120 poly_uint64 niters_th = 0;
2121
2122 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2123 {
2124 /* Niters for peeled prolog loop. */
2125 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2126 {
2127 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2128 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2129 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2130 }
2131 else
2132 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2133 }
2134
2135 /* Niters for at least one iteration of vectorized loop. */
2136 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2137 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2138 /* One additional iteration because of peeling for gap. */
2139 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2140 niters_th += 1;
2141 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2142 }
2143
2144 gcc_assert (known_eq (vectorization_factor,
2145 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2146
2147 /* Ok to vectorize! */
2148 return true;
2149
2150 again:
2151 /* Try again with SLP forced off but if we didn't do any SLP there is
2152 no point in re-trying. */
2153 if (!slp)
2154 return false;
2155
2156 /* If there are reduction chains re-trying will fail anyway. */
2157 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2158 return false;
2159
2160 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2161 via interleaving or lane instructions. */
2162 slp_instance instance;
2163 slp_tree node;
2164 unsigned i, j;
2165 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2166 {
2167 stmt_vec_info vinfo;
2168 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2169 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2170 continue;
2171 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2172 unsigned int size = DR_GROUP_SIZE (vinfo);
2173 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2174 if (! vect_store_lanes_supported (vectype, size, false)
2175 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2176 && ! vect_grouped_store_supported (vectype, size))
2177 return false;
2178 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2179 {
2180 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2181 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2182 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2183 size = DR_GROUP_SIZE (vinfo);
2184 vectype = STMT_VINFO_VECTYPE (vinfo);
2185 if (! vect_load_lanes_supported (vectype, size, false)
2186 && ! vect_grouped_load_supported (vectype, single_element_p,
2187 size))
2188 return false;
2189 }
2190 }
2191
2192 if (dump_enabled_p ())
2193 dump_printf_loc (MSG_NOTE, vect_location,
2194 "re-trying with SLP disabled\n");
2195
2196 /* Roll back state appropriately. No SLP this time. */
2197 slp = false;
2198 /* Restore the vectorization factor as it was without SLP. */
2199 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2200 /* Free the SLP instances. */
2201 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2202 vect_free_slp_instance (instance, false);
2203 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2204 /* Reset SLP type to loop_vect on all stmts. */
2205 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2206 {
2207 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2208 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2209 !gsi_end_p (si); gsi_next (&si))
2210 {
2211 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2212 STMT_SLP_TYPE (stmt_info) = loop_vect;
2213 }
2214 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2215 !gsi_end_p (si); gsi_next (&si))
2216 {
2217 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2218 STMT_SLP_TYPE (stmt_info) = loop_vect;
2219 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2220 {
2221 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2222 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2223 STMT_SLP_TYPE (stmt_info) = loop_vect;
2224 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2225 !gsi_end_p (pi); gsi_next (&pi))
2226 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2227 = loop_vect;
2228 }
2229 }
2230 }
2231 /* Free optimized alias test DDRS. */
2232 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2233 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2234 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2235 /* Reset target cost data. */
2236 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2237 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2238 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2239 /* Reset accumulated rgroup information. */
2240 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2241 /* Reset assorted flags. */
2242 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2243 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2244 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2245 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2246 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2247
2248 goto start_over;
2249 }
2250
2251 /* Function vect_analyze_loop.
2252
2253 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2254 for it. The different analyses will record information in the
2255 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, epilogue must
2256 be vectorized. */
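/* The analysis below first uses the autodetected vector size and is then
   retried with each further size returned by
   targetm.vectorize.autovectorize_vector_sizes until one succeeds, a fatal
   failure is seen, or the candidate sizes run out.  */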
2257 loop_vec_info
2258 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2259 vec_info_shared *shared)
2260 {
2261 loop_vec_info loop_vinfo;
2262 auto_vector_sizes vector_sizes;
2263
2264 /* Autodetect first vector size we try. */
2265 current_vector_size = 0;
2266 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2267 unsigned int next_size = 0;
2268
2269 DUMP_VECT_SCOPE ("analyze_loop_nest");
2270
2271 if (loop_outer (loop)
2272 && loop_vec_info_for_loop (loop_outer (loop))
2273 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2274 {
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_NOTE, vect_location,
2277 "outer-loop already vectorized.\n");
2278 return NULL;
2279 }
2280
2281 if (!find_loop_nest (loop, &shared->loop_nest))
2282 {
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "not vectorized: loop nest containing two "
2286 "or more consecutive inner loops cannot be "
2287 "vectorized\n");
2288 return NULL;
2289 }
2290
2291 unsigned n_stmts = 0;
2292 poly_uint64 autodetected_vector_size = 0;
2293 while (1)
2294 {
2295 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2296 loop_vinfo = vect_analyze_loop_form (loop, shared);
2297 if (!loop_vinfo)
2298 {
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "bad loop form.\n");
2302 return NULL;
2303 }
2304
2305 bool fatal = false;
2306
2307 if (orig_loop_vinfo)
2308 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2309
2310 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2311 {
2312 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2313
2314 return loop_vinfo;
2315 }
2316
2317 delete loop_vinfo;
2318
2319 if (next_size == 0)
2320 autodetected_vector_size = current_vector_size;
2321
2322 if (next_size < vector_sizes.length ()
2323 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2324 next_size += 1;
2325
2326 if (fatal
2327 || next_size == vector_sizes.length ()
2328 || known_eq (current_vector_size, 0U))
2329 return NULL;
2330
2331 /* Try the next biggest vector size. */
2332 current_vector_size = vector_sizes[next_size++];
2333 if (dump_enabled_p ())
2334 {
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "***** Re-trying analysis with "
2337 "vector size ");
2338 dump_dec (MSG_NOTE, current_vector_size);
2339 dump_printf (MSG_NOTE, "\n");
2340 }
2341 }
2342 }
2343
2344 /* Return true if there is an in-order reduction function for CODE, storing
2345 it in *REDUC_FN if so. */
2346
2347 static bool
2348 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2349 {
2350 switch (code)
2351 {
2352 case PLUS_EXPR:
2353 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2354 return true;
2355
2356 default:
2357 return false;
2358 }
2359 }
2360
2361 /* Function reduction_fn_for_scalar_code
2362
2363 Input:
2364 CODE - tree_code of a reduction operation.
2365
2366 Output:
2367 REDUC_FN - the corresponding internal function to be used to reduce the
2368 vector of partial results into a single scalar result, or IFN_LAST
2369 if the operation is a supported reduction operation, but does not have
2370 such an internal function.
2371
2372 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2373
2374 static bool
2375 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2376 {
2377 switch (code)
2378 {
2379 case MAX_EXPR:
2380 *reduc_fn = IFN_REDUC_MAX;
2381 return true;
2382
2383 case MIN_EXPR:
2384 *reduc_fn = IFN_REDUC_MIN;
2385 return true;
2386
2387 case PLUS_EXPR:
2388 *reduc_fn = IFN_REDUC_PLUS;
2389 return true;
2390
2391 case BIT_AND_EXPR:
2392 *reduc_fn = IFN_REDUC_AND;
2393 return true;
2394
2395 case BIT_IOR_EXPR:
2396 *reduc_fn = IFN_REDUC_IOR;
2397 return true;
2398
2399 case BIT_XOR_EXPR:
2400 *reduc_fn = IFN_REDUC_XOR;
2401 return true;
2402
2403 case MULT_EXPR:
2404 case MINUS_EXPR:
2405 *reduc_fn = IFN_LAST;
2406 return true;
2407
2408 default:
2409 return false;
2410 }
2411 }
2412
2413 /* If there is a neutral value X such that SLP reduction NODE would not
2414 be affected by the introduction of additional X elements, return that X,
2415 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2416 is true if the SLP statements perform a single reduction, false if each
2417 statement performs an independent reduction. */
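/* For example, extra zero elements do not change the result of a PLUS_EXPR
   or BIT_IOR_EXPR reduction and extra one elements do not change a MULT_EXPR
   reduction, whereas MIN_EXPR/MAX_EXPR have no universal neutral value and
   can only reuse the single initial value of a reduction chain.  */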
2418
2419 static tree
2420 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2421 bool reduc_chain)
2422 {
2423 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2424 stmt_vec_info stmt_vinfo = stmts[0];
2425 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2426 tree scalar_type = TREE_TYPE (vector_type);
2427 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2428 gcc_assert (loop);
2429
2430 switch (code)
2431 {
2432 case WIDEN_SUM_EXPR:
2433 case DOT_PROD_EXPR:
2434 case SAD_EXPR:
2435 case PLUS_EXPR:
2436 case MINUS_EXPR:
2437 case BIT_IOR_EXPR:
2438 case BIT_XOR_EXPR:
2439 return build_zero_cst (scalar_type);
2440
2441 case MULT_EXPR:
2442 return build_one_cst (scalar_type);
2443
2444 case BIT_AND_EXPR:
2445 return build_all_ones_cst (scalar_type);
2446
2447 case MAX_EXPR:
2448 case MIN_EXPR:
2449 /* For MIN/MAX the initial values are neutral. A reduction chain
2450 has only a single initial value, so that value is neutral for
2451 all statements. */
2452 if (reduc_chain)
2453 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2454 loop_preheader_edge (loop));
2455 return NULL_TREE;
2456
2457 default:
2458 return NULL_TREE;
2459 }
2460 }
2461
2462 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2463 STMT is printed with a message MSG. */
2464
2465 static void
2466 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2467 {
2468 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2469 }
2470
2471 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2472 operation. Return true if the results of DEF_STMT_INFO are something
2473 that can be accumulated by such a reduction. */
2474
2475 static bool
2476 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2477 {
2478 return (is_gimple_assign (def_stmt_info->stmt)
2479 || is_gimple_call (def_stmt_info->stmt)
2480 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2481 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2482 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2483 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2484 }
2485
2486 /* Detect SLP reduction of the form:
2487
2488 #a1 = phi <a5, a0>
2489 a2 = operation (a1)
2490 a3 = operation (a2)
2491 a4 = operation (a3)
2492 a5 = operation (a4)
2493
2494 #a = phi <a5>
2495
2496 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2497 FIRST_STMT is the first reduction stmt in the chain
2498 (a2 = operation (a1)).
2499
2500 Return TRUE if a reduction chain was detected. */
2501
2502 static bool
2503 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2504 gimple *first_stmt)
2505 {
2506 struct loop *loop = (gimple_bb (phi))->loop_father;
2507 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2508 enum tree_code code;
2509 gimple *loop_use_stmt = NULL;
2510 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2511 tree lhs;
2512 imm_use_iterator imm_iter;
2513 use_operand_p use_p;
2514 int nloop_uses, size = 0, n_out_of_loop_uses;
2515 bool found = false;
2516
2517 if (loop != vect_loop)
2518 return false;
2519
2520 lhs = PHI_RESULT (phi);
2521 code = gimple_assign_rhs_code (first_stmt);
2522 while (1)
2523 {
2524 nloop_uses = 0;
2525 n_out_of_loop_uses = 0;
2526 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2527 {
2528 gimple *use_stmt = USE_STMT (use_p);
2529 if (is_gimple_debug (use_stmt))
2530 continue;
2531
2532 /* Check if we got back to the reduction phi. */
2533 if (use_stmt == phi)
2534 {
2535 loop_use_stmt = use_stmt;
2536 found = true;
2537 break;
2538 }
2539
2540 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2541 {
2542 loop_use_stmt = use_stmt;
2543 nloop_uses++;
2544 }
2545 else
2546 n_out_of_loop_uses++;
2547
2548 /* There can be either a single use in the loop or two uses in
2549 phi nodes. */
2550 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2551 return false;
2552 }
2553
2554 if (found)
2555 break;
2556
2557 /* We reached a statement with no loop uses. */
2558 if (nloop_uses == 0)
2559 return false;
2560
2561 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2562 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2563 return false;
2564
2565 if (!is_gimple_assign (loop_use_stmt)
2566 || code != gimple_assign_rhs_code (loop_use_stmt)
2567 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2568 return false;
2569
2570 /* Insert USE_STMT into reduction chain. */
2571 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2572 if (current_stmt_info)
2573 {
2574 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2575 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2576 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2577 }
2578 else
2579 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2580
2581 lhs = gimple_assign_lhs (loop_use_stmt);
2582 current_stmt_info = use_stmt_info;
2583 size++;
2584 }
2585
2586 if (!found || loop_use_stmt != phi || size < 2)
2587 return false;
2588
2589 /* Swap the operands, if needed, to make the reduction operand be the second
2590 operand. */
2591 lhs = PHI_RESULT (phi);
2592 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2593 while (next_stmt_info)
2594 {
2595 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2596 if (gimple_assign_rhs2 (next_stmt) == lhs)
2597 {
2598 tree op = gimple_assign_rhs1 (next_stmt);
2599 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2600
2601 /* Check that the other def is either defined in the loop
2602 ("vect_internal_def"), or it's an induction (defined by a
2603 loop-header phi-node). */
2604 if (def_stmt_info
2605 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2606 && vect_valid_reduction_input_p (def_stmt_info))
2607 {
2608 lhs = gimple_assign_lhs (next_stmt);
2609 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2610 continue;
2611 }
2612
2613 return false;
2614 }
2615 else
2616 {
2617 tree op = gimple_assign_rhs2 (next_stmt);
2618 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2619
2620 /* Check that the other def is either defined in the loop
2621 ("vect_internal_def"), or it's an induction (defined by a
2622 loop-header phi-node). */
2623 if (def_stmt_info
2624 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2625 && vect_valid_reduction_input_p (def_stmt_info))
2626 {
2627 if (dump_enabled_p ())
2628 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2629 next_stmt);
2630
2631 swap_ssa_operands (next_stmt,
2632 gimple_assign_rhs1_ptr (next_stmt),
2633 gimple_assign_rhs2_ptr (next_stmt));
2634 update_stmt (next_stmt);
2635
2636 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2637 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2638 }
2639 else
2640 return false;
2641 }
2642
2643 lhs = gimple_assign_lhs (next_stmt);
2644 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2645 }
2646
2647 /* Save the chain for further analysis in SLP detection. */
2648 stmt_vec_info first_stmt_info
2649 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2650 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2651 REDUC_GROUP_SIZE (first_stmt_info) = size;
2652
2653 return true;
2654 }
2655
2656 /* Return true if we need an in-order reduction for operation CODE
2657 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2658 overflow must wrap. */
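/* For example, a floating-point accumulation such as

       double s = 0.0;
       for (int i = 0; i < n; i++)
         s += a[i];

   cannot be reassociated without changing the rounding of intermediate
   results, so unless -fassociative-math is in effect it has to be
   vectorized as an in-order (fold-left) reduction.  */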
2659
2660 static bool
2661 needs_fold_left_reduction_p (tree type, tree_code code,
2662 bool need_wrapping_integral_overflow)
2663 {
2664 /* CHECKME: check for !flag_finite_math_only too? */
2665 if (SCALAR_FLOAT_TYPE_P (type))
2666 switch (code)
2667 {
2668 case MIN_EXPR:
2669 case MAX_EXPR:
2670 return false;
2671
2672 default:
2673 return !flag_associative_math;
2674 }
2675
2676 if (INTEGRAL_TYPE_P (type))
2677 {
2678 if (!operation_no_trapping_overflow (type, code))
2679 return true;
2680 if (need_wrapping_integral_overflow
2681 && !TYPE_OVERFLOW_WRAPS (type)
2682 && operation_can_overflow (code))
2683 return true;
2684 return false;
2685 }
2686
2687 if (SAT_FIXED_POINT_TYPE_P (type))
2688 return true;
2689
2690 return false;
2691 }
2692
2693 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2694 reduction operation CODE has a handled computation expression. */
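/* For example, with CODE == PLUS_EXPR the path

       sum_1 = PHI <sum_4(latch), sum_0(preheader)>
       sum_2 = sum_1 + a_5;
       sum_3 = sum_2 + b_6;
       sum_4 = sum_3 + c_7;

   is accepted: walking back from the latch argument sum_4 to the PHI result
   sum_1 meets only single-use PLUS_EXPR statements (for PLUS_EXPR,
   MINUS_EXPR statements are also tolerated as long as the running value
   does not end up negated overall).  */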
2695
2696 bool
2697 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2698 tree loop_arg, enum tree_code code)
2699 {
2700 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2701 auto_bitmap visited;
2702 tree lookfor = PHI_RESULT (phi);
2703 ssa_op_iter curri;
2704 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2705 while (USE_FROM_PTR (curr) != loop_arg)
2706 curr = op_iter_next_use (&curri);
2707 curri.i = curri.numops;
2708 do
2709 {
2710 path.safe_push (std::make_pair (curri, curr));
2711 tree use = USE_FROM_PTR (curr);
2712 if (use == lookfor)
2713 break;
2714 gimple *def = SSA_NAME_DEF_STMT (use);
2715 if (gimple_nop_p (def)
2716 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2717 {
2718 pop:
2719 do
2720 {
2721 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2722 curri = x.first;
2723 curr = x.second;
2724 do
2725 curr = op_iter_next_use (&curri);
2726 /* Skip already visited or non-SSA operands (from iterating
2727 over PHI args). */
2728 while (curr != NULL_USE_OPERAND_P
2729 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2730 || ! bitmap_set_bit (visited,
2731 SSA_NAME_VERSION
2732 (USE_FROM_PTR (curr)))));
2733 }
2734 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2735 if (curr == NULL_USE_OPERAND_P)
2736 break;
2737 }
2738 else
2739 {
2740 if (gimple_code (def) == GIMPLE_PHI)
2741 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2742 else
2743 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2744 while (curr != NULL_USE_OPERAND_P
2745 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2746 || ! bitmap_set_bit (visited,
2747 SSA_NAME_VERSION
2748 (USE_FROM_PTR (curr)))))
2749 curr = op_iter_next_use (&curri);
2750 if (curr == NULL_USE_OPERAND_P)
2751 goto pop;
2752 }
2753 }
2754 while (1);
2755 if (dump_file && (dump_flags & TDF_DETAILS))
2756 {
2757 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2758 unsigned i;
2759 std::pair<ssa_op_iter, use_operand_p> *x;
2760 FOR_EACH_VEC_ELT (path, i, x)
2761 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2762 dump_printf (MSG_NOTE, "\n");
2763 }
2764
2765 /* Check whether the reduction path detected is valid. */
2766 bool fail = path.length () == 0;
2767 bool neg = false;
2768 for (unsigned i = 1; i < path.length (); ++i)
2769 {
2770 gimple *use_stmt = USE_STMT (path[i].second);
2771 tree op = USE_FROM_PTR (path[i].second);
2772 if (! has_single_use (op)
2773 || ! is_gimple_assign (use_stmt))
2774 {
2775 fail = true;
2776 break;
2777 }
2778 if (gimple_assign_rhs_code (use_stmt) != code)
2779 {
2780 if (code == PLUS_EXPR
2781 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2782 {
2783 /* Track whether we negate the reduction value each iteration. */
2784 if (gimple_assign_rhs2 (use_stmt) == op)
2785 neg = ! neg;
2786 }
2787 else
2788 {
2789 fail = true;
2790 break;
2791 }
2792 }
2793 }
2794 return ! fail && ! neg;
2795 }
2796
2797
2798 /* Function vect_is_simple_reduction
2799
2800 (1) Detect a cross-iteration def-use cycle that represents a simple
2801 reduction computation. We look for the following pattern:
2802
2803 loop_header:
2804 a1 = phi < a0, a2 >
2805 a3 = ...
2806 a2 = operation (a3, a1)
2807
2808 or
2809
2810 a3 = ...
2811 loop_header:
2812 a1 = phi < a0, a2 >
2813 a2 = operation (a3, a1)
2814
2815 such that:
2816 1. operation is commutative and associative and it is safe to
2817 change the order of the computation
2818 2. no uses for a2 in the loop (a2 is used out of the loop)
2819 3. no uses of a1 in the loop besides the reduction operation
2820 4. no uses of a1 outside the loop.
2821
2822 Conditions 1,4 are tested here.
2823 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2824
2825 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2826 nested cycles.
2827
2828 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2829 reductions:
2830
2831 a1 = phi < a0, a2 >
2832 inner loop (def of a3)
2833 a2 = phi < a3 >
2834
2835 (4) Detect condition expressions, i.e.:
2836 for (int i = 0; i < N; i++)
2837 if (a[i] < val)
2838 ret_val = a[i];
2839
2840 */
2841
2842 static stmt_vec_info
2843 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2844 bool *double_reduc,
2845 bool need_wrapping_integral_overflow,
2846 enum vect_reduction_type *v_reduc_type)
2847 {
2848 gphi *phi = as_a <gphi *> (phi_info->stmt);
2849 struct loop *loop = (gimple_bb (phi))->loop_father;
2850 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2851 gimple *phi_use_stmt = NULL;
2852 enum tree_code orig_code, code;
2853 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2854 tree type;
2855 int nloop_uses;
2856 tree name;
2857 imm_use_iterator imm_iter;
2858 use_operand_p use_p;
2859 bool phi_def;
2860
2861 *double_reduc = false;
2862 *v_reduc_type = TREE_CODE_REDUCTION;
2863
2864 tree phi_name = PHI_RESULT (phi);
2865 /* ??? If there are no uses of the PHI result the inner loop reduction
2866 won't be detected as possibly double-reduction by vectorizable_reduction
2867 because that tries to walk the PHI arg from the preheader edge which
2868 can be constant. See PR60382. */
2869 if (has_zero_uses (phi_name))
2870 return NULL;
2871 nloop_uses = 0;
2872 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2873 {
2874 gimple *use_stmt = USE_STMT (use_p);
2875 if (is_gimple_debug (use_stmt))
2876 continue;
2877
2878 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2879 {
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882 "intermediate value used outside loop.\n");
2883
2884 return NULL;
2885 }
2886
2887 nloop_uses++;
2888 if (nloop_uses > 1)
2889 {
2890 if (dump_enabled_p ())
2891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2892 "reduction value used in loop.\n");
2893 return NULL;
2894 }
2895
2896 phi_use_stmt = use_stmt;
2897 }
2898
2899 edge latch_e = loop_latch_edge (loop);
2900 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2901 if (TREE_CODE (loop_arg) != SSA_NAME)
2902 {
2903 if (dump_enabled_p ())
2904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2905 "reduction: not ssa_name: %T\n", loop_arg);
2906 return NULL;
2907 }
2908
2909 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2910 if (!def_stmt_info
2911 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2912 return NULL;
2913
2914 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2915 {
2916 name = gimple_assign_lhs (def_stmt);
2917 phi_def = false;
2918 }
2919 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2920 {
2921 name = PHI_RESULT (def_stmt);
2922 phi_def = true;
2923 }
2924 else
2925 {
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2928 "reduction: unhandled reduction operation: %G",
2929 def_stmt_info->stmt);
2930 return NULL;
2931 }
2932
2933 nloop_uses = 0;
2934 auto_vec<gphi *, 3> lcphis;
2935 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2936 {
2937 gimple *use_stmt = USE_STMT (use_p);
2938 if (is_gimple_debug (use_stmt))
2939 continue;
2940 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2941 nloop_uses++;
2942 else
2943 /* We can have more than one loop-closed PHI. */
2944 lcphis.safe_push (as_a <gphi *> (use_stmt));
2945 if (nloop_uses > 1)
2946 {
2947 if (dump_enabled_p ())
2948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2949 "reduction used in loop.\n");
2950 return NULL;
2951 }
2952 }
2953
2954 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2955 defined in the inner loop. */
2956 if (phi_def)
2957 {
2958 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2959 op1 = PHI_ARG_DEF (def_stmt, 0);
2960
2961 if (gimple_phi_num_args (def_stmt) != 1
2962 || TREE_CODE (op1) != SSA_NAME)
2963 {
2964 if (dump_enabled_p ())
2965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2966 "unsupported phi node definition.\n");
2967
2968 return NULL;
2969 }
2970
2971 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2972 if (gimple_bb (def1)
2973 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2974 && loop->inner
2975 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2976 && is_gimple_assign (def1)
2977 && is_a <gphi *> (phi_use_stmt)
2978 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2979 {
2980 if (dump_enabled_p ())
2981 report_vect_op (MSG_NOTE, def_stmt,
2982 "detected double reduction: ");
2983
2984 *double_reduc = true;
2985 return def_stmt_info;
2986 }
2987
2988 return NULL;
2989 }
2990
2991 /* If we are vectorizing an inner reduction we are executing that
2992 in the original order only in case we are not dealing with a
2993 double reduction. */
2994 bool check_reduction = true;
2995 if (flow_loop_nested_p (vect_loop, loop))
2996 {
2997 gphi *lcphi;
2998 unsigned i;
2999 check_reduction = false;
3000 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3001 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3002 {
3003 gimple *use_stmt = USE_STMT (use_p);
3004 if (is_gimple_debug (use_stmt))
3005 continue;
3006 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3007 check_reduction = true;
3008 }
3009 }
3010
3011 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3012 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3013 code = orig_code = gimple_assign_rhs_code (def_stmt);
3014
3015 /* We can handle "res -= x[i]", which is non-associative, by
3016 simply rewriting it into "res += -x[i]". Avoid changing the
3017 gimple instruction for the first simple tests and only do this
3018 if we're allowed to change the code at all. */
3019 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3020 code = PLUS_EXPR;
3021
3022 if (code == COND_EXPR)
3023 {
3024 if (! nested_in_vect_loop)
3025 *v_reduc_type = COND_REDUCTION;
3026
3027 op3 = gimple_assign_rhs1 (def_stmt);
3028 if (COMPARISON_CLASS_P (op3))
3029 {
3030 op4 = TREE_OPERAND (op3, 1);
3031 op3 = TREE_OPERAND (op3, 0);
3032 }
3033 if (op3 == phi_name || op4 == phi_name)
3034 {
3035 if (dump_enabled_p ())
3036 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3037 "reduction: condition depends on previous"
3038 " iteration: ");
3039 return NULL;
3040 }
3041
3042 op1 = gimple_assign_rhs2 (def_stmt);
3043 op2 = gimple_assign_rhs3 (def_stmt);
3044 }
3045 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3046 {
3047 if (dump_enabled_p ())
3048 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3049 "reduction: not commutative/associative: ");
3050 return NULL;
3051 }
3052 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3053 {
3054 op1 = gimple_assign_rhs1 (def_stmt);
3055 op2 = gimple_assign_rhs2 (def_stmt);
3056 }
3057 else
3058 {
3059 if (dump_enabled_p ())
3060 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3061 "reduction: not handled operation: ");
3062 return NULL;
3063 }
3064
3065 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3066 {
3067 if (dump_enabled_p ())
3068 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069 "reduction: both uses not ssa_names: ");
3070
3071 return NULL;
3072 }
3073
3074 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3075 if ((TREE_CODE (op1) == SSA_NAME
3076 && !types_compatible_p (type,TREE_TYPE (op1)))
3077 || (TREE_CODE (op2) == SSA_NAME
3078 && !types_compatible_p (type, TREE_TYPE (op2)))
3079 || (op3 && TREE_CODE (op3) == SSA_NAME
3080 && !types_compatible_p (type, TREE_TYPE (op3)))
3081 || (op4 && TREE_CODE (op4) == SSA_NAME
3082 && !types_compatible_p (type, TREE_TYPE (op4))))
3083 {
3084 if (dump_enabled_p ())
3085 {
3086 dump_printf_loc (MSG_NOTE, vect_location,
3087 "reduction: multiple types: operation type: "
3088 "%T, operands types: %T,%T",
3089 type, TREE_TYPE (op1), TREE_TYPE (op2));
3090 if (op3)
3091 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3092
3093 if (op4)
3094 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3095 dump_printf (MSG_NOTE, "\n");
3096 }
3097
3098 return NULL;
3099 }
3100
3101 /* Check whether it's ok to change the order of the computation.
3102 Generally, when vectorizing a reduction we change the order of the
3103 computation. This may change the behavior of the program in some
3104 cases, so we need to check that this is ok. One exception is when
3105 vectorizing an outer-loop: the inner-loop is executed sequentially,
3106 and therefore vectorizing reductions in the inner-loop during
3107 outer-loop vectorization is safe. */
3108 if (check_reduction
3109 && *v_reduc_type == TREE_CODE_REDUCTION
3110 && needs_fold_left_reduction_p (type, code,
3111 need_wrapping_integral_overflow))
3112 *v_reduc_type = FOLD_LEFT_REDUCTION;
3113
3114 /* Reduction is safe. We're dealing with one of the following:
3115 1) integer arithmetic and no trapv
3116 2) floating point arithmetic, and special flags permit this optimization
3117 3) nested cycle (i.e., outer loop vectorization). */
3118 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3119 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3120 if (code != COND_EXPR && !def1_info && !def2_info)
3121 {
3122 if (dump_enabled_p ())
3123 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3124 return NULL;
3125 }
3126
3127 /* Check that one def is the reduction def, defined by PHI,
3128 the other def is either defined in the loop ("vect_internal_def"),
3129 or it's an induction (defined by a loop-header phi-node). */
3130
3131 if (def2_info
3132 && def2_info->stmt == phi
3133 && (code == COND_EXPR
3134 || !def1_info
3135 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3136 || vect_valid_reduction_input_p (def1_info)))
3137 {
3138 if (dump_enabled_p ())
3139 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3140 return def_stmt_info;
3141 }
3142
3143 if (def1_info
3144 && def1_info->stmt == phi
3145 && (code == COND_EXPR
3146 || !def2_info
3147 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3148 || vect_valid_reduction_input_p (def2_info)))
3149 {
3150 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3151 {
3152 /* Check if we can swap operands (just for simplicity - so that
3153 the rest of the code can assume that the reduction variable
3154 is always the last (second) argument). */
3155 if (code == COND_EXPR)
3156 {
3157 /* Swap cond_expr by inverting the condition. */
3158 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3159 enum tree_code invert_code = ERROR_MARK;
3160 enum tree_code cond_code = TREE_CODE (cond_expr);
3161
3162 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3163 {
3164 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3165 invert_code = invert_tree_comparison (cond_code, honor_nans);
3166 }
3167 if (invert_code != ERROR_MARK)
3168 {
3169 TREE_SET_CODE (cond_expr, invert_code);
3170 swap_ssa_operands (def_stmt,
3171 gimple_assign_rhs2_ptr (def_stmt),
3172 gimple_assign_rhs3_ptr (def_stmt));
3173 }
3174 else
3175 {
3176 if (dump_enabled_p ())
3177 report_vect_op (MSG_NOTE, def_stmt,
3178 "detected reduction: cannot swap operands "
3179 "for cond_expr");
3180 return NULL;
3181 }
3182 }
3183 else
3184 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3185 gimple_assign_rhs2_ptr (def_stmt));
3186
3187 if (dump_enabled_p ())
3188 report_vect_op (MSG_NOTE, def_stmt,
3189 "detected reduction: need to swap operands: ");
3190
3191 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3192 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3193 }
3194 else
3195 {
3196 if (dump_enabled_p ())
3197 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3198 }
3199
3200 return def_stmt_info;
3201 }
3202
3203 /* Try to find SLP reduction chain. */
3204 if (! nested_in_vect_loop
3205 && code != COND_EXPR
3206 && orig_code != MINUS_EXPR
3207 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3208 {
3209 if (dump_enabled_p ())
3210 report_vect_op (MSG_NOTE, def_stmt,
3211 "reduction: detected reduction chain: ");
3212
3213 return def_stmt_info;
3214 }
3215
3216 /* Dissolve any group left half-built by vect_is_slp_reduction. */
3217 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3218 while (first)
3219 {
3220 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3221 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3222 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3223 first = next;
3224 }
3225
3226 /* Look for the expression computing loop_arg from loop PHI result. */
3227 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3228 return def_stmt_info;
3229
3230 if (dump_enabled_p ())
3231 {
3232 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3233 "reduction: unknown pattern: ");
3234 }
3235
3236 return NULL;
3237 }
3238
3239 /* Wrapper around vect_is_simple_reduction, which will modify code
3240 in-place if it enables detection of more reductions. Arguments
3241 as there. */
3242
3243 stmt_vec_info
3244 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3245 bool *double_reduc,
3246 bool need_wrapping_integral_overflow)
3247 {
3248 enum vect_reduction_type v_reduc_type;
3249 stmt_vec_info def_info
3250 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3251 need_wrapping_integral_overflow,
3252 &v_reduc_type);
3253 if (def_info)
3254 {
3255 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3256 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3257 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3258 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3259 }
3260 return def_info;
3261 }
3262
3263 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
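/* The prologue statement costs are recorded in *PROLOGUE_COST_VEC and the
   epilogue statement costs in *EPILOGUE_COST_VEC, both scaled from
   *SCALAR_COST_VEC.  *PEEL_ITERS_EPILOGUE is set to
   (niters - prologue) % vf when the iteration count is known and assumed
   to be vf/2 otherwise.  */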
3264 int
3265 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3266 int *peel_iters_epilogue,
3267 stmt_vector_for_cost *scalar_cost_vec,
3268 stmt_vector_for_cost *prologue_cost_vec,
3269 stmt_vector_for_cost *epilogue_cost_vec)
3270 {
3271 int retval = 0;
3272 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3273
3274 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3275 {
3276 *peel_iters_epilogue = assumed_vf / 2;
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_NOTE, vect_location,
3279 "cost model: epilogue peel iters set to vf/2 "
3280 "because loop iterations are unknown .\n");
3281
3282 /* If peeled iterations are known but the number of scalar loop
3283 iterations is unknown, count a taken branch per peeled loop. */
3284 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3285 NULL, 0, vect_prologue);
3286 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3287 NULL, 0, vect_epilogue);
3288 }
3289 else
3290 {
3291 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3292 peel_iters_prologue = niters < peel_iters_prologue ?
3293 niters : peel_iters_prologue;
3294 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3295 /* If we need to peel for gaps but the computed epilogue peel count
3296 is zero, we have to peel VF iterations. */
3297 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3298 *peel_iters_epilogue = assumed_vf;
3299 }
3300
3301 stmt_info_for_cost *si;
3302 int j;
3303 if (peel_iters_prologue)
3304 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3305 retval += record_stmt_cost (prologue_cost_vec,
3306 si->count * peel_iters_prologue,
3307 si->kind, si->stmt_info, si->misalign,
3308 vect_prologue);
3309 if (*peel_iters_epilogue)
3310 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3311 retval += record_stmt_cost (epilogue_cost_vec,
3312 si->count * *peel_iters_epilogue,
3313 si->kind, si->stmt_info, si->misalign,
3314 vect_epilogue);
3315
3316 return retval;
3317 }
3318
3319 /* Function vect_estimate_min_profitable_iters
3320
3321 Return the number of iterations required for the vector version of the
3322 loop to be profitable relative to the cost of the scalar version of the
3323 loop.
3324
3325 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3326 of iterations for vectorization. A value of -1 means loop
3327 vectorization is not profitable. This returned value may be used
3328 for a dynamic profitability check.
3329
3330 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3331 for a static check against the estimated number of iterations. */
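/* Roughly speaking (ignoring the prologue/epilogue peel adjustments made
   below), vectorization is profitable once

       niters * SIC + SOC  >  (niters / VF) * VIC + VOC

   where SIC and VIC are the scalar and vector costs of one iteration and
   SOC and VOC the scalar and vector outside-of-loop costs; the thresholds
   computed here are the smallest iteration counts for which this is
   expected to hold.  */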
3332
3333 static void
3334 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3335 int *ret_min_profitable_niters,
3336 int *ret_min_profitable_estimate)
3337 {
3338 int min_profitable_iters;
3339 int min_profitable_estimate;
3340 int peel_iters_prologue;
3341 int peel_iters_epilogue;
3342 unsigned vec_inside_cost = 0;
3343 int vec_outside_cost = 0;
3344 unsigned vec_prologue_cost = 0;
3345 unsigned vec_epilogue_cost = 0;
3346 int scalar_single_iter_cost = 0;
3347 int scalar_outside_cost = 0;
3348 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3349 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3350 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3351
3352 /* Cost model disabled. */
3353 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3354 {
3355 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3356 *ret_min_profitable_niters = 0;
3357 *ret_min_profitable_estimate = 0;
3358 return;
3359 }
3360
3361 /* Requires loop versioning tests to handle misalignment. */
3362 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3363 {
3364 /* FIXME: Make cost depend on complexity of individual check. */
3365 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3366 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3367 vect_prologue);
3368 dump_printf (MSG_NOTE,
3369 "cost model: Adding cost of checks for loop "
3370 "versioning to treat misalignment.\n");
3371 }
3372
3373 /* Requires loop versioning with alias checks. */
3374 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3375 {
3376 /* FIXME: Make cost depend on complexity of individual check. */
3377 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3378 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3379 vect_prologue);
3380 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3381 if (len)
3382 /* Count LEN - 1 ANDs and LEN comparisons. */
3383 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3384 NULL, 0, vect_prologue);
3385 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3386 if (len)
3387 {
3388 /* Count LEN - 1 ANDs and LEN comparisons. */
3389 unsigned int nstmts = len * 2 - 1;
3390 /* +1 for each bias that needs adding. */
3391 for (unsigned int i = 0; i < len; ++i)
3392 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3393 nstmts += 1;
3394 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 }
3397 dump_printf (MSG_NOTE,
3398 "cost model: Adding cost of checks for loop "
3399 "versioning aliasing.\n");
3400 }
3401
3402 /* Requires loop versioning with niter checks. */
3403 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404 {
3405 /* FIXME: Make cost depend on complexity of individual check. */
3406 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3407 vect_prologue);
3408 dump_printf (MSG_NOTE,
3409 "cost model: Adding cost of checks for loop "
3410 "versioning niters.\n");
3411 }
3412
3413 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3414 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3415 vect_prologue);
3416
3417 /* Count statements in scalar loop. Using this as scalar cost for a single
3418 iteration for now.
3419
3420 TODO: Add outer loop support.
3421
3422 TODO: Consider assigning different costs to different scalar
3423 statements. */
3424
3425 scalar_single_iter_cost
3426 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427
3428 /* Add additional cost for the peeled instructions in prologue and epilogue
3429 loop. (For fully-masked loops there will be no peeling.)
3430
3431 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3432 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3433
3434 TODO: Build an expression that represents peel_iters for prologue and
3435 epilogue to be used in a run-time test. */
3436
3437 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3438 {
3439 peel_iters_prologue = 0;
3440 peel_iters_epilogue = 0;
3441
3442 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3443 {
3444 /* We need to peel exactly one iteration. */
3445 peel_iters_epilogue += 1;
3446 stmt_info_for_cost *si;
3447 int j;
3448 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3449 j, si)
3450 (void) add_stmt_cost (target_cost_data, si->count,
3451 si->kind, si->stmt_info, si->misalign,
3452 vect_epilogue);
3453 }
3454 }
3455 else if (npeel < 0)
3456 {
3457 peel_iters_prologue = assumed_vf / 2;
3458 dump_printf (MSG_NOTE, "cost model: "
3459 "prologue peel iters set to vf/2.\n");
3460
3461 /* If peeling for alignment is unknown, loop bound of main loop becomes
3462 unknown. */
3463 peel_iters_epilogue = assumed_vf / 2;
3464 dump_printf (MSG_NOTE, "cost model: "
3465 "epilogue peel iters set to vf/2 because "
3466 "peeling for alignment is unknown.\n");
3467
3468 /* If peeled iterations are unknown, count a taken branch and a not taken
3469 branch per peeled loop. Even if scalar loop iterations are known,
3470 vector iterations are not known since peeled prologue iterations are
3471 not known. Hence guards remain the same. */
3472 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3473 NULL, 0, vect_prologue);
3474 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3475 NULL, 0, vect_prologue);
3476 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3477 NULL, 0, vect_epilogue);
3478 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3479 NULL, 0, vect_epilogue);
3480 stmt_info_for_cost *si;
3481 int j;
3482 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3483 {
3484 (void) add_stmt_cost (target_cost_data,
3485 si->count * peel_iters_prologue,
3486 si->kind, si->stmt_info, si->misalign,
3487 vect_prologue);
3488 (void) add_stmt_cost (target_cost_data,
3489 si->count * peel_iters_epilogue,
3490 si->kind, si->stmt_info, si->misalign,
3491 vect_epilogue);
3492 }
3493 }
3494 else
3495 {
3496 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3497 stmt_info_for_cost *si;
3498 int j;
3499 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3500
3501 prologue_cost_vec.create (2);
3502 epilogue_cost_vec.create (2);
3503 peel_iters_prologue = npeel;
3504
3505 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3506 &peel_iters_epilogue,
3507 &LOOP_VINFO_SCALAR_ITERATION_COST
3508 (loop_vinfo),
3509 &prologue_cost_vec,
3510 &epilogue_cost_vec);
3511
3512 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3513 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3514 si->misalign, vect_prologue);
3515
3516 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3517 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3518 si->misalign, vect_epilogue);
3519
3520 prologue_cost_vec.release ();
3521 epilogue_cost_vec.release ();
3522 }
3523
3524 /* FORNOW: The scalar outside cost is incremented in one of the
3525 following ways:
3526
3527 1. The vectorizer checks for alignment and aliasing and generates
3528 a condition that allows dynamic vectorization. A cost model
3529 check is ANDed with the versioning condition. Hence the scalar code
3530 path now has the added cost of the versioning check.
3531
3532 if (cost > th & versioning_check)
3533 jmp to vector code
3534
3535 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3536
3537 2. The vectorizer then checks if a prologue is required. If the
3538 cost model check was not done before during versioning, it has to
3539 be done before the prologue check.
3540
3541 if (cost <= th)
3542 prologue = scalar_iters
3543 if (prologue == 0)
3544 jmp to vector code
3545 else
3546 execute prologue
3547 if (prologue == num_iters)
3548 go to exit
3549
3550 Hence the run-time scalar cost is incremented by a taken branch,
3551 plus a not-taken branch, plus a taken branch cost.
3552
3553 3. The vectorizer then checks if an epilogue is required. If the
3554 cost model check was not done before during prologue check, it
3555 has to be done with the epilogue check.
3556
3557 if (prologue == 0)
3558 jmp to vector code
3559 else
3560 execute prologue
3561 if (prologue == num_iters)
3562 go to exit
3563 vector code:
3564 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3565 jmp to epilogue
3566
3567 Hence the run-time scalar cost should be incremented by 2 taken
3568 branches.
3569
3570 TODO: The back end may reorder the BBs differently and reverse
3571 conditions/branch directions. Change the estimates below to
3572 something more reasonable. */
3573
3574 /* If the number of iterations is known and we do not do versioning, we can
3575 decide whether to vectorize at compile time. Hence the scalar version
3576 does not carry cost model guard costs. */
3577 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3578 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3579 {
3580 /* Cost model check occurs at versioning. */
3581 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3582 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3583 else
3584 {
3585 /* Cost model check occurs at prologue generation. */
3586 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3587 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3588 + vect_get_stmt_cost (cond_branch_not_taken);
3589 /* Cost model check occurs at epilogue generation. */
3590 else
3591 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3592 }
3593 }
3594
3595 /* Complete the target-specific cost calculations. */
3596 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3597 &vec_inside_cost, &vec_epilogue_cost);
3598
3599 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3600
3601 if (dump_enabled_p ())
3602 {
3603 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3604 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3605 vec_inside_cost);
3606 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3607 vec_prologue_cost);
3608 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3609 vec_epilogue_cost);
3610 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3611 scalar_single_iter_cost);
3612 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3613 scalar_outside_cost);
3614 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3615 vec_outside_cost);
3616 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3617 peel_iters_prologue);
3618 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3619 peel_iters_epilogue);
3620 }
3621
3622 /* Calculate number of iterations required to make the vector version
3623 profitable, relative to the loop bodies only. The following condition
3624 must hold true:
3625 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3626 where
3627 SIC = scalar iteration cost, VIC = vector iteration cost,
3628 VOC = vector outside cost, VF = vectorization factor,
3629 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
3630 SOC = scalar outside cost for run time cost model check. */
3631
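/* For illustration, with assumed numbers SIC = 4, VIC = 8, VOC = 40,
   SOC = 10, VF = 4 and no peeling, the condition above becomes

     niters * (SIC * VF - VIC) > (VOC - SOC) * VF
     niters * 8 > 120   i.e.   niters > 15

   so the code below computes 120 / 8 = 15 and then bumps the result to 16
   because 15 iterations only break even. */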
3632 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3633 {
3634 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3635 * assumed_vf
3636 - vec_inside_cost * peel_iters_prologue
3637 - vec_inside_cost * peel_iters_epilogue);
3638 if (min_profitable_iters <= 0)
3639 min_profitable_iters = 0;
3640 else
3641 {
3642 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3643 - vec_inside_cost);
3644
3645 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3646 <= (((int) vec_inside_cost * min_profitable_iters)
3647 + (((int) vec_outside_cost - scalar_outside_cost)
3648 * assumed_vf)))
3649 min_profitable_iters++;
3650 }
3651 }
3652 /* The vector version will never be profitable. */
3653 else
3654 {
3655 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3656 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3657 "vectorization did not happen for a simd loop");
3658
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3661 "cost model: the vector iteration cost = %d "
3662 "divided by the scalar iteration cost = %d "
3663 "is greater or equal to the vectorization factor = %d"
3664 ".\n",
3665 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3666 *ret_min_profitable_niters = -1;
3667 *ret_min_profitable_estimate = -1;
3668 return;
3669 }
3670
3671 dump_printf (MSG_NOTE,
3672 " Calculated minimum iters for profitability: %d\n",
3673 min_profitable_iters);
3674
3675 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3676 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3677 /* We want the vectorized loop to execute at least once. */
3678 min_profitable_iters = assumed_vf + peel_iters_prologue;
3679
3680 if (dump_enabled_p ())
3681 dump_printf_loc (MSG_NOTE, vect_location,
3682 " Runtime profitability threshold = %d\n",
3683 min_profitable_iters);
3684
3685 *ret_min_profitable_niters = min_profitable_iters;
3686
3687 /* Calculate number of iterations required to make the vector version
3688 profitable, relative to the loop bodies only.
3689
3690 The non-vectorized variant costs SIC * niters and must win over the vector
3691 variant on the expected loop trip count. The following condition must hold:
3692 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3693
3694 if (vec_outside_cost <= 0)
3695 min_profitable_estimate = 0;
3696 else
3697 {
3698 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3699 * assumed_vf
3700 - vec_inside_cost * peel_iters_prologue
3701 - vec_inside_cost * peel_iters_epilogue)
3702 / ((scalar_single_iter_cost * assumed_vf)
3703 - vec_inside_cost);
3704 }
3705 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3706 if (dump_enabled_p ())
3707 dump_printf_loc (MSG_NOTE, vect_location,
3708 " Static estimate profitability threshold = %d\n",
3709 min_profitable_estimate);
3710
3711 *ret_min_profitable_estimate = min_profitable_estimate;
3712 }
3713
3714 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3715 vector elements (not bits) for a vector with NELT elements. */
3716 static void
3717 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3718 vec_perm_builder *sel)
3719 {
3720 /* The encoding is a single stepped pattern. Any wrap-around is handled
3721 by vec_perm_indices. */
3722 sel->new_vector (nelt, 1, 3);
3723 for (unsigned int i = 0; i < 3; i++)
3724 sel->quick_push (i + offset);
3725 }
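/* For illustration: with OFFSET == 2 and NELT == 8 the stepped pattern above
   expands to the selector { 2, 3, 4, 5, 6, 7, 8, 9 }.  Indices 8 and 9 pick
   elements of the second vec_perm operand, which is exactly a whole-vector
   shift by two elements. */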
3726
3727 /* Checks whether the target supports whole-vector shifts for vectors of mode
3728 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3729 it supports vec_perm_const with masks for all necessary shift amounts. */
3730 static bool
3731 have_whole_vector_shift (machine_mode mode)
3732 {
3733 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3734 return true;
3735
3736 /* Variable-length vectors should be handled via the optab. */
3737 unsigned int nelt;
3738 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3739 return false;
3740
3741 vec_perm_builder sel;
3742 vec_perm_indices indices;
3743 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3744 {
3745 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3746 indices.new_vector (sel, 2, nelt);
3747 if (!can_vec_perm_const_p (mode, indices, false))
3748 return false;
3749 }
3750 return true;
3751 }
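/* For illustration: for a fixed-width mode with eight elements the loop
   above probes shift amounts 4, 2 and 1, which are the only offsets the
   shift-based reduction epilogue needs; variable-length modes must provide
   vec_shr_optab instead. */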
3752
3753 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3754 functions. Design better to avoid maintenance issues. */
3755
3756 /* Function vect_model_reduction_cost.
3757
3758 Models cost for a reduction operation, including the vector ops
3759 generated within the strip-mine loop, the initial definition before
3760 the loop, and the epilogue code that must be generated. */
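/* For illustration, assuming a plain sum reduction with NCOPIES == 1 and a
   direct REDUC_FN: the function below records one scalar_to_vec in the
   prologue (building the initial {0,...,0} vector), one vector_stmt per
   copy in the loop body, and one vector_stmt plus one vec_to_scalar in the
   epilogue for the final reduction and extract. */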
3761
3762 static void
3763 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3764 int ncopies, stmt_vector_for_cost *cost_vec)
3765 {
3766 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3767 enum tree_code code;
3768 optab optab;
3769 tree vectype;
3770 machine_mode mode;
3771 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3772 struct loop *loop = NULL;
3773
3774 if (loop_vinfo)
3775 loop = LOOP_VINFO_LOOP (loop_vinfo);
3776
3777 /* Condition reductions generate two reductions in the loop. */
3778 vect_reduction_type reduction_type
3779 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3780 if (reduction_type == COND_REDUCTION)
3781 ncopies *= 2;
3782
3783 vectype = STMT_VINFO_VECTYPE (stmt_info);
3784 mode = TYPE_MODE (vectype);
3785 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3786
3787 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3788
3789 if (reduction_type == EXTRACT_LAST_REDUCTION
3790 || reduction_type == FOLD_LEFT_REDUCTION)
3791 {
3792 /* No extra instructions needed in the prologue. */
3793 prologue_cost = 0;
3794
3795 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3796 /* Count one reduction-like operation per vector. */
3797 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3798 stmt_info, 0, vect_body);
3799 else
3800 {
3801 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3802 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3803 inside_cost = record_stmt_cost (cost_vec, nelements,
3804 vec_to_scalar, stmt_info, 0,
3805 vect_body);
3806 inside_cost += record_stmt_cost (cost_vec, nelements,
3807 scalar_stmt, stmt_info, 0,
3808 vect_body);
3809 }
3810 }
3811 else
3812 {
3813 /* Add in cost for initial definition.
3814 For cond reduction we have four vectors: initial index, step,
3815 initial result of the data reduction, initial value of the index
3816 reduction. */
3817 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3818 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3819 scalar_to_vec, stmt_info, 0,
3820 vect_prologue);
3821
3822 /* Cost of reduction op inside loop. */
3823 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3824 stmt_info, 0, vect_body);
3825 }
3826
3827 /* Determine cost of epilogue code.
3828
3829 We have a reduction operator that will reduce the vector in one statement.
3830 Also requires scalar extract. */
3831
3832 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3833 {
3834 if (reduc_fn != IFN_LAST)
3835 {
3836 if (reduction_type == COND_REDUCTION)
3837 {
3838 /* An EQ stmt and a COND_EXPR stmt. */
3839 epilogue_cost += record_stmt_cost (cost_vec, 2,
3840 vector_stmt, stmt_info, 0,
3841 vect_epilogue);
3842 /* Reduction of the max index and a reduction of the found
3843 values. */
3844 epilogue_cost += record_stmt_cost (cost_vec, 2,
3845 vec_to_scalar, stmt_info, 0,
3846 vect_epilogue);
3847 /* A broadcast of the max value. */
3848 epilogue_cost += record_stmt_cost (cost_vec, 1,
3849 scalar_to_vec, stmt_info, 0,
3850 vect_epilogue);
3851 }
3852 else
3853 {
3854 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3855 stmt_info, 0, vect_epilogue);
3856 epilogue_cost += record_stmt_cost (cost_vec, 1,
3857 vec_to_scalar, stmt_info, 0,
3858 vect_epilogue);
3859 }
3860 }
3861 else if (reduction_type == COND_REDUCTION)
3862 {
3863 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3864 /* Extraction of scalar elements. */
3865 epilogue_cost += record_stmt_cost (cost_vec,
3866 2 * estimated_nunits,
3867 vec_to_scalar, stmt_info, 0,
3868 vect_epilogue);
3869 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3870 epilogue_cost += record_stmt_cost (cost_vec,
3871 2 * estimated_nunits - 3,
3872 scalar_stmt, stmt_info, 0,
3873 vect_epilogue);
3874 }
3875 else if (reduction_type == EXTRACT_LAST_REDUCTION
3876 || reduction_type == FOLD_LEFT_REDUCTION)
3877 /* No extra instructions needed in the epilogue. */
3878 ;
3879 else
3880 {
3881 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3882 tree bitsize =
3883 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3884 int element_bitsize = tree_to_uhwi (bitsize);
3885 int nelements = vec_size_in_bits / element_bitsize;
3886
3887 if (code == COND_EXPR)
3888 code = MAX_EXPR;
3889
3890 optab = optab_for_tree_code (code, vectype, optab_default);
3891
3892 /* We have a whole vector shift available. */
3893 if (optab != unknown_optab
3894 && VECTOR_MODE_P (mode)
3895 && optab_handler (optab, mode) != CODE_FOR_nothing
3896 && have_whole_vector_shift (mode))
3897 {
3898 /* Final reduction via vector shifts and the reduction operator.
3899 Also requires scalar extract. */
3900 epilogue_cost += record_stmt_cost (cost_vec,
3901 exact_log2 (nelements) * 2,
3902 vector_stmt, stmt_info, 0,
3903 vect_epilogue);
3904 epilogue_cost += record_stmt_cost (cost_vec, 1,
3905 vec_to_scalar, stmt_info, 0,
3906 vect_epilogue);
3907 }
3908 else
3909 /* Use extracts and reduction op for final reduction. For N
3910 elements, we have N extracts and N-1 reduction ops. */
3911 epilogue_cost += record_stmt_cost (cost_vec,
3912 nelements + nelements - 1,
3913 vector_stmt, stmt_info, 0,
3914 vect_epilogue);
3915 }
3916 }
3917
3918 if (dump_enabled_p ())
3919 dump_printf (MSG_NOTE,
3920 "vect_model_reduction_cost: inside_cost = %d, "
3921 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3922 prologue_cost, epilogue_cost);
3923 }
3924
3925
3926 /* Function vect_model_induction_cost.
3927
3928 Models cost for induction operations. */
3929
3930 static void
3931 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3932 stmt_vector_for_cost *cost_vec)
3933 {
3934 unsigned inside_cost, prologue_cost;
3935
3936 if (PURE_SLP_STMT (stmt_info))
3937 return;
3938
3939 /* loop cost for vec_loop. */
3940 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3941 stmt_info, 0, vect_body);
3942
3943 /* prologue cost for vec_init and vec_step. */
3944 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3945 stmt_info, 0, vect_prologue);
3946
3947 if (dump_enabled_p ())
3948 dump_printf_loc (MSG_NOTE, vect_location,
3949 "vect_model_induction_cost: inside_cost = %d, "
3950 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3951 }
3952
3953
3954
3955 /* Function get_initial_def_for_reduction
3956
3957 Input:
3958 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3959 INIT_VAL - the initial value of the reduction variable
3960
3961 Output:
3962 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3963 of the reduction (used for adjusting the epilog - see below).
3964 Return a vector variable, initialized according to the operation that
3965 STMT_VINFO performs. This vector will be used as the initial value
3966 of the vector of partial results.
3967
3968 Option1 (adjust in epilog): Initialize the vector as follows:
3969 add/bit or/xor: [0,0,...,0,0]
3970 mult/bit and: [1,1,...,1,1]
3971 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3972 and when necessary (e.g. add/mult case) let the caller know
3973 that it needs to adjust the result by init_val.
3974
3975 Option2: Initialize the vector as follows:
3976 add/bit or/xor: [init_val,0,0,...,0]
3977 mult/bit and: [init_val,1,1,...,1]
3978 min/max/cond_expr: [init_val,init_val,...,init_val]
3979 and no adjustments are needed.
3980
3981 For example, for the following code:
3982
3983 s = init_val;
3984 for (i=0;i<n;i++)
3985 s = s + a[i];
3986
3987 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3988 For a vector of 4 units, we want to return either [0,0,0,init_val],
3989 or [0,0,0,0] and let the caller know that it needs to adjust
3990 the result at the end by 'init_val'.
3991
3992 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3993 is not NULL, because its initialization vector is simpler (same element in
3994 all entries), and Option2 otherwise.
3995
3996 A cost model should help decide between these two schemes. */
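/* For illustration: for 's *= a[i]' with init_val == 5 and a four-element
   vector type, Option1 builds {1,1,1,1} and reports an adjustment of 5,
   while Option2 builds {5,1,1,1} and needs no adjustment. */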
3997
3998 tree
3999 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4000 tree *adjustment_def)
4001 {
4002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4003 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4004 tree scalar_type = TREE_TYPE (init_val);
4005 tree vectype = get_vectype_for_scalar_type (scalar_type);
4006 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4007 tree def_for_init;
4008 tree init_def;
4009 REAL_VALUE_TYPE real_init_val = dconst0;
4010 int int_init_val = 0;
4011 gimple_seq stmts = NULL;
4012
4013 gcc_assert (vectype);
4014
4015 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4016 || SCALAR_FLOAT_TYPE_P (scalar_type));
4017
4018 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4019 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4020
4021 vect_reduction_type reduction_type
4022 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4023
4024 switch (code)
4025 {
4026 case WIDEN_SUM_EXPR:
4027 case DOT_PROD_EXPR:
4028 case SAD_EXPR:
4029 case PLUS_EXPR:
4030 case MINUS_EXPR:
4031 case BIT_IOR_EXPR:
4032 case BIT_XOR_EXPR:
4033 case MULT_EXPR:
4034 case BIT_AND_EXPR:
4035 {
4036 /* ADJUSTMENT_DEF is NULL when called from
4037 vect_create_epilog_for_reduction to vectorize a double reduction. */
4038 if (adjustment_def)
4039 *adjustment_def = init_val;
4040
4041 if (code == MULT_EXPR)
4042 {
4043 real_init_val = dconst1;
4044 int_init_val = 1;
4045 }
4046
4047 if (code == BIT_AND_EXPR)
4048 int_init_val = -1;
4049
4050 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4051 def_for_init = build_real (scalar_type, real_init_val);
4052 else
4053 def_for_init = build_int_cst (scalar_type, int_init_val);
4054
4055 if (adjustment_def)
4056 /* Option1: the first element is '0' or '1' as well. */
4057 init_def = gimple_build_vector_from_val (&stmts, vectype,
4058 def_for_init);
4059 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4060 {
4061 /* Option2 (variable length): the first element is INIT_VAL. */
4062 init_def = gimple_build_vector_from_val (&stmts, vectype,
4063 def_for_init);
4064 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4065 vectype, init_def, init_val);
4066 }
4067 else
4068 {
4069 /* Option2: the first element is INIT_VAL. */
4070 tree_vector_builder elts (vectype, 1, 2);
4071 elts.quick_push (init_val);
4072 elts.quick_push (def_for_init);
4073 init_def = gimple_build_vector (&stmts, &elts);
4074 }
4075 }
4076 break;
4077
4078 case MIN_EXPR:
4079 case MAX_EXPR:
4080 case COND_EXPR:
4081 {
4082 if (adjustment_def)
4083 {
4084 *adjustment_def = NULL_TREE;
4085 if (reduction_type != COND_REDUCTION
4086 && reduction_type != EXTRACT_LAST_REDUCTION)
4087 {
4088 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4089 break;
4090 }
4091 }
4092 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4093 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4094 }
4095 break;
4096
4097 default:
4098 gcc_unreachable ();
4099 }
4100
4101 if (stmts)
4102 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4103 return init_def;
4104 }
4105
4106 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4107 NUMBER_OF_VECTORS is the number of vector defs to create.
4108 If NEUTRAL_OP is nonnull, introducing extra elements of that
4109 value will not change the result. */
4110
4111 static void
4112 get_initial_defs_for_reduction (slp_tree slp_node,
4113 vec<tree> *vec_oprnds,
4114 unsigned int number_of_vectors,
4115 bool reduc_chain, tree neutral_op)
4116 {
4117 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4118 stmt_vec_info stmt_vinfo = stmts[0];
4119 unsigned HOST_WIDE_INT nunits;
4120 unsigned j, number_of_places_left_in_vector;
4121 tree vector_type;
4122 tree vop;
4123 int group_size = stmts.length ();
4124 unsigned int vec_num, i;
4125 unsigned number_of_copies = 1;
4126 vec<tree> voprnds;
4127 voprnds.create (number_of_vectors);
4128 struct loop *loop;
4129 auto_vec<tree, 16> permute_results;
4130
4131 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4132
4133 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4134
4135 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4136 gcc_assert (loop);
4137 edge pe = loop_preheader_edge (loop);
4138
4139 gcc_assert (!reduc_chain || neutral_op);
4140
4141 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4142 created vectors. It is greater than 1 if unrolling is performed.
4143
4144 For example, we have two scalar operands, s1 and s2 (e.g., group of
4145 strided accesses of size two), while NUNITS is four (i.e., four scalars
4146 of this type can be packed in a vector). The output vector will contain
4147 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4148 will be 2).
4149
4150 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4151 vectors containing the operands.
4152
4153 For example, NUNITS is four as before, and the group size is 8
4154 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4155 {s5, s6, s7, s8}. */
4156
4157 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4158 nunits = group_size;
4159
4160 number_of_copies = nunits * number_of_vectors / group_size;
4161
4162 number_of_places_left_in_vector = nunits;
4163 bool constant_p = true;
4164 tree_vector_builder elts (vector_type, nunits, 1);
4165 elts.quick_grow (nunits);
4166 for (j = 0; j < number_of_copies; j++)
4167 {
4168 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4169 {
4170 tree op;
4171 /* Get the def before the loop. In reduction chain we have only
4172 one initial value. */
4173 if ((j != (number_of_copies - 1)
4174 || (reduc_chain && i != 0))
4175 && neutral_op)
4176 op = neutral_op;
4177 else
4178 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4179
4180 /* Create 'vect_ = {op0,op1,...,opn}'. */
4181 number_of_places_left_in_vector--;
4182 elts[number_of_places_left_in_vector] = op;
4183 if (!CONSTANT_CLASS_P (op))
4184 constant_p = false;
4185
4186 if (number_of_places_left_in_vector == 0)
4187 {
4188 gimple_seq ctor_seq = NULL;
4189 tree init;
4190 if (constant_p && !neutral_op
4191 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4192 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4193 /* Build the vector directly from ELTS. */
4194 init = gimple_build_vector (&ctor_seq, &elts);
4195 else if (neutral_op)
4196 {
4197 /* Build a vector of the neutral value and shift the
4198 other elements into place. */
4199 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4200 neutral_op);
4201 int k = nunits;
4202 while (k > 0 && elts[k - 1] == neutral_op)
4203 k -= 1;
4204 while (k > 0)
4205 {
4206 k -= 1;
4207 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4208 vector_type, init, elts[k]);
4209 }
4210 }
4211 else
4212 {
4213 /* First time round, duplicate ELTS to fill the
4214 required number of vectors, then cherry pick the
4215 appropriate result for each iteration. */
4216 if (vec_oprnds->is_empty ())
4217 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4218 number_of_vectors,
4219 permute_results);
4220 init = permute_results[number_of_vectors - j - 1];
4221 }
4222 if (ctor_seq != NULL)
4223 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4224 voprnds.quick_push (init);
4225
4226 number_of_places_left_in_vector = nunits;
4227 elts.new_vector (vector_type, nunits, 1);
4228 elts.quick_grow (nunits);
4229 constant_p = true;
4230 }
4231 }
4232 }
4233
4234 /* Since the vectors are created in the reverse order, we should invert
4235 them. */
4236 vec_num = voprnds.length ();
4237 for (j = vec_num; j != 0; j--)
4238 {
4239 vop = voprnds[j - 1];
4240 vec_oprnds->quick_push (vop);
4241 }
4242
4243 voprnds.release ();
4244
4245 /* In case VF is greater than the unrolling factor needed for the SLP
4246 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4247 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4248 to replicate the vectors. */
4249 tree neutral_vec = NULL;
4250 while (number_of_vectors > vec_oprnds->length ())
4251 {
4252 if (neutral_op)
4253 {
4254 if (!neutral_vec)
4255 {
4256 gimple_seq ctor_seq = NULL;
4257 neutral_vec = gimple_build_vector_from_val
4258 (&ctor_seq, vector_type, neutral_op);
4259 if (ctor_seq != NULL)
4260 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4261 }
4262 vec_oprnds->quick_push (neutral_vec);
4263 }
4264 else
4265 {
4266 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4267 vec_oprnds->quick_push (vop);
4268 }
4269 }
4270 }
4271
4272
4273 /* Function vect_create_epilog_for_reduction
4274
4275 Create code at the loop-epilog to finalize the result of a reduction
4276 computation.
4277
4278 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4279 reduction statements.
4280 STMT_INFO is the scalar reduction stmt that is being vectorized.
4281 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4282 number of elements that we can fit in a vectype (nunits). In this case
4283 we have to generate more than one vector stmt - i.e - we need to "unroll"
4284 the vector stmt by a factor VF/nunits. For more details see documentation
4285 in vectorizable_operation.
4286 REDUC_FN is the internal function for the epilog reduction.
4287 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4288 computation.
4289 REDUC_INDEX is the index of the operand in the right hand side of the
4290 statement that is defined by REDUCTION_PHI.
4291 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4292 SLP_NODE is an SLP node containing a group of reduction statements. The
4293 first one in this group is STMT_INFO.
4294 INDUC_VAL is, for INTEGER_INDUC_COND_REDUCTION, the value to use when the
4295 COND_EXPR is never true in the loop. For MAX_EXPR it needs to be smaller
4296 than any value of the IV in the loop, and for MIN_EXPR larger than any
4297 value of the IV in the loop.
4298 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4299 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4300 null if this is not an SLP reduction.
4301
4302 This function:
4303 1. Creates the reduction def-use cycles: sets the arguments for
4304 REDUCTION_PHIS:
4305 The loop-entry argument is the vectorized initial-value of the reduction.
4306 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4307 sums.
4308 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4309 by calling the function specified by REDUC_FN if available, or by
4310 other means (whole-vector shifts or a scalar loop).
4311 The function also creates a new phi node at the loop exit to preserve
4312 loop-closed form, as illustrated below.
4313
4314 The flow at the entry to this function:
4315
4316 loop:
4317 vec_def = phi <null, null> # REDUCTION_PHI
4318 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4319 s_loop = scalar_stmt # (scalar) STMT_INFO
4320 loop_exit:
4321 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4322 use <s_out0>
4323 use <s_out0>
4324
4325 The above is transformed by this function into:
4326
4327 loop:
4328 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4329 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4330 s_loop = scalar_stmt # (scalar) STMT_INFO
4331 loop_exit:
4332 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4333 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4334 v_out2 = reduce <v_out1>
4335 s_out3 = extract_field <v_out2, 0>
4336 s_out4 = adjust_result <s_out3>
4337 use <s_out4>
4338 use <s_out4>
4339 */
4340
4341 static void
4342 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4343 stmt_vec_info stmt_info,
4344 gimple *reduc_def_stmt,
4345 int ncopies, internal_fn reduc_fn,
4346 vec<stmt_vec_info> reduction_phis,
4347 bool double_reduc,
4348 slp_tree slp_node,
4349 slp_instance slp_node_instance,
4350 tree induc_val, enum tree_code induc_code,
4351 tree neutral_op)
4352 {
4353 stmt_vec_info prev_phi_info;
4354 tree vectype;
4355 machine_mode mode;
4356 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4357 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4358 basic_block exit_bb;
4359 tree scalar_dest;
4360 tree scalar_type;
4361 gimple *new_phi = NULL, *phi;
4362 stmt_vec_info phi_info;
4363 gimple_stmt_iterator exit_gsi;
4364 tree vec_dest;
4365 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4366 gimple *epilog_stmt = NULL;
4367 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4368 gimple *exit_phi;
4369 tree bitsize;
4370 tree adjustment_def = NULL;
4371 tree vec_initial_def = NULL;
4372 tree expr, def, initial_def = NULL;
4373 tree orig_name, scalar_result;
4374 imm_use_iterator imm_iter, phi_imm_iter;
4375 use_operand_p use_p, phi_use_p;
4376 gimple *use_stmt;
4377 stmt_vec_info reduction_phi_info = NULL;
4378 bool nested_in_vect_loop = false;
4379 auto_vec<gimple *> new_phis;
4380 auto_vec<stmt_vec_info> inner_phis;
4381 int j, i;
4382 auto_vec<tree> scalar_results;
4383 unsigned int group_size = 1, k, ratio;
4384 auto_vec<tree> vec_initial_defs;
4385 auto_vec<gimple *> phis;
4386 bool slp_reduc = false;
4387 bool direct_slp_reduc;
4388 tree new_phi_result;
4389 stmt_vec_info inner_phi = NULL;
4390 tree induction_index = NULL_TREE;
4391
4392 if (slp_node)
4393 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4394
4395 if (nested_in_vect_loop_p (loop, stmt_info))
4396 {
4397 outer_loop = loop;
4398 loop = loop->inner;
4399 nested_in_vect_loop = true;
4400 gcc_assert (!slp_node);
4401 }
4402
4403 vectype = STMT_VINFO_VECTYPE (stmt_info);
4404 gcc_assert (vectype);
4405 mode = TYPE_MODE (vectype);
4406
4407 /* 1. Create the reduction def-use cycle:
4408 Set the arguments of REDUCTION_PHIS, i.e., transform
4409
4410 loop:
4411 vec_def = phi <null, null> # REDUCTION_PHI
4412 VECT_DEF = vector_stmt # vectorized form of STMT
4413 ...
4414
4415 into:
4416
4417 loop:
4418 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4419 VECT_DEF = vector_stmt # vectorized form of STMT
4420 ...
4421
4422 (in case of SLP, do it for all the phis). */
4423
4424 /* Get the loop-entry arguments. */
4425 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4426 if (slp_node)
4427 {
4428 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4429 vec_initial_defs.reserve (vec_num);
4430 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4431 &vec_initial_defs, vec_num,
4432 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4433 neutral_op);
4434 }
4435 else
4436 {
4437 /* Get at the scalar def before the loop, that defines the initial value
4438 of the reduction variable. */
4439 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4440 loop_preheader_edge (loop));
4441 /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4442 and we can't use zero for induc_val, use initial_def instead. Similarly
4443 for REDUC_MIN when initial_def is larger than the base. */
4444 if (TREE_CODE (initial_def) == INTEGER_CST
4445 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4446 == INTEGER_INDUC_COND_REDUCTION)
4447 && !integer_zerop (induc_val)
4448 && ((induc_code == MAX_EXPR
4449 && tree_int_cst_lt (initial_def, induc_val))
4450 || (induc_code == MIN_EXPR
4451 && tree_int_cst_lt (induc_val, initial_def))))
4452 induc_val = initial_def;
4453
4454 if (double_reduc)
4455 /* In case of double reduction we only create a vector variable
4456 to be put in the reduction phi node. The actual statement
4457 creation is done later in this function. */
4458 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4459 else if (nested_in_vect_loop)
4460 {
4461 /* Do not use an adjustment def as that case is not supported
4462 correctly if ncopies is not one. */
4463 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4464 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4465 stmt_info);
4466 }
4467 else
4468 vec_initial_def
4469 = get_initial_def_for_reduction (stmt_info, initial_def,
4470 &adjustment_def);
4471 vec_initial_defs.create (1);
4472 vec_initial_defs.quick_push (vec_initial_def);
4473 }
4474
4475 /* Set phi nodes arguments. */
4476 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4477 {
4478 tree vec_init_def = vec_initial_defs[i];
4479 tree def = vect_defs[i];
4480 for (j = 0; j < ncopies; j++)
4481 {
4482 if (j != 0)
4483 {
4484 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4485 if (nested_in_vect_loop)
4486 vec_init_def
4487 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4488 }
4489
4490 /* Set the loop-entry arg of the reduction-phi. */
4491
4492 gphi *phi = as_a <gphi *> (phi_info->stmt);
4493 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4494 == INTEGER_INDUC_COND_REDUCTION)
4495 {
4496 /* Initialise the reduction phi to zero. This prevents non-zero
4497 initial values from interfering with the reduction op. */
4498 gcc_assert (ncopies == 1);
4499 gcc_assert (i == 0);
4500
4501 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4502 tree induc_val_vec
4503 = build_vector_from_val (vec_init_def_type, induc_val);
4504
4505 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4506 UNKNOWN_LOCATION);
4507 }
4508 else
4509 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4510 UNKNOWN_LOCATION);
4511
4512 /* Set the loop-latch arg for the reduction-phi. */
4513 if (j > 0)
4514 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4515
4516 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4517
4518 if (dump_enabled_p ())
4519 dump_printf_loc (MSG_NOTE, vect_location,
4520 "transform reduction: created def-use cycle: %G%G",
4521 phi, SSA_NAME_DEF_STMT (def));
4522 }
4523 }
4524
4525 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4526 which is updated with the current index of the loop for every match of
4527 the original loop's cond_expr (VEC_STMT). This results in a vector
4528 containing the last time the condition passed for that vector lane.
4529 The first match will be a 1 to allow 0 to be used for non-matching
4530 indexes. If there are no matches at all then the vector will be all
4531 zeroes. */
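/* For illustration, with an assumed vectorization factor of 4: the index
   IV below takes the values {1,2,3,4}, {5,6,7,8}, ...  If the condition
   last held in lane 2 during the fourth vector iteration, that lane of the
   index vector ends up holding 3 + 3*4 = 15; lanes that never match keep
   the value 0. */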
4532 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4533 {
4534 tree indx_before_incr, indx_after_incr;
4535 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4536
4537 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4538 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4539
4540 int scalar_precision
4541 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4542 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4543 tree cr_index_vector_type = build_vector_type
4544 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4545
4546 /* First we create a simple vector induction variable which starts
4547 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4548 vector size (STEP). */
4549
4550 /* Create a {1,2,3,...} vector. */
4551 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4552
4553 /* Create a vector of the step value. */
4554 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4555 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4556
4557 /* Create an induction variable. */
4558 gimple_stmt_iterator incr_gsi;
4559 bool insert_after;
4560 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4561 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4562 insert_after, &indx_before_incr, &indx_after_incr);
4563
4564 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4565 filled with zeros (VEC_ZERO). */
4566
4567 /* Create a vector of 0s. */
4568 tree zero = build_zero_cst (cr_index_scalar_type);
4569 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4570
4571 /* Create a vector phi node. */
4572 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4573 new_phi = create_phi_node (new_phi_tree, loop->header);
4574 loop_vinfo->add_stmt (new_phi);
4575 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4576 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4577
4578 /* Now take the condition from the loop's original cond_expr
4579 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4580 every match uses values from the induction variable
4581 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4582 (NEW_PHI_TREE).
4583 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4584 the new cond_expr (INDEX_COND_EXPR). */
4585
4586 /* Duplicate the condition from vec_stmt. */
4587 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4588
4589 /* Create a conditional, where the condition is taken from vec_stmt
4590 (CCOMPARE), the then-value is the induction index (INDEX_BEFORE_INCR)
4591 and the else-value is the phi (NEW_PHI_TREE). */
4592 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4593 ccompare, indx_before_incr,
4594 new_phi_tree);
4595 induction_index = make_ssa_name (cr_index_vector_type);
4596 gimple *index_condition = gimple_build_assign (induction_index,
4597 index_cond_expr);
4598 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4599 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4600 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4601
4602 /* Update the phi with the vec cond. */
4603 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4604 loop_latch_edge (loop), UNKNOWN_LOCATION);
4605 }
4606
4607 /* 2. Create epilog code.
4608 The reduction epilog code operates across the elements of the vector
4609 of partial results computed by the vectorized loop.
4610 The reduction epilog code consists of:
4611
4612 step 1: compute the scalar result in a vector (v_out2)
4613 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4614 step 3: adjust the scalar result (s_out3) if needed.
4615
4616 Step 1 can be accomplished using one of the following three schemes:
4617 (scheme 1) using reduc_fn, if available.
4618 (scheme 2) using whole-vector shifts, if available.
4619 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4620 combined.
4621
4622 The overall epilog code looks like this:
4623
4624 s_out0 = phi <s_loop> # original EXIT_PHI
4625 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4626 v_out2 = reduce <v_out1> # step 1
4627 s_out3 = extract_field <v_out2, 0> # step 2
4628 s_out4 = adjust_result <s_out3> # step 3
4629
4630 (step 3 is optional, and steps 1 and 2 may be combined).
4631 Lastly, the uses of s_out0 are replaced by s_out4. */
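/* For illustration, assuming a simple sum reduction handled by scheme 1,
   the generated epilogue is essentially

     v_out1 = PHI <VECT_DEF>
     s_out3 = .REDUC_PLUS (v_out1);
     s_out4 = s_out3 + adjustment;   <- only if an adjustment is needed

   Schemes 2 and 3 replace the .REDUC_PLUS call by log2 (nunits) shift/add
   steps or by scalar extracts and adds, respectively. */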
4632
4633
4634 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4635 v_out1 = phi <VECT_DEF>
4636 Store them in NEW_PHIS. */
4637
4638 exit_bb = single_exit (loop)->dest;
4639 prev_phi_info = NULL;
4640 new_phis.create (vect_defs.length ());
4641 FOR_EACH_VEC_ELT (vect_defs, i, def)
4642 {
4643 for (j = 0; j < ncopies; j++)
4644 {
4645 tree new_def = copy_ssa_name (def);
4646 phi = create_phi_node (new_def, exit_bb);
4647 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4648 if (j == 0)
4649 new_phis.quick_push (phi);
4650 else
4651 {
4652 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4653 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4654 }
4655
4656 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4657 prev_phi_info = phi_info;
4658 }
4659 }
4660
4661 /* The epilogue is created for the outer-loop, i.e., for the loop being
4662 vectorized. Create exit phis for the outer loop. */
4663 if (double_reduc)
4664 {
4665 loop = outer_loop;
4666 exit_bb = single_exit (loop)->dest;
4667 inner_phis.create (vect_defs.length ());
4668 FOR_EACH_VEC_ELT (new_phis, i, phi)
4669 {
4670 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4671 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4672 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4673 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4674 PHI_RESULT (phi));
4675 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4676 inner_phis.quick_push (phi_info);
4677 new_phis[i] = outer_phi;
4678 while (STMT_VINFO_RELATED_STMT (phi_info))
4679 {
4680 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4681 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4682 outer_phi = create_phi_node (new_result, exit_bb);
4683 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4684 PHI_RESULT (phi_info->stmt));
4685 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4686 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4687 prev_phi_info = outer_phi_info;
4688 }
4689 }
4690 }
4691
4692 exit_gsi = gsi_after_labels (exit_bb);
4693
4694 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4695 (i.e. when reduc_fn is not available) and in the final adjustment
4696 code (if needed). Also get the original scalar reduction variable as
4697 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4698 represents a reduction pattern), the tree-code and scalar-def are
4699 taken from the original stmt that the pattern-stmt (STMT) replaces.
4700 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4701 are taken from STMT. */
4702
4703 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4704 if (orig_stmt_info != stmt_info)
4705 {
4706 /* Reduction pattern */
4707 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4708 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4709 }
4710
4711 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4712 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4713 partial results are added and not subtracted. */
4714 if (code == MINUS_EXPR)
4715 code = PLUS_EXPR;
4716
4717 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4718 scalar_type = TREE_TYPE (scalar_dest);
4719 scalar_results.create (group_size);
4720 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4721 bitsize = TYPE_SIZE (scalar_type);
4722
4723 /* In case this is a reduction in an inner-loop while vectorizing an outer
4724 loop - we don't need to extract a single scalar result at the end of the
4725 inner-loop (unless it is double reduction, i.e., the use of reduction is
4726 outside the outer-loop). The final vector of partial results will be used
4727 in the vectorized outer-loop, or reduced to a scalar result at the end of
4728 the outer-loop. */
4729 if (nested_in_vect_loop && !double_reduc)
4730 goto vect_finalize_reduction;
4731
4732 /* SLP reduction without reduction chain, e.g.,
4733 # a1 = phi <a2, a0>
4734 # b1 = phi <b2, b0>
4735 a2 = operation (a1)
4736 b2 = operation (b1) */
4737 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4738
4739 /* True if we should implement SLP_REDUC using native reduction operations
4740 instead of scalar operations. */
4741 direct_slp_reduc = (reduc_fn != IFN_LAST
4742 && slp_reduc
4743 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4744
4745 /* In case of reduction chain, e.g.,
4746 # a1 = phi <a3, a0>
4747 a2 = operation (a1)
4748 a3 = operation (a2),
4749
4750 we may end up with more than one vector result. Here we reduce them to
4751 one vector. */
4752 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4753 {
4754 tree first_vect = PHI_RESULT (new_phis[0]);
4755 gassign *new_vec_stmt = NULL;
4756 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4757 for (k = 1; k < new_phis.length (); k++)
4758 {
4759 gimple *next_phi = new_phis[k];
4760 tree second_vect = PHI_RESULT (next_phi);
4761 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4762 new_vec_stmt = gimple_build_assign (tem, code,
4763 first_vect, second_vect);
4764 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4765 first_vect = tem;
4766 }
4767
4768 new_phi_result = first_vect;
4769 if (new_vec_stmt)
4770 {
4771 new_phis.truncate (0);
4772 new_phis.safe_push (new_vec_stmt);
4773 }
4774 }
4775 /* Likewise if we couldn't use a single def-use cycle. */
4776 else if (ncopies > 1)
4777 {
4778 gcc_assert (new_phis.length () == 1);
4779 tree first_vect = PHI_RESULT (new_phis[0]);
4780 gassign *new_vec_stmt = NULL;
4781 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4782 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4783 for (int k = 1; k < ncopies; ++k)
4784 {
4785 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4786 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4787 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4788 new_vec_stmt = gimple_build_assign (tem, code,
4789 first_vect, second_vect);
4790 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4791 first_vect = tem;
4792 }
4793 new_phi_result = first_vect;
4794 new_phis.truncate (0);
4795 new_phis.safe_push (new_vec_stmt);
4796 }
4797 else
4798 new_phi_result = PHI_RESULT (new_phis[0]);
4799
4800 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4801 && reduc_fn != IFN_LAST)
4802 {
4803 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4804 various data values where the condition matched and another vector
4805 (INDUCTION_INDEX) containing all the indexes of those matches. We
4806 need to extract the last matching index (which will be the index with
4807 highest value) and use this to index into the data vector.
4808 For the case where there were no matches, the data vector will contain
4809 all default values and the index vector will be all zeros. */
4810
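/* For illustration, with assumed values: if NEW_PHI_RESULT is
   {d0, d1, d2, d3} and INDUCTION_INDEX is {0, 7, 0, 3}, the REDUC_MAX
   below yields 7, the EQ comparison selects lane 1, the VEC_COND becomes
   {0, d1, 0, 0}, and the final unsigned REDUC_MAX extracts d1. */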
4811 /* Get various versions of the type of the vector of indexes. */
4812 tree index_vec_type = TREE_TYPE (induction_index);
4813 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4814 tree index_scalar_type = TREE_TYPE (index_vec_type);
4815 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4816 (index_vec_type);
4817
4818 /* Get an unsigned integer version of the type of the data vector. */
4819 int scalar_precision
4820 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4821 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4822 tree vectype_unsigned = build_vector_type
4823 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4824
4825 /* First we need to create a vector (ZERO_VEC) of zeros and another
4826 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4827 can create using a MAX reduction and then expanding.
4828 In the case where the loop never made any matches, the max index will
4829 be zero. */
4830
4831 /* Vector of {0, 0, 0,...}. */
4832 tree zero_vec = make_ssa_name (vectype);
4833 tree zero_vec_rhs = build_zero_cst (vectype);
4834 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4835 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4836
4837 /* Find maximum value from the vector of found indexes. */
4838 tree max_index = make_ssa_name (index_scalar_type);
4839 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4840 1, induction_index);
4841 gimple_call_set_lhs (max_index_stmt, max_index);
4842 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4843
4844 /* Vector of {max_index, max_index, max_index,...}. */
4845 tree max_index_vec = make_ssa_name (index_vec_type);
4846 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4847 max_index);
4848 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4849 max_index_vec_rhs);
4850 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4851
4852 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4853 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4854 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4855 otherwise. Only one value should match, resulting in a vector
4856 (VEC_COND) with one data value and the rest zeros.
4857 In the case where the loop never made any matches, every index will
4858 match, resulting in a vector with all data values (which will all be
4859 the default value). */
4860
4861 /* Compare the max index vector to the vector of found indexes to find
4862 the position of the max value. */
4863 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4864 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4865 induction_index,
4866 max_index_vec);
4867 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4868
4869 /* Use the compare to choose either values from the data vector or
4870 zero. */
4871 tree vec_cond = make_ssa_name (vectype);
4872 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4873 vec_compare, new_phi_result,
4874 zero_vec);
4875 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4876
4877 /* Finally we need to extract the data value from the vector (VEC_COND)
4878 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4879 reduction, but because this doesn't exist, we can use a MAX reduction
4880 instead. The data value might be signed or a float so we need to cast
4881 it first.
4882 In the case where the loop never made any matches, the data values are
4883 all identical, and so will reduce down correctly. */
4884
4885 /* Make the matched data values unsigned. */
4886 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4887 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4888 vec_cond);
4889 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4890 VIEW_CONVERT_EXPR,
4891 vec_cond_cast_rhs);
4892 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4893
4894 /* Reduce down to a scalar value. */
4895 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4896 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4897 1, vec_cond_cast);
4898 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4899 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4900
4901 /* Convert the reduced value back to the result type and set as the
4902 result. */
4903 gimple_seq stmts = NULL;
4904 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4905 data_reduc);
4906 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4907 scalar_results.safe_push (new_temp);
4908 }
4909 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4910 && reduc_fn == IFN_LAST)
4911 {
4912 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4913 idx = 0;
4914 idx_val = induction_index[0];
4915 val = data_reduc[0];
4916 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4917 if (induction_index[i] > idx_val)
4918 val = data_reduc[i], idx_val = induction_index[i];
4919 return val; */
4920
4921 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4922 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4923 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4924 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4925 /* Enforced by vectorizable_reduction, which ensures we have target
4926 support before allowing a conditional reduction on variable-length
4927 vectors. */
4928 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4929 tree idx_val = NULL_TREE, val = NULL_TREE;
4930 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4931 {
4932 tree old_idx_val = idx_val;
4933 tree old_val = val;
4934 idx_val = make_ssa_name (idx_eltype);
4935 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4936 build3 (BIT_FIELD_REF, idx_eltype,
4937 induction_index,
4938 bitsize_int (el_size),
4939 bitsize_int (off)));
4940 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4941 val = make_ssa_name (data_eltype);
4942 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4943 build3 (BIT_FIELD_REF,
4944 data_eltype,
4945 new_phi_result,
4946 bitsize_int (el_size),
4947 bitsize_int (off)));
4948 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4949 if (off != 0)
4950 {
4951 tree new_idx_val = idx_val;
4952 tree new_val = val;
4953 if (off != v_size - el_size)
4954 {
4955 new_idx_val = make_ssa_name (idx_eltype);
4956 epilog_stmt = gimple_build_assign (new_idx_val,
4957 MAX_EXPR, idx_val,
4958 old_idx_val);
4959 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4960 }
4961 new_val = make_ssa_name (data_eltype);
4962 epilog_stmt = gimple_build_assign (new_val,
4963 COND_EXPR,
4964 build2 (GT_EXPR,
4965 boolean_type_node,
4966 idx_val,
4967 old_idx_val),
4968 val, old_val);
4969 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4970 idx_val = new_idx_val;
4971 val = new_val;
4972 }
4973 }
4974 /* Convert the reduced value back to the result type and set as the
4975 result. */
4976 gimple_seq stmts = NULL;
4977 val = gimple_convert (&stmts, scalar_type, val);
4978 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4979 scalar_results.safe_push (val);
4980 }
4981
4982 /* 2.3 Create the reduction code, using one of the three schemes described
4983 above. In SLP we simply need to extract all the elements from the
4984 vector (without reducing them), so we use scalar shifts. */
4985 else if (reduc_fn != IFN_LAST && !slp_reduc)
4986 {
4987 tree tmp;
4988 tree vec_elem_type;
4989
4990 /* Case 1: Create:
4991 v_out2 = reduc_expr <v_out1> */
4992
4993 if (dump_enabled_p ())
4994 dump_printf_loc (MSG_NOTE, vect_location,
4995 "Reduce using direct vector reduction.\n");
4996
4997 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4998 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4999 {
5000 tree tmp_dest
5001 = vect_create_destination_var (scalar_dest, vec_elem_type);
5002 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5003 new_phi_result);
5004 gimple_set_lhs (epilog_stmt, tmp_dest);
5005 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5006 gimple_set_lhs (epilog_stmt, new_temp);
5007 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5008
5009 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5010 new_temp);
5011 }
5012 else
5013 {
5014 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5015 new_phi_result);
5016 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5017 }
5018
5019 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5020 gimple_set_lhs (epilog_stmt, new_temp);
5021 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5022
5023 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5024 == INTEGER_INDUC_COND_REDUCTION)
5025 && !operand_equal_p (initial_def, induc_val, 0))
5026 {
5027 /* Earlier we set the initial value to be a vector of induc_val
5028 values. Check the result and, if it is induc_val, replace it
5029 with the original initial value, unless induc_val is
5030 the same as initial_def already. */
5031 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5032 induc_val);
5033
5034 tmp = make_ssa_name (new_scalar_dest);
5035 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5036 initial_def, new_temp);
5037 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5038 new_temp = tmp;
5039 }
5040
5041 scalar_results.safe_push (new_temp);
5042 }
5043 else if (direct_slp_reduc)
5044 {
5045 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5046 with the elements for other SLP statements replaced with the
5047 neutral value. We can then do a normal reduction on each vector. */
5048
5049 /* Enforced by vectorizable_reduction. */
5050 gcc_assert (new_phis.length () == 1);
5051 gcc_assert (pow2p_hwi (group_size));
5052
5053 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5054 vec<stmt_vec_info> orig_phis
5055 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5056 gimple_seq seq = NULL;
5057
5058 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5059 and the same element size as VECTYPE. */
5060 tree index = build_index_vector (vectype, 0, 1);
5061 tree index_type = TREE_TYPE (index);
5062 tree index_elt_type = TREE_TYPE (index_type);
5063 tree mask_type = build_same_sized_truth_vector_type (index_type);
5064
5065 /* Create a vector that, for each element, identifies which of
5066 the REDUC_GROUP_SIZE results should use it. */
5067 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5068 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5069 build_vector_from_val (index_type, index_mask));
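      /* Illustrative example (not part of the original code): with
	 GROUP_SIZE == 2 and a four-element vector, INDEX starts out as
	 {0, 1, 2, 3} and the BIT_AND with {1, 1, 1, 1} turns it into
	 {0, 1, 0, 1}, i.e. the even elements feed SLP result 0 and the
	 odd elements feed SLP result 1.  */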
5070
5071 /* Get a neutral vector value. This is simply a splat of the neutral
5072 scalar value if we have one, otherwise the initial scalar value
5073 is itself a neutral value. */
5074 tree vector_identity = NULL_TREE;
5075 if (neutral_op)
5076 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5077 neutral_op);
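      /* Illustrative note (not part of the original code): the neutral
	 value is the identity element of the reduction operation, e.g.
	 0 for addition and 1 for multiplication; MIN and MAX have no such
	 universal value, which is the case handled inside the loop below.  */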
5078 for (unsigned int i = 0; i < group_size; ++i)
5079 {
5080 	      /* If there's no universal neutral value, we can use the
5081 initial scalar value from the original PHI. This is used
5082 for MIN and MAX reduction, for example. */
5083 if (!neutral_op)
5084 {
5085 tree scalar_value
5086 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5087 loop_preheader_edge (loop));
5088 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5089 scalar_value);
5090 }
5091
5092 /* Calculate the equivalent of:
5093
5094 sel[j] = (index[j] == i);
5095
5096 which selects the elements of NEW_PHI_RESULT that should
5097 be included in the result. */
5098 tree compare_val = build_int_cst (index_elt_type, i);
5099 compare_val = build_vector_from_val (index_type, compare_val);
5100 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5101 index, compare_val);
5102
5103 /* Calculate the equivalent of:
5104
5105 		 vec = sel ? new_phi_result : vector_identity;
5106
5107 VEC is now suitable for a full vector reduction. */
5108 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5109 sel, new_phi_result, vector_identity);
5110
5111 /* Do the reduction and convert it to the appropriate type. */
5112 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5113 TREE_TYPE (vectype), vec);
5114 scalar = gimple_convert (&seq, scalar_type, scalar);
5115 scalar_results.safe_push (scalar);
5116 }
5117 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5118 }
5119 else
5120 {
5121 bool reduce_with_shift;
5122 tree vec_temp;
5123
5124 /* COND reductions all do the final reduction with MAX_EXPR
5125 or MIN_EXPR. */
5126 if (code == COND_EXPR)
5127 {
5128 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5129 == INTEGER_INDUC_COND_REDUCTION)
5130 code = induc_code;
5131 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5132 == CONST_COND_REDUCTION)
5133 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5134 else
5135 code = MAX_EXPR;
5136 }
5137
5138 /* See if the target wants to do the final (shift) reduction
5139 in a vector mode of smaller size and first reduce upper/lower
5140 halves against each other. */
5141 enum machine_mode mode1 = mode;
5142 tree vectype1 = vectype;
5143 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5144 unsigned sz1 = sz;
5145 if (!slp_reduc
5146 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5147 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5148
5149 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5150 reduce_with_shift = have_whole_vector_shift (mode1);
5151 if (!VECTOR_MODE_P (mode1))
5152 reduce_with_shift = false;
5153 else
5154 {
5155 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5156 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5157 reduce_with_shift = false;
5158 }
5159
5160 	  /* First reduce the vector to the vector size we should do the shift
5161 	     reduction on, by combining upper and lower halves.  */
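	  /* Illustrative example (not part of the original code): assuming a
	     16-byte V8HI accumulator {a, b, c, d, e, f, g, h} and a target
	     that prefers an 8-byte vector mode, the loop below extracts the
	     low half {a, b, c, d} and the high half {e, f, g, h} and combines
	     them with CODE, for PLUS giving {a+e, b+f, c+g, d+h}, which is
	     then reduced further.  */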
5162 new_temp = new_phi_result;
5163 while (sz > sz1)
5164 {
5165 gcc_assert (!slp_reduc);
5166 sz /= 2;
5167 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5168
5169 /* The target has to make sure we support lowpart/highpart
5170 extraction, either via direct vector extract or through
5171 		 integer mode punning.  */
5172 tree dst1, dst2;
5173 if (convert_optab_handler (vec_extract_optab,
5174 TYPE_MODE (TREE_TYPE (new_temp)),
5175 TYPE_MODE (vectype1))
5176 != CODE_FOR_nothing)
5177 {
5178 /* Extract sub-vectors directly once vec_extract becomes
5179 a conversion optab. */
5180 dst1 = make_ssa_name (vectype1);
5181 epilog_stmt
5182 = gimple_build_assign (dst1, BIT_FIELD_REF,
5183 build3 (BIT_FIELD_REF, vectype1,
5184 new_temp, TYPE_SIZE (vectype1),
5185 bitsize_int (0)));
5186 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5187 dst2 = make_ssa_name (vectype1);
5188 epilog_stmt
5189 = gimple_build_assign (dst2, BIT_FIELD_REF,
5190 build3 (BIT_FIELD_REF, vectype1,
5191 new_temp, TYPE_SIZE (vectype1),
5192 bitsize_int (sz * BITS_PER_UNIT)));
5193 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5194 }
5195 else
5196 {
5197 /* Extract via punning to appropriately sized integer mode
5198 vector. */
5199 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5200 1);
5201 tree etype = build_vector_type (eltype, 2);
5202 gcc_assert (convert_optab_handler (vec_extract_optab,
5203 TYPE_MODE (etype),
5204 TYPE_MODE (eltype))
5205 != CODE_FOR_nothing);
5206 tree tem = make_ssa_name (etype);
5207 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5208 build1 (VIEW_CONVERT_EXPR,
5209 etype, new_temp));
5210 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5211 new_temp = tem;
5212 tem = make_ssa_name (eltype);
5213 epilog_stmt
5214 = gimple_build_assign (tem, BIT_FIELD_REF,
5215 build3 (BIT_FIELD_REF, eltype,
5216 new_temp, TYPE_SIZE (eltype),
5217 bitsize_int (0)));
5218 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5219 dst1 = make_ssa_name (vectype1);
5220 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5221 build1 (VIEW_CONVERT_EXPR,
5222 vectype1, tem));
5223 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224 tem = make_ssa_name (eltype);
5225 epilog_stmt
5226 = gimple_build_assign (tem, BIT_FIELD_REF,
5227 build3 (BIT_FIELD_REF, eltype,
5228 new_temp, TYPE_SIZE (eltype),
5229 bitsize_int (sz * BITS_PER_UNIT)));
5230 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5231 dst2 = make_ssa_name (vectype1);
5232 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5233 build1 (VIEW_CONVERT_EXPR,
5234 vectype1, tem));
5235 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5236 }
5237
5238 new_temp = make_ssa_name (vectype1);
5239 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5240 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5241 }
5242
5243 if (reduce_with_shift && !slp_reduc)
5244 {
5245 int element_bitsize = tree_to_uhwi (bitsize);
5246 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5247 for variable-length vectors and also requires direct target support
5248 for loop reductions. */
5249 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5250 int nelements = vec_size_in_bits / element_bitsize;
5251 vec_perm_builder sel;
5252 vec_perm_indices indices;
5253
5254 int elt_offset;
5255
5256 tree zero_vec = build_zero_cst (vectype1);
5257 /* Case 2: Create:
5258 for (offset = nelements/2; offset >= 1; offset/=2)
5259 {
5260 Create: va' = vec_shift <va, offset>
5261 Create: va = vop <va, va'>
5262 } */
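	  /* Illustrative example (not part of the original code): for a PLUS
	     reduction of a four-element vector va = {a, b, c, d}:
	       offset 2:  va' = {c, d, 0, 0}      va = {a+c, b+d, c, d}
	       offset 1:  va' = {b+d, c, d, 0}    va = {a+b+c+d, ...}
	     so element 0 of the final vector holds the reduced value; the
	     zero padding never reaches element 0.  */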
5263
5264 tree rhs;
5265
5266 if (dump_enabled_p ())
5267 dump_printf_loc (MSG_NOTE, vect_location,
5268 "Reduce using vector shifts\n");
5269
5270 mode1 = TYPE_MODE (vectype1);
5271 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5272 for (elt_offset = nelements / 2;
5273 elt_offset >= 1;
5274 elt_offset /= 2)
5275 {
5276 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5277 indices.new_vector (sel, 2, nelements);
5278 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5279 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5280 new_temp, zero_vec, mask);
5281 new_name = make_ssa_name (vec_dest, epilog_stmt);
5282 gimple_assign_set_lhs (epilog_stmt, new_name);
5283 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5284
5285 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5286 new_temp);
5287 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5288 gimple_assign_set_lhs (epilog_stmt, new_temp);
5289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5290 }
5291
5292 /* 2.4 Extract the final scalar result. Create:
5293 s_out3 = extract_field <v_out2, bitpos> */
5294
5295 if (dump_enabled_p ())
5296 dump_printf_loc (MSG_NOTE, vect_location,
5297 "extract scalar result\n");
5298
5299 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5300 bitsize, bitsize_zero_node);
5301 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5302 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5303 gimple_assign_set_lhs (epilog_stmt, new_temp);
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 scalar_results.safe_push (new_temp);
5306 }
5307 else
5308 {
5309 /* Case 3: Create:
5310 s = extract_field <v_out2, 0>
5311 for (offset = element_size;
5312 offset < vector_size;
5313 	            offset += element_size)
5314 {
5315 Create: s' = extract_field <v_out2, offset>
5316 Create: s = op <s, s'> // For non SLP cases
5317 } */
5318
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_NOTE, vect_location,
5321 "Reduce using scalar code.\n");
5322
5323 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5324 int element_bitsize = tree_to_uhwi (bitsize);
5325 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5326 {
5327 int bit_offset;
5328 if (gimple_code (new_phi) == GIMPLE_PHI)
5329 vec_temp = PHI_RESULT (new_phi);
5330 else
5331 vec_temp = gimple_assign_lhs (new_phi);
5332 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5333 bitsize_zero_node);
5334 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5335 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5336 gimple_assign_set_lhs (epilog_stmt, new_temp);
5337 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5338
5339 	      /* In SLP we don't need to apply the reduction operation, so we
5340 	         just collect the s' values in SCALAR_RESULTS.  */
5341 if (slp_reduc)
5342 scalar_results.safe_push (new_temp);
5343
5344 for (bit_offset = element_bitsize;
5345 bit_offset < vec_size_in_bits;
5346 bit_offset += element_bitsize)
5347 {
5348 tree bitpos = bitsize_int (bit_offset);
5349 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5350 bitsize, bitpos);
5351
5352 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5353 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5354 gimple_assign_set_lhs (epilog_stmt, new_name);
5355 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356
5357 if (slp_reduc)
5358 {
5359 		      /* In SLP we don't need to apply the reduction operation,
5360 			 so we just collect the s' values in SCALAR_RESULTS.  */
5361 new_temp = new_name;
5362 scalar_results.safe_push (new_name);
5363 }
5364 else
5365 {
5366 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5367 new_name, new_temp);
5368 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5369 gimple_assign_set_lhs (epilog_stmt, new_temp);
5370 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5371 }
5372 }
5373 }
5374
5375 	  /* The only case where we need to reduce scalar results in SLP is
5376 	     unrolling.  If the size of SCALAR_RESULTS is greater than
5377 	     REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5378 	     REDUC_GROUP_SIZE.  */
5379 if (slp_reduc)
5380 {
5381 tree res, first_res, new_res;
5382 gimple *new_stmt;
5383
5384 /* Reduce multiple scalar results in case of SLP unrolling. */
5385 for (j = group_size; scalar_results.iterate (j, &res);
5386 j++)
5387 {
5388 first_res = scalar_results[j % group_size];
5389 new_stmt = gimple_build_assign (new_scalar_dest, code,
5390 first_res, res);
5391 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5392 gimple_assign_set_lhs (new_stmt, new_res);
5393 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5394 scalar_results[j % group_size] = new_res;
5395 }
5396 }
5397 else
5398 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5399 scalar_results.safe_push (new_temp);
5400 }
5401
5402 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5403 == INTEGER_INDUC_COND_REDUCTION)
5404 && !operand_equal_p (initial_def, induc_val, 0))
5405 {
5406 	  /* Earlier we set the initial value to be a vector of induc_val
5407 	     values.  Check the result and if it is induc_val then replace
5408 	     it with the original initial value, unless induc_val is
5409 	     the same as initial_def already.  */
5410 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5411 induc_val);
5412
5413 tree tmp = make_ssa_name (new_scalar_dest);
5414 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5415 initial_def, new_temp);
5416 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5417 scalar_results[0] = tmp;
5418 }
5419 }
5420
5421 vect_finalize_reduction:
5422
5423 if (double_reduc)
5424 loop = loop->inner;
5425
5426 /* 2.5 Adjust the final result by the initial value of the reduction
5427 variable. (When such adjustment is not needed, then
5428 'adjustment_def' is zero). For example, if code is PLUS we create:
5429 new_temp = loop_exit_def + adjustment_def */
5430
5431 if (adjustment_def)
5432 {
5433 gcc_assert (!slp_reduc);
5434 if (nested_in_vect_loop)
5435 {
5436 new_phi = new_phis[0];
5437 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5438 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5439 new_dest = vect_create_destination_var (scalar_dest, vectype);
5440 }
5441 else
5442 {
5443 new_temp = scalar_results[0];
5444 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5445 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5446 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5447 }
5448
5449 epilog_stmt = gimple_build_assign (new_dest, expr);
5450 new_temp = make_ssa_name (new_dest, epilog_stmt);
5451 gimple_assign_set_lhs (epilog_stmt, new_temp);
5452 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5453 if (nested_in_vect_loop)
5454 {
5455 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5456 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5457 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5458
5459 if (!double_reduc)
5460 scalar_results.quick_push (new_temp);
5461 else
5462 scalar_results[0] = new_temp;
5463 }
5464 else
5465 scalar_results[0] = new_temp;
5466
5467 new_phis[0] = epilog_stmt;
5468 }
5469
5470 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5471 phis with new adjusted scalar results, i.e., replace use <s_out0>
5472 with use <s_out4>.
5473
5474 Transform:
5475 loop_exit:
5476 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5477 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5478 v_out2 = reduce <v_out1>
5479 s_out3 = extract_field <v_out2, 0>
5480 s_out4 = adjust_result <s_out3>
5481 use <s_out0>
5482 use <s_out0>
5483
5484 into:
5485
5486 loop_exit:
5487 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5488 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5489 v_out2 = reduce <v_out1>
5490 s_out3 = extract_field <v_out2, 0>
5491 s_out4 = adjust_result <s_out3>
5492 use <s_out4>
5493 use <s_out4> */
5494
5495
5496   /* In an SLP reduction chain we reduce the vector results into one vector
5497      if necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is
5498      the LHS of the last stmt in the reduction chain, since we are looking
5499      for the loop exit phi node.  */
5500 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5501 {
5502 stmt_vec_info dest_stmt_info
5503 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5504 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5505 group_size = 1;
5506 }
5507
5508   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5509      case REDUC_GROUP_SIZE is greater than the vectorization factor).
5510      Therefore, we need to match SCALAR_RESULTS with the corresponding
5511      statements.  The first (REDUC_GROUP_SIZE / number of new vector stmts)
5512      scalar results correspond to the first vector stmt, etc.
5513      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
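  /* Illustrative example (not part of the original code): with
     REDUC_GROUP_SIZE == 4 and two new vector stmts, RATIO is 2, so
     scalar_results[0] and scalar_results[1] belong to new_phis[0] while
     scalar_results[2] and scalar_results[3] belong to new_phis[1].  */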
5514 if (group_size > new_phis.length ())
5515 {
5516 ratio = group_size / new_phis.length ();
5517 gcc_assert (!(group_size % new_phis.length ()));
5518 }
5519 else
5520 ratio = 1;
5521
5522 stmt_vec_info epilog_stmt_info = NULL;
5523 for (k = 0; k < group_size; k++)
5524 {
5525 if (k % ratio == 0)
5526 {
5527 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5528 reduction_phi_info = reduction_phis[k / ratio];
5529 if (double_reduc)
5530 inner_phi = inner_phis[k / ratio];
5531 }
5532
5533 if (slp_reduc)
5534 {
5535 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5536
5537 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5538 /* SLP statements can't participate in patterns. */
5539 gcc_assert (!orig_stmt_info);
5540 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5541 }
5542
5543 phis.create (3);
5544 /* Find the loop-closed-use at the loop exit of the original scalar
5545 result. (The reduction result is expected to have two immediate uses -
5546 one at the latch block, and one at the loop exit). */
5547 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5548 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5549 && !is_gimple_debug (USE_STMT (use_p)))
5550 phis.safe_push (USE_STMT (use_p));
5551
5552       /* While we expect to have found an exit_phi because of loop-closed-ssa
5553 	 form, we can end up without one if the scalar cycle is dead.  */
5554
5555 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5556 {
5557 if (outer_loop)
5558 {
5559 stmt_vec_info exit_phi_vinfo
5560 = loop_vinfo->lookup_stmt (exit_phi);
5561 gphi *vect_phi;
5562
5563 	      /* FORNOW.  Currently we do not support the case in which an
5564 	         inner-loop reduction is not used in the outer-loop (but only
5565 	         outside the outer-loop), unless it is a double reduction.  */
5566 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5567 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5568 || double_reduc);
5569
5570 if (double_reduc)
5571 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5572 else
5573 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5574 if (!double_reduc
5575 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5576 != vect_double_reduction_def)
5577 continue;
5578
5579 /* Handle double reduction:
5580
5581 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5582 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5583 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5584 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5585
5586 At that point the regular reduction (stmt2 and stmt3) is
5587 already vectorized, as well as the exit phi node, stmt4.
5588 Here we vectorize the phi node of double reduction, stmt1, and
5589 update all relevant statements. */
5590
5591 	      /* Go through all the uses of s2 to find the double reduction phi
5592 		 node, i.e., stmt1 above.  */
5593 orig_name = PHI_RESULT (exit_phi);
5594 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5595 {
5596 stmt_vec_info use_stmt_vinfo;
5597 tree vect_phi_init, preheader_arg, vect_phi_res;
5598 basic_block bb = gimple_bb (use_stmt);
5599
5600 		  /* Check that USE_STMT is really a double reduction phi
5601 		     node.  */
5602 if (gimple_code (use_stmt) != GIMPLE_PHI
5603 || gimple_phi_num_args (use_stmt) != 2
5604 || bb->loop_father != outer_loop)
5605 continue;
5606 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5607 if (!use_stmt_vinfo
5608 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5609 != vect_double_reduction_def)
5610 continue;
5611
5612 /* Create vector phi node for double reduction:
5613 vs1 = phi <vs0, vs2>
5614 vs1 was created previously in this function by a call to
5615 vect_get_vec_def_for_operand and is stored in
5616 vec_initial_def;
5617 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5618 vs0 is created here. */
5619
5620 /* Create vector phi node. */
5621 vect_phi = create_phi_node (vec_initial_def, bb);
5622 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5623
5624 /* Create vs0 - initial def of the double reduction phi. */
5625 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5626 loop_preheader_edge (outer_loop));
5627 vect_phi_init = get_initial_def_for_reduction
5628 (stmt_info, preheader_arg, NULL);
5629
5630 /* Update phi node arguments with vs0 and vs2. */
5631 add_phi_arg (vect_phi, vect_phi_init,
5632 loop_preheader_edge (outer_loop),
5633 UNKNOWN_LOCATION);
5634 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5635 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5636 if (dump_enabled_p ())
5637 dump_printf_loc (MSG_NOTE, vect_location,
5638 "created double reduction phi node: %G",
5639 vect_phi);
5640
5641 vect_phi_res = PHI_RESULT (vect_phi);
5642
5643 /* Replace the use, i.e., set the correct vs1 in the regular
5644 reduction phi node. FORNOW, NCOPIES is always 1, so the
5645 loop is redundant. */
5646 stmt_vec_info use_info = reduction_phi_info;
5647 for (j = 0; j < ncopies; j++)
5648 {
5649 edge pr_edge = loop_preheader_edge (loop);
5650 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5651 pr_edge->dest_idx, vect_phi_res);
5652 use_info = STMT_VINFO_RELATED_STMT (use_info);
5653 }
5654 }
5655 }
5656 }
5657
5658 phis.release ();
5659 if (nested_in_vect_loop)
5660 {
5661 if (double_reduc)
5662 loop = outer_loop;
5663 else
5664 continue;
5665 }
5666
5667 phis.create (3);
5668 /* Find the loop-closed-use at the loop exit of the original scalar
5669 result. (The reduction result is expected to have two immediate uses,
5670 one at the latch block, and one at the loop exit). For double
5671 reductions we are looking for exit phis of the outer loop. */
5672 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5673 {
5674 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5675 {
5676 if (!is_gimple_debug (USE_STMT (use_p)))
5677 phis.safe_push (USE_STMT (use_p));
5678 }
5679 else
5680 {
5681 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5682 {
5683 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5684
5685 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5686 {
5687 if (!flow_bb_inside_loop_p (loop,
5688 gimple_bb (USE_STMT (phi_use_p)))
5689 && !is_gimple_debug (USE_STMT (phi_use_p)))
5690 phis.safe_push (USE_STMT (phi_use_p));
5691 }
5692 }
5693 }
5694 }
5695
5696 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5697 {
5698 /* Replace the uses: */
5699 orig_name = PHI_RESULT (exit_phi);
5700 scalar_result = scalar_results[k];
5701 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5702 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5703 SET_USE (use_p, scalar_result);
5704 }
5705
5706 phis.release ();
5707 }
5708 }
5709
5710 /* Return a vector of type VECTYPE that is equal to the vector select
5711 operation "MASK ? VEC : IDENTITY". Insert the select statements
5712 before GSI. */
5713
5714 static tree
5715 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5716 tree vec, tree identity)
5717 {
5718 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5719 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5720 mask, vec, identity);
5721 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5722 return cond;
5723 }
5724
5725 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5726 order, starting with LHS. Insert the extraction statements before GSI and
5727 associate the new scalar SSA names with variable SCALAR_DEST.
5728 Return the SSA name for the result. */
5729
5730 static tree
5731 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5732 tree_code code, tree lhs, tree vector_rhs)
5733 {
5734 tree vectype = TREE_TYPE (vector_rhs);
5735 tree scalar_type = TREE_TYPE (vectype);
5736 tree bitsize = TYPE_SIZE (scalar_type);
5737 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5738 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5739
5740 for (unsigned HOST_WIDE_INT bit_offset = 0;
5741 bit_offset < vec_size_in_bits;
5742 bit_offset += element_bitsize)
5743 {
5744 tree bitpos = bitsize_int (bit_offset);
5745 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5746 bitsize, bitpos);
5747
5748 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5749 rhs = make_ssa_name (scalar_dest, stmt);
5750 gimple_assign_set_lhs (stmt, rhs);
5751 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5752
5753 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5754 tree new_name = make_ssa_name (scalar_dest, stmt);
5755 gimple_assign_set_lhs (stmt, new_name);
5756 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5757 lhs = new_name;
5758 }
5759 return lhs;
5760 }
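/* For illustration (not part of the original code): for a four-element
   VECTOR_RHS v and CODE PLUS_EXPR the function above emits the strictly
   ordered sequence

     s0 = lhs + v[0];  s1 = s0 + v[1];  s2 = s1 + v[2];  s3 = s2 + v[3];

   and returns s3, preserving the left-to-right evaluation order that an
   in-order (fold-left) reduction requires.  */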
5761
5762 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5763 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5764 statement. CODE is the operation performed by STMT_INFO and OPS are
5765 its scalar operands. REDUC_INDEX is the index of the operand in
5766 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5767 implements in-order reduction, or IFN_LAST if we should open-code it.
5768 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5769 that should be used to control the operation in a fully-masked loop. */
5770
5771 static bool
5772 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5773 gimple_stmt_iterator *gsi,
5774 stmt_vec_info *vec_stmt, slp_tree slp_node,
5775 gimple *reduc_def_stmt,
5776 tree_code code, internal_fn reduc_fn,
5777 tree ops[3], tree vectype_in,
5778 int reduc_index, vec_loop_masks *masks)
5779 {
5780 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5781 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5782 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5783 stmt_vec_info new_stmt_info = NULL;
5784
5785 int ncopies;
5786 if (slp_node)
5787 ncopies = 1;
5788 else
5789 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5790
5791 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5792 gcc_assert (ncopies == 1);
5793 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5794 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5795 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5796 == FOLD_LEFT_REDUCTION);
5797
5798 if (slp_node)
5799 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5800 TYPE_VECTOR_SUBPARTS (vectype_in)));
5801
5802 tree op0 = ops[1 - reduc_index];
5803
5804 int group_size = 1;
5805 stmt_vec_info scalar_dest_def_info;
5806 auto_vec<tree> vec_oprnds0;
5807 if (slp_node)
5808 {
5809 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5810 slp_node);
5811 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5812 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5813 }
5814 else
5815 {
5816 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5817 vec_oprnds0.create (1);
5818 vec_oprnds0.quick_push (loop_vec_def0);
5819 scalar_dest_def_info = stmt_info;
5820 }
5821
5822 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5823 tree scalar_type = TREE_TYPE (scalar_dest);
5824 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5825
5826 int vec_num = vec_oprnds0.length ();
5827 gcc_assert (vec_num == 1 || slp_node);
5828 tree vec_elem_type = TREE_TYPE (vectype_out);
5829 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5830
5831 tree vector_identity = NULL_TREE;
5832 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5833 vector_identity = build_zero_cst (vectype_out);
5834
5835 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5836 int i;
5837 tree def0;
5838 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5839 {
5840 gimple *new_stmt;
5841 tree mask = NULL_TREE;
5842 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5843 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5844
5845 /* Handle MINUS by adding the negative. */
5846 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5847 {
5848 tree negated = make_ssa_name (vectype_out);
5849 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5850 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5851 def0 = negated;
5852 }
5853
5854 if (mask)
5855 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5856 vector_identity);
5857
5858 /* On the first iteration the input is simply the scalar phi
5859 result, and for subsequent iterations it is the output of
5860 the preceding operation. */
5861 if (reduc_fn != IFN_LAST)
5862 {
5863 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5864 /* For chained SLP reductions the output of the previous reduction
5865 operation serves as the input of the next. For the final statement
5866 the output cannot be a temporary - we reuse the original
5867 scalar destination of the last statement. */
5868 if (i != vec_num - 1)
5869 {
5870 gimple_set_lhs (new_stmt, scalar_dest_var);
5871 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5872 gimple_set_lhs (new_stmt, reduc_var);
5873 }
5874 }
5875 else
5876 {
5877 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5878 reduc_var, def0);
5879 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5880 /* Remove the statement, so that we can use the same code paths
5881 as for statements that we've just created. */
5882 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5883 gsi_remove (&tmp_gsi, false);
5884 }
5885
5886 if (i == vec_num - 1)
5887 {
5888 gimple_set_lhs (new_stmt, scalar_dest);
5889 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5890 new_stmt);
5891 }
5892 else
5893 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5894 new_stmt, gsi);
5895
5896 if (slp_node)
5897 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5898 }
5899
5900 if (!slp_node)
5901 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5902
5903 return true;
5904 }
5905
5906 /* Function is_nonwrapping_integer_induction.
5907
5908    Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5909    that only increments and does not cause overflow.  */
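/* Illustrative example (not taken from the original comment): for an
   unsigned char induction with base 0 and step 1 in a loop that executes
   at most 200 times, the maximum value 200 still fits in the 8-bit
   precision, so the induction is accepted; with up to 300 iterations it
   would need 9 bits and be rejected, since the value could wrap.  */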
5910
5911 static bool
5912 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5913 {
5914 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5915 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5916 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5917 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5918 widest_int ni, max_loop_value, lhs_max;
5919 wi::overflow_type overflow = wi::OVF_NONE;
5920
5921 /* Make sure the loop is integer based. */
5922 if (TREE_CODE (base) != INTEGER_CST
5923 || TREE_CODE (step) != INTEGER_CST)
5924 return false;
5925
5926   /* Check that the maximum value reached by the induction will not wrap.  */
5927
5928 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5929 return true;
5930
5931 if (! max_stmt_executions (loop, &ni))
5932 return false;
5933
5934 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5935 &overflow);
5936 if (overflow)
5937 return false;
5938
5939 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5940 TYPE_SIGN (lhs_type), &overflow);
5941 if (overflow)
5942 return false;
5943
5944 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5945 <= TYPE_PRECISION (lhs_type));
5946 }
5947
5948 /* Function vectorizable_reduction.
5949
5950 Check if STMT_INFO performs a reduction operation that can be vectorized.
5951 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5952 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5953 Return true if STMT_INFO is vectorizable in this way.
5954
5955 This function also handles reduction idioms (patterns) that have been
5956 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5957 may be of this form:
5958 X = pattern_expr (arg0, arg1, ..., X)
5959 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5960 sequence that had been detected and replaced by the pattern-stmt
5961 (STMT_INFO).
5962
5963 This function also handles reduction of condition expressions, for example:
5964 for (int i = 0; i < N; i++)
5965 if (a[i] < value)
5966 last = a[i];
5967 This is handled by vectorising the loop and creating an additional vector
5968 containing the loop indexes for which "a[i] < value" was true. In the
5969 function epilogue this is reduced to a single max value and then used to
5970 index into the vector of results.
5971
5972 In some cases of reduction patterns, the type of the reduction variable X is
5973 different than the type of the other arguments of STMT_INFO.
5974 In such cases, the vectype that is used when transforming STMT_INFO into
5975 a vector stmt is different than the vectype that is used to determine the
5976 vectorization factor, because it consists of a different number of elements
5977 than the actual number of elements that are being operated upon in parallel.
5978
5979 For example, consider an accumulation of shorts into an int accumulator.
5980 On some targets it's possible to vectorize this pattern operating on 8
5981 shorts at a time (hence, the vectype for purposes of determining the
5982 vectorization factor should be V8HI); on the other hand, the vectype that
5983 is used to create the vector form is actually V4SI (the type of the result).
5984
5985 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5986 indicates what is the actual level of parallelism (V8HI in the example), so
5987 that the right vectorization factor would be derived. This vectype
5988 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5989 be used to create the vectorized stmt. The right vectype for the vectorized
5990 stmt is obtained from the type of the result X:
5991 get_vectype_for_scalar_type (TREE_TYPE (X))
5992
5993 This means that, contrary to "regular" reductions (or "regular" stmts in
5994 general), the following equation:
5995 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5996 does *NOT* necessarily hold for reduction patterns. */
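/* For illustration only (a sketch of the scalar source that the comment
   above refers to, not taken from the original file), on a target with
   128-bit vectors:

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   Here STMT_VINFO_VECTYPE records V8HI (eight shorts determine the
   vectorization factor), while the vectorized statement itself produces
   V4SI values, obtained via get_vectype_for_scalar_type (TREE_TYPE (sum)).  */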
5997
5998 bool
5999 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6000 stmt_vec_info *vec_stmt, slp_tree slp_node,
6001 slp_instance slp_node_instance,
6002 stmt_vector_for_cost *cost_vec)
6003 {
6004 tree vec_dest;
6005 tree scalar_dest;
6006 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6007 tree vectype_in = NULL_TREE;
6008 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6010 enum tree_code code, orig_code;
6011 internal_fn reduc_fn;
6012 machine_mode vec_mode;
6013 int op_type;
6014 optab optab;
6015 tree new_temp = NULL_TREE;
6016 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6017 stmt_vec_info cond_stmt_vinfo = NULL;
6018 enum tree_code cond_reduc_op_code = ERROR_MARK;
6019 tree scalar_type;
6020 bool is_simple_use;
6021 int i;
6022 int ncopies;
6023 int epilog_copies;
6024 stmt_vec_info prev_stmt_info, prev_phi_info;
6025 bool single_defuse_cycle = false;
6026 stmt_vec_info new_stmt_info = NULL;
6027 int j;
6028 tree ops[3];
6029 enum vect_def_type dts[3];
6030 bool nested_cycle = false, found_nested_cycle_def = false;
6031 bool double_reduc = false;
6032 basic_block def_bb;
6033 struct loop * def_stmt_loop;
6034 tree def_arg;
6035 auto_vec<tree> vec_oprnds0;
6036 auto_vec<tree> vec_oprnds1;
6037 auto_vec<tree> vec_oprnds2;
6038 auto_vec<tree> vect_defs;
6039 auto_vec<stmt_vec_info> phis;
6040 int vec_num;
6041 tree def0, tem;
6042 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6043 tree cond_reduc_val = NULL_TREE;
6044
6045 /* Make sure it was already recognized as a reduction computation. */
6046 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6047 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6048 return false;
6049
6050 if (nested_in_vect_loop_p (loop, stmt_info))
6051 {
6052 loop = loop->inner;
6053 nested_cycle = true;
6054 }
6055
6056 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6057 gcc_assert (slp_node
6058 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6059
6060 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6061 {
6062 tree phi_result = gimple_phi_result (phi);
6063 /* Analysis is fully done on the reduction stmt invocation. */
6064 if (! vec_stmt)
6065 {
6066 if (slp_node)
6067 slp_node_instance->reduc_phis = slp_node;
6068
6069 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6070 return true;
6071 }
6072
6073 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6074 /* Leave the scalar phi in place. Note that checking
6075 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6076 for reductions involving a single statement. */
6077 return true;
6078
6079 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6080 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6081
6082 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6083 == EXTRACT_LAST_REDUCTION)
6084 /* Leave the scalar phi in place. */
6085 return true;
6086
6087 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6088 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6089 {
6090 tree op = gimple_op (reduc_stmt, k);
6091 if (op == phi_result)
6092 continue;
6093 if (k == 1
6094 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6095 continue;
6096 if (!vectype_in
6097 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6098 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6099 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6100 break;
6101 }
6102 gcc_assert (vectype_in);
6103
6104 if (slp_node)
6105 ncopies = 1;
6106 else
6107 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6108
6109 stmt_vec_info use_stmt_info;
6110 if (ncopies > 1
6111 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6112 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6113 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6114 single_defuse_cycle = true;
6115
6116 /* Create the destination vector */
6117 scalar_dest = gimple_assign_lhs (reduc_stmt);
6118 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6119
6120 if (slp_node)
6121 /* The size vect_schedule_slp_instance computes is off for us. */
6122 vec_num = vect_get_num_vectors
6123 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6124 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6125 vectype_in);
6126 else
6127 vec_num = 1;
6128
6129 /* Generate the reduction PHIs upfront. */
6130 prev_phi_info = NULL;
6131 for (j = 0; j < ncopies; j++)
6132 {
6133 if (j == 0 || !single_defuse_cycle)
6134 {
6135 for (i = 0; i < vec_num; i++)
6136 {
6137 /* Create the reduction-phi that defines the reduction
6138 operand. */
6139 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6140 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6141
6142 if (slp_node)
6143 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6144 else
6145 {
6146 if (j == 0)
6147 STMT_VINFO_VEC_STMT (stmt_info)
6148 = *vec_stmt = new_phi_info;
6149 else
6150 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6151 prev_phi_info = new_phi_info;
6152 }
6153 }
6154 }
6155 }
6156
6157 return true;
6158 }
6159
6160 /* 1. Is vectorizable reduction? */
6161 /* Not supportable if the reduction variable is used in the loop, unless
6162 it's a reduction chain. */
6163 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6164 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6165 return false;
6166
6167   /* Reductions that are not used even in an enclosing outer-loop are
6168      expected to be "live" (used out of the loop).  */
6169 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6170 && !STMT_VINFO_LIVE_P (stmt_info))
6171 return false;
6172
6173 /* 2. Has this been recognized as a reduction pattern?
6174
6175 Check if STMT represents a pattern that has been recognized
6176 in earlier analysis stages. For stmts that represent a pattern,
6177 the STMT_VINFO_RELATED_STMT field records the last stmt in
6178 the original sequence that constitutes the pattern. */
6179
6180 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6181 if (orig_stmt_info)
6182 {
6183 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6184 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6185 }
6186
6187 /* 3. Check the operands of the operation. The first operands are defined
6188 inside the loop body. The last operand is the reduction variable,
6189 which is defined by the loop-header-phi. */
6190
6191 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6192
6193 /* Flatten RHS. */
6194 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6195 {
6196 case GIMPLE_BINARY_RHS:
6197 code = gimple_assign_rhs_code (stmt);
6198 op_type = TREE_CODE_LENGTH (code);
6199 gcc_assert (op_type == binary_op);
6200 ops[0] = gimple_assign_rhs1 (stmt);
6201 ops[1] = gimple_assign_rhs2 (stmt);
6202 break;
6203
6204 case GIMPLE_TERNARY_RHS:
6205 code = gimple_assign_rhs_code (stmt);
6206 op_type = TREE_CODE_LENGTH (code);
6207 gcc_assert (op_type == ternary_op);
6208 ops[0] = gimple_assign_rhs1 (stmt);
6209 ops[1] = gimple_assign_rhs2 (stmt);
6210 ops[2] = gimple_assign_rhs3 (stmt);
6211 break;
6212
6213 case GIMPLE_UNARY_RHS:
6214 return false;
6215
6216 default:
6217 gcc_unreachable ();
6218 }
6219
6220 if (code == COND_EXPR && slp_node)
6221 return false;
6222
6223 scalar_dest = gimple_assign_lhs (stmt);
6224 scalar_type = TREE_TYPE (scalar_dest);
6225 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6226 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6227 return false;
6228
6229 /* Do not try to vectorize bit-precision reductions. */
6230 if (!type_has_mode_precision_p (scalar_type))
6231 return false;
6232
6233 /* All uses but the last are expected to be defined in the loop.
6234 The last use is the reduction variable. In case of nested cycle this
6235 assumption is not true: we use reduc_index to record the index of the
6236 reduction variable. */
6237 stmt_vec_info reduc_def_info = NULL;
6238 int reduc_index = -1;
6239 for (i = 0; i < op_type; i++)
6240 {
6241 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6242 if (i == 0 && code == COND_EXPR)
6243 continue;
6244
6245 stmt_vec_info def_stmt_info;
6246 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6247 &def_stmt_info);
6248 dt = dts[i];
6249 gcc_assert (is_simple_use);
6250 if (dt == vect_reduction_def)
6251 {
6252 reduc_def_info = def_stmt_info;
6253 reduc_index = i;
6254 continue;
6255 }
6256 else if (tem)
6257 {
6258 /* To properly compute ncopies we are interested in the widest
6259 input type in case we're looking at a widening accumulation. */
6260 if (!vectype_in
6261 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6262 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6263 vectype_in = tem;
6264 }
6265
6266 if (dt != vect_internal_def
6267 && dt != vect_external_def
6268 && dt != vect_constant_def
6269 && dt != vect_induction_def
6270 && !(dt == vect_nested_cycle && nested_cycle))
6271 return false;
6272
6273 if (dt == vect_nested_cycle)
6274 {
6275 found_nested_cycle_def = true;
6276 reduc_def_info = def_stmt_info;
6277 reduc_index = i;
6278 }
6279
6280 if (i == 1 && code == COND_EXPR)
6281 {
6282 /* Record how value of COND_EXPR is defined. */
6283 if (dt == vect_constant_def)
6284 {
6285 cond_reduc_dt = dt;
6286 cond_reduc_val = ops[i];
6287 }
6288 if (dt == vect_induction_def
6289 && def_stmt_info
6290 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6291 {
6292 cond_reduc_dt = dt;
6293 cond_stmt_vinfo = def_stmt_info;
6294 }
6295 }
6296 }
6297
6298 if (!vectype_in)
6299 vectype_in = vectype_out;
6300
6301   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6302      directly used in the stmt.  */
6303 if (reduc_index == -1)
6304 {
6305 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6306 {
6307 if (dump_enabled_p ())
6308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6309 "in-order reduction chain without SLP.\n");
6310 return false;
6311 }
6312
6313 if (orig_stmt_info)
6314 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6315 else
6316 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6317 }
6318
6319 if (! reduc_def_info)
6320 return false;
6321
6322 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6323 if (!reduc_def_phi)
6324 return false;
6325
6326 if (!(reduc_index == -1
6327 || dts[reduc_index] == vect_reduction_def
6328 || dts[reduc_index] == vect_nested_cycle
6329 || ((dts[reduc_index] == vect_internal_def
6330 || dts[reduc_index] == vect_external_def
6331 || dts[reduc_index] == vect_constant_def
6332 || dts[reduc_index] == vect_induction_def)
6333 && nested_cycle && found_nested_cycle_def)))
6334 {
6335 /* For pattern recognized stmts, orig_stmt might be a reduction,
6336 but some helper statements for the pattern might not, or
6337 might be COND_EXPRs with reduction uses in the condition. */
6338 gcc_assert (orig_stmt_info);
6339 return false;
6340 }
6341
6342 /* PHIs should not participate in patterns. */
6343 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6344 enum vect_reduction_type v_reduc_type
6345 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6346 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6347
6348 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6349 /* If we have a condition reduction, see if we can simplify it further. */
6350 if (v_reduc_type == COND_REDUCTION)
6351 {
6352 /* TODO: We can't yet handle reduction chains, since we need to treat
6353 each COND_EXPR in the chain specially, not just the last one.
6354 E.g. for:
6355
6356 x_1 = PHI <x_3, ...>
6357 x_2 = a_2 ? ... : x_1;
6358 x_3 = a_3 ? ... : x_2;
6359
6360 we're interested in the last element in x_3 for which a_2 || a_3
6361 is true, whereas the current reduction chain handling would
6362 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6363 as a reduction operation. */
6364 if (reduc_index == -1)
6365 {
6366 if (dump_enabled_p ())
6367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6368 "conditional reduction chains not supported\n");
6369 return false;
6370 }
6371
6372 /* vect_is_simple_reduction ensured that operand 2 is the
6373 loop-carried operand. */
6374 gcc_assert (reduc_index == 2);
6375
6376       /* Loop peeling modifies the initial value of the reduction PHI, which
6377 	 makes the reduction stmt to be transformed different from the
6378 	 original stmt analyzed.  We need to record the reduction code for a
6379 	 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6380 	 it can be used directly at the transform stage.  */
6381 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6382 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6383 {
6384 /* Also set the reduction type to CONST_COND_REDUCTION. */
6385 gcc_assert (cond_reduc_dt == vect_constant_def);
6386 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6387 }
6388 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6389 vectype_in, OPTIMIZE_FOR_SPEED))
6390 {
6391 if (dump_enabled_p ())
6392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6393 "optimizing condition reduction with"
6394 " FOLD_EXTRACT_LAST.\n");
6395 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6396 }
6397 else if (cond_reduc_dt == vect_induction_def)
6398 {
6399 tree base
6400 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6401 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6402
6403 gcc_assert (TREE_CODE (base) == INTEGER_CST
6404 && TREE_CODE (step) == INTEGER_CST);
6405 cond_reduc_val = NULL_TREE;
6406 	  /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6407 	     MIN_EXPR; for now, punt if BASE is the minimum value of the type
6408 	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
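	  /* Illustrative example (not part of the original code): for a
	     decreasing induction (negative STEP) with BASE == 10, MIN_EXPR
	     is chosen and COND_REDUC_VAL becomes 11 (BASE + 1); for an
	     increasing induction with BASE == 10, MAX_EXPR is chosen and 0
	     is already a suitable value below BASE, so COND_REDUC_VAL
	     becomes 0.  */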
6409 if (tree_int_cst_sgn (step) == -1)
6410 {
6411 cond_reduc_op_code = MIN_EXPR;
6412 if (tree_int_cst_sgn (base) == -1)
6413 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6414 else if (tree_int_cst_lt (base,
6415 TYPE_MAX_VALUE (TREE_TYPE (base))))
6416 cond_reduc_val
6417 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6418 }
6419 else
6420 {
6421 cond_reduc_op_code = MAX_EXPR;
6422 if (tree_int_cst_sgn (base) == 1)
6423 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6424 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6425 base))
6426 cond_reduc_val
6427 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6428 }
6429 if (cond_reduc_val)
6430 {
6431 if (dump_enabled_p ())
6432 dump_printf_loc (MSG_NOTE, vect_location,
6433 "condition expression based on "
6434 "integer induction.\n");
6435 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6436 = INTEGER_INDUC_COND_REDUCTION;
6437 }
6438 }
6439 else if (cond_reduc_dt == vect_constant_def)
6440 {
6441 enum vect_def_type cond_initial_dt;
6442 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6443 tree cond_initial_val
6444 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6445
6446 gcc_assert (cond_reduc_val != NULL_TREE);
6447 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6448 if (cond_initial_dt == vect_constant_def
6449 && types_compatible_p (TREE_TYPE (cond_initial_val),
6450 TREE_TYPE (cond_reduc_val)))
6451 {
6452 tree e = fold_binary (LE_EXPR, boolean_type_node,
6453 cond_initial_val, cond_reduc_val);
6454 if (e && (integer_onep (e) || integer_zerop (e)))
6455 {
6456 if (dump_enabled_p ())
6457 dump_printf_loc (MSG_NOTE, vect_location,
6458 "condition expression based on "
6459 "compile time constant.\n");
6460 /* Record reduction code at analysis stage. */
6461 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6462 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6463 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6464 = CONST_COND_REDUCTION;
6465 }
6466 }
6467 }
6468 }
6469
6470 if (orig_stmt_info)
6471 gcc_assert (tmp == orig_stmt_info
6472 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6473 else
6474     /* We changed STMT to be the first stmt in the reduction chain, hence we
6475        check that in this case the first element in the chain is STMT.  */
6476 gcc_assert (tmp == stmt_info
6477 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6478
6479 if (STMT_VINFO_LIVE_P (reduc_def_info))
6480 return false;
6481
6482 if (slp_node)
6483 ncopies = 1;
6484 else
6485 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6486
6487 gcc_assert (ncopies >= 1);
6488
6489 vec_mode = TYPE_MODE (vectype_in);
6490 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6491
6492 if (code == COND_EXPR)
6493 {
6494 /* Only call during the analysis stage, otherwise we'll lose
6495 STMT_VINFO_TYPE. */
6496 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6497 ops[reduc_index], 0, NULL,
6498 cost_vec))
6499 {
6500 if (dump_enabled_p ())
6501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502 "unsupported condition in reduction\n");
6503 return false;
6504 }
6505 }
6506 else
6507 {
6508 /* 4. Supportable by target? */
6509
6510 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6511 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6512 {
6513 /* Shifts and rotates are only supported by vectorizable_shifts,
6514 not vectorizable_reduction. */
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 "unsupported shift or rotation.\n");
6518 return false;
6519 }
6520
6521 /* 4.1. check support for the operation in the loop */
6522 optab = optab_for_tree_code (code, vectype_in, optab_default);
6523 if (!optab)
6524 {
6525 if (dump_enabled_p ())
6526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527 "no optab.\n");
6528
6529 return false;
6530 }
6531
6532 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf (MSG_NOTE, "op not supported by target.\n");
6536
6537 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6538 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6539 return false;
6540
6541 if (dump_enabled_p ())
6542 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6543 }
6544
6545 /* Worthwhile without SIMD support? */
6546 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6547 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6548 {
6549 if (dump_enabled_p ())
6550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551 "not worthwhile without SIMD support.\n");
6552
6553 return false;
6554 }
6555 }
6556
6557 /* 4.2. Check support for the epilog operation.
6558
6559 If STMT represents a reduction pattern, then the type of the
6560 reduction variable may be different than the type of the rest
6561 of the arguments. For example, consider the case of accumulation
6562      of shorts into an int accumulator.  The original code:
6563 S1: int_a = (int) short_a;
6564 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6565
6566 was replaced with:
6567 STMT: int_acc = widen_sum <short_a, int_acc>
6568
6569 This means that:
6570 1. The tree-code that is used to create the vector operation in the
6571 epilog code (that reduces the partial results) is not the
6572 tree-code of STMT, but is rather the tree-code of the original
6573 stmt from the pattern that STMT is replacing. I.e, in the example
6574 above we want to use 'widen_sum' in the loop, but 'plus' in the
6575 epilog.
6576 2. The type (mode) we use to check available target support
6577 for the vector operation to be created in the *epilog*, is
6578 determined by the type of the reduction variable (in the example
6579 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6580 However the type (mode) we use to check available target support
6581 for the vector operation to be created *inside the loop*, is
6582 determined by the type of the other arguments to STMT (in the
6583 example we'd check this: optab_handler (widen_sum_optab,
6584 vect_short_mode)).
6585
6586 This is contrary to "regular" reductions, in which the types of all
6587 the arguments are the same as the type of the reduction variable.
6588 For "regular" reductions we can therefore use the same vector type
6589 (and also the same tree-code) when generating the epilog code and
6590 when generating the code inside the loop. */
6591
6592 vect_reduction_type reduction_type
6593 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6594 if (orig_stmt_info
6595 && (reduction_type == TREE_CODE_REDUCTION
6596 || reduction_type == FOLD_LEFT_REDUCTION))
6597 {
6598 /* This is a reduction pattern: get the vectype from the type of the
6599 reduction variable, and get the tree-code from orig_stmt. */
6600 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6601 gcc_assert (vectype_out);
6602 vec_mode = TYPE_MODE (vectype_out);
6603 }
6604 else
6605 {
6606       /* Regular reduction: the same vectype and tree-code as used for the
6607          vector code inside the loop can also be used for the epilog code.  */
6608 orig_code = code;
6609
6610 if (code == MINUS_EXPR)
6611 orig_code = PLUS_EXPR;
6612
6613 /* For simple condition reductions, replace with the actual expression
6614 we want to base our reduction around. */
6615 if (reduction_type == CONST_COND_REDUCTION)
6616 {
6617 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6618 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6619 }
6620 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6621 orig_code = cond_reduc_op_code;
6622 }
6623
6624 if (nested_cycle)
6625 {
6626 def_bb = gimple_bb (reduc_def_phi);
6627 def_stmt_loop = def_bb->loop_father;
6628 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6629 loop_preheader_edge (def_stmt_loop));
6630 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6631 if (def_arg_stmt_info
6632 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6633 == vect_double_reduction_def))
6634 double_reduc = true;
6635 }
6636
6637 reduc_fn = IFN_LAST;
6638
6639 if (reduction_type == TREE_CODE_REDUCTION
6640 || reduction_type == FOLD_LEFT_REDUCTION
6641 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6642 || reduction_type == CONST_COND_REDUCTION)
6643 {
6644 if (reduction_type == FOLD_LEFT_REDUCTION
6645 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6646 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6647 {
6648 if (reduc_fn != IFN_LAST
6649 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6650 OPTIMIZE_FOR_SPEED))
6651 {
6652 if (dump_enabled_p ())
6653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6654 "reduc op not supported by target.\n");
6655
6656 reduc_fn = IFN_LAST;
6657 }
6658 }
6659 else
6660 {
6661 if (!nested_cycle || double_reduc)
6662 {
6663 if (dump_enabled_p ())
6664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6665 "no reduc code for scalar code.\n");
6666
6667 return false;
6668 }
6669 }
6670 }
6671 else if (reduction_type == COND_REDUCTION)
6672 {
6673 int scalar_precision
6674 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6675 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6676 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6677 nunits_out);
6678
6679 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6680 OPTIMIZE_FOR_SPEED))
6681 reduc_fn = IFN_REDUC_MAX;
6682 }
6683
6684 if (reduction_type != EXTRACT_LAST_REDUCTION
6685 && (!nested_cycle || double_reduc)
6686 && reduc_fn == IFN_LAST
6687 && !nunits_out.is_constant ())
6688 {
6689 if (dump_enabled_p ())
6690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6691 "missing target support for reduction on"
6692 " variable-length vectors.\n");
6693 return false;
6694 }
6695
6696 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6697 && ncopies > 1)
6698 {
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6701 "multiple types in double reduction or condition "
6702 "reduction.\n");
6703 return false;
6704 }
6705
6706 /* For SLP reductions, see if there is a neutral value we can use. */
6707 tree neutral_op = NULL_TREE;
6708 if (slp_node)
6709 neutral_op = neutral_op_for_slp_reduction
6710 (slp_node_instance->reduc_phis, code,
6711 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6712
6713 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6714 {
6715 /* We can't support in-order reductions of code such as this:
6716
6717 for (int i = 0; i < n1; ++i)
6718 for (int j = 0; j < n2; ++j)
6719 l += a[j];
6720
6721 since GCC effectively transforms the loop when vectorizing:
6722
6723 for (int i = 0; i < n1 / VF; ++i)
6724 for (int j = 0; j < n2; ++j)
6725 for (int k = 0; k < VF; ++k)
6726 l += a[j];
6727
6728 which is a reassociation of the original operation. */
6729 if (dump_enabled_p ())
6730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6731 "in-order double reduction not supported.\n");
6732
6733 return false;
6734 }
6735
6736 if (reduction_type == FOLD_LEFT_REDUCTION
6737 && slp_node
6738 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6739 {
6740 /* We cannot use in-order reductions in this case because there is
6741 an implicit reassociation of the operations involved. */
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "in-order unchained SLP reductions not supported.\n");
6745 return false;
6746 }
6747
6748 /* For double reductions, and for SLP reductions with a neutral value,
6749 we construct a variable-length initial vector by loading a vector
6750 full of the neutral value and then shift-and-inserting the start
6751 values into the low-numbered elements. */
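  /* A small sketch of that construction (values illustrative): for a plus
     reduction with scalar start value S and neutral value 0, we splat the
     neutral value and then shift-and-insert the start value, giving

       { 0, 0, ..., 0 }  -->  { S, 0, ..., 0 }

     so summing every lane of the final vector still yields the correct
     scalar result whatever the runtime vector length turns out to be.  */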
6752 if ((double_reduc || neutral_op)
6753 && !nunits_out.is_constant ()
6754 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6755 vectype_out, OPTIMIZE_FOR_SPEED))
6756 {
6757 if (dump_enabled_p ())
6758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759 "reduction on variable-length vectors requires"
6760 " target support for a vector-shift-and-insert"
6761 " operation.\n");
6762 return false;
6763 }
6764
6765 /* Check extra constraints for variable-length unchained SLP reductions. */
6766 if (STMT_SLP_TYPE (stmt_info)
6767 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6768 && !nunits_out.is_constant ())
6769 {
6770 /* We checked above that we could build the initial vector when
6771 there's a neutral element value. Check here for the case in
6772 which each SLP statement has its own initial value and in which
6773 that value needs to be repeated for every instance of the
6774 statement within the initial vector. */
6775 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6776 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6777 if (!neutral_op
6778 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6779 {
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6782 "unsupported form of SLP reduction for"
6783 " variable-length vectors: cannot build"
6784 " initial vector.\n");
6785 return false;
6786 }
6787 /* The epilogue code relies on the number of elements being a multiple
6788 of the group size. The duplicate-and-interleave approach to setting
6789 up the initial vector does too. */
6790 if (!multiple_p (nunits_out, group_size))
6791 {
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794 "unsupported form of SLP reduction for"
6795 " variable-length vectors: the vector size"
6796 " is not a multiple of the number of results.\n");
6797 return false;
6798 }
6799 }
6800
6801 /* In case of a widening multiplication by a constant, we update the type
6802 of the constant to be the type of the other operand. We check that the
6803 constant fits the type in the pattern recognition pass. */
6804 if (code == DOT_PROD_EXPR
6805 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6806 {
6807 if (TREE_CODE (ops[0]) == INTEGER_CST)
6808 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6809 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6810 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6811 else
6812 {
6813 if (dump_enabled_p ())
6814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6815 "invalid types in dot-prod\n");
6816
6817 return false;
6818 }
6819 }
6820
6821 if (reduction_type == COND_REDUCTION)
6822 {
6823 widest_int ni;
6824
6825 if (! max_loop_iterations (loop, &ni))
6826 {
6827 if (dump_enabled_p ())
6828 dump_printf_loc (MSG_NOTE, vect_location,
6829 "loop count not known, cannot create cond "
6830 "reduction.\n");
6831 return false;
6832 }
6833 /* Convert backedges to iterations. */
6834 ni += 1;
6835
6836 /* The additional index will be the same type as the condition. Check
6837 that the loop count fits into this type less one (because we'll use up
6838 the zero slot for when there are no matches). */
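     /* Worked example (numbers illustrative): if the index type ends up as
	an 8-bit unsigned type, its maximum value is 255; with index 0
	reserved for "no match", the check below accepts loops of at most
	254 iterations and rejects, say, a loop whose maximum iteration
	count is 300.  */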
6839 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6840 if (wi::geu_p (ni, wi::to_widest (max_index)))
6841 {
6842 if (dump_enabled_p ())
6843 dump_printf_loc (MSG_NOTE, vect_location,
6844 "loop size is greater than data size.\n");
6845 return false;
6846 }
6847 }
6848
6849 /* In case the vectorization factor (VF) is bigger than the number
6850 of elements that we can fit in a vectype (nunits), we have to generate
6851 more than one vector stmt, i.e. we need to "unroll" the
6852 vector stmt by a factor VF/nunits. For more details see documentation
6853 in vectorizable_operation. */
6854
6855 /* If the reduction is used in an outer loop we need to generate
6856 VF intermediate results, like so (e.g. for ncopies=2):
6857 r0 = phi (init, r0)
6858 r1 = phi (init, r1)
6859 r0 = x0 + r0;
6860 r1 = x1 + r1;
6861 (i.e. we generate VF results in 2 registers).
6862 In this case we have a separate def-use cycle for each copy, and therefore
6863 for each copy we get the vector def for the reduction variable from the
6864 respective phi node created for this copy.
6865
6866 Otherwise (the reduction is unused in the loop nest), we can combine
6867 together intermediate results, like so (e.g. for ncopies=2):
6868 r = phi (init, r)
6869 r = x0 + r;
6870 r = x1 + r;
6871 (i.e. we generate VF/2 results in a single register).
6872 In this case for each copy we get the vector def for the reduction variable
6873 from the vectorized reduction operation generated in the previous iteration.
6874
6875 This only works when we see both the reduction PHI and its only consumer
6876 in vectorizable_reduction and there are no intermediate stmts
6877 participating. */
6878 stmt_vec_info use_stmt_info;
6879 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6880 if (ncopies > 1
6881 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6882 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6883 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6884 {
6885 single_defuse_cycle = true;
6886 epilog_copies = 1;
6887 }
6888 else
6889 epilog_copies = ncopies;
6890
6891 /* If the reduction stmt is one of the patterns that have a lane-reducing
6892 operation embedded, we cannot handle the case of !single_defuse_cycle. */
6893 if ((ncopies > 1
6894 && ! single_defuse_cycle)
6895 && (code == DOT_PROD_EXPR
6896 || code == WIDEN_SUM_EXPR
6897 || code == SAD_EXPR))
6898 {
6899 if (dump_enabled_p ())
6900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6901 "multi def-use cycle not possible for lane-reducing "
6902 "reduction operation\n");
6903 return false;
6904 }
6905
6906 if (slp_node)
6907 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6908 else
6909 vec_num = 1;
6910
6911 internal_fn cond_fn = get_conditional_internal_fn (code);
6912 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6913
6914 if (!vec_stmt) /* transformation not required. */
6915 {
6916 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6917 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6918 {
6919 if (reduction_type != FOLD_LEFT_REDUCTION
6920 && (cond_fn == IFN_LAST
6921 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6922 OPTIMIZE_FOR_SPEED)))
6923 {
6924 if (dump_enabled_p ())
6925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6926 "can't use a fully-masked loop because no"
6927 " conditional operation is available.\n");
6928 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6929 }
6930 else if (reduc_index == -1)
6931 {
6932 if (dump_enabled_p ())
6933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6934 "can't use a fully-masked loop for chained"
6935 " reductions.\n");
6936 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6937 }
6938 else
6939 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6940 vectype_in);
6941 }
6942 if (dump_enabled_p ()
6943 && reduction_type == FOLD_LEFT_REDUCTION)
6944 dump_printf_loc (MSG_NOTE, vect_location,
6945 "using an in-order (fold-left) reduction.\n");
6946 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6947 return true;
6948 }
6949
6950 /* Transform. */
6951
6952 if (dump_enabled_p ())
6953 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6954
6955 /* FORNOW: Multiple types are not supported for condition. */
6956 if (code == COND_EXPR)
6957 gcc_assert (ncopies == 1);
6958
6959 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6960
6961 if (reduction_type == FOLD_LEFT_REDUCTION)
6962 return vectorize_fold_left_reduction
6963 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6964 reduc_fn, ops, vectype_in, reduc_index, masks);
6965
6966 if (reduction_type == EXTRACT_LAST_REDUCTION)
6967 {
6968 gcc_assert (!slp_node);
6969 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6970 NULL, reduc_index, NULL, NULL);
6971 }
6972
6973 /* Create the destination vector. */
6974 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6975
6976 prev_stmt_info = NULL;
6977 prev_phi_info = NULL;
6978 if (!slp_node)
6979 {
6980 vec_oprnds0.create (1);
6981 vec_oprnds1.create (1);
6982 if (op_type == ternary_op)
6983 vec_oprnds2.create (1);
6984 }
6985
6986 phis.create (vec_num);
6987 vect_defs.create (vec_num);
6988 if (!slp_node)
6989 vect_defs.quick_push (NULL_TREE);
6990
6991 if (slp_node)
6992 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6993 else
6994 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6995
6996 for (j = 0; j < ncopies; j++)
6997 {
6998 if (code == COND_EXPR)
6999 {
7000 gcc_assert (!slp_node);
7001 vectorizable_condition (stmt_info, gsi, vec_stmt,
7002 PHI_RESULT (phis[0]->stmt),
7003 reduc_index, NULL, NULL);
7004 /* Multiple types are not supported for condition. */
7005 break;
7006 }
7007
7008 /* Handle uses. */
7009 if (j == 0)
7010 {
7011 if (slp_node)
7012 {
7013 /* Get vec defs for all the operands except the reduction index,
7014 ensuring the ordering of the ops in the vector is kept. */
7015 auto_vec<tree, 3> slp_ops;
7016 auto_vec<vec<tree>, 3> vec_defs;
7017
7018 slp_ops.quick_push (ops[0]);
7019 slp_ops.quick_push (ops[1]);
7020 if (op_type == ternary_op)
7021 slp_ops.quick_push (ops[2]);
7022
7023 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7024
7025 vec_oprnds0.safe_splice (vec_defs[0]);
7026 vec_defs[0].release ();
7027 vec_oprnds1.safe_splice (vec_defs[1]);
7028 vec_defs[1].release ();
7029 if (op_type == ternary_op)
7030 {
7031 vec_oprnds2.safe_splice (vec_defs[2]);
7032 vec_defs[2].release ();
7033 }
7034 }
7035 else
7036 {
7037 vec_oprnds0.quick_push
7038 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7039 vec_oprnds1.quick_push
7040 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7041 if (op_type == ternary_op)
7042 vec_oprnds2.quick_push
7043 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7044 }
7045 }
7046 else
7047 {
7048 if (!slp_node)
7049 {
7050 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7051
7052 if (single_defuse_cycle && reduc_index == 0)
7053 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7054 else
7055 vec_oprnds0[0]
7056 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7057 vec_oprnds0[0]);
7058 if (single_defuse_cycle && reduc_index == 1)
7059 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7060 else
7061 vec_oprnds1[0]
7062 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7063 vec_oprnds1[0]);
7064 if (op_type == ternary_op)
7065 {
7066 if (single_defuse_cycle && reduc_index == 2)
7067 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7068 else
7069 vec_oprnds2[0]
7070 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7071 vec_oprnds2[0]);
7072 }
7073 }
7074 }
7075
7076 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7077 {
7078 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7079 if (masked_loop_p)
7080 {
7081 /* Make sure that the reduction accumulator is vop[0]. */
7082 if (reduc_index == 1)
7083 {
7084 gcc_assert (commutative_tree_code (code));
7085 std::swap (vop[0], vop[1]);
7086 }
7087 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7088 vectype_in, i * ncopies + j);
7089 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7090 vop[0], vop[1],
7091 vop[0]);
7092 new_temp = make_ssa_name (vec_dest, call);
7093 gimple_call_set_lhs (call, new_temp);
7094 gimple_call_set_nothrow (call, true);
7095 new_stmt_info
7096 = vect_finish_stmt_generation (stmt_info, call, gsi);
7097 }
7098 else
7099 {
7100 if (op_type == ternary_op)
7101 vop[2] = vec_oprnds2[i];
7102
7103 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7104 vop[0], vop[1], vop[2]);
7105 new_temp = make_ssa_name (vec_dest, new_stmt);
7106 gimple_assign_set_lhs (new_stmt, new_temp);
7107 new_stmt_info
7108 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7109 }
7110
7111 if (slp_node)
7112 {
7113 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7114 vect_defs.quick_push (new_temp);
7115 }
7116 else
7117 vect_defs[0] = new_temp;
7118 }
7119
7120 if (slp_node)
7121 continue;
7122
7123 if (j == 0)
7124 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7125 else
7126 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7127
7128 prev_stmt_info = new_stmt_info;
7129 }
7130
7131 /* Finalize the reduction-phi (set its arguments) and create the
7132 epilog reduction code. */
7133 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7134 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7135
7136 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7137 epilog_copies, reduc_fn, phis,
7138 double_reduc, slp_node, slp_node_instance,
7139 cond_reduc_val, cond_reduc_op_code,
7140 neutral_op);
7141
7142 return true;
7143 }
7144
7145 /* Function vect_min_worthwhile_factor.
7146
7147 For a loop where we could vectorize the operation indicated by CODE,
7148 return the minimum vectorization factor that makes it worthwhile
7149 to use generic vectors. */
7150 static unsigned int
7151 vect_min_worthwhile_factor (enum tree_code code)
7152 {
7153 switch (code)
7154 {
7155 case PLUS_EXPR:
7156 case MINUS_EXPR:
7157 case NEGATE_EXPR:
7158 return 4;
7159
7160 case BIT_AND_EXPR:
7161 case BIT_IOR_EXPR:
7162 case BIT_XOR_EXPR:
7163 case BIT_NOT_EXPR:
7164 return 2;
7165
7166 default:
7167 return INT_MAX;
7168 }
7169 }
7170
7171 /* Return true if VINFO indicates we are doing loop vectorization and if
7172 it is worth decomposing CODE operations into scalar operations for
7173 that loop's vectorization factor. */
7174
7175 bool
7176 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7177 {
7178 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7179 unsigned HOST_WIDE_INT value;
7180 return (loop_vinfo
7181 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7182 && value >= vect_min_worthwhile_factor (code));
7183 }
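/* Illustrative example for the two helpers above (thresholds as defined in
   vect_min_worthwhile_factor): with a compile-time vectorization factor of
   2, emulating BIT_AND_EXPR without SIMD is considered worthwhile
   (threshold 2) but PLUS_EXPR is not (threshold 4); with a factor of 4 or
   more, both qualify.  A caller would test e.g.

     if (vect_worthwhile_without_simd_p (vinfo, PLUS_EXPR))
       ...

   before deciding to decompose the operation.  */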
7184
7185 /* Function vectorizable_induction
7186
7187 Check if STMT_INFO performs an induction computation that can be vectorized.
7188 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7189 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7190 Return true if STMT_INFO is vectorizable in this way. */
7191
7192 bool
7193 vectorizable_induction (stmt_vec_info stmt_info,
7194 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7195 stmt_vec_info *vec_stmt, slp_tree slp_node,
7196 stmt_vector_for_cost *cost_vec)
7197 {
7198 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7200 unsigned ncopies;
7201 bool nested_in_vect_loop = false;
7202 struct loop *iv_loop;
7203 tree vec_def;
7204 edge pe = loop_preheader_edge (loop);
7205 basic_block new_bb;
7206 tree new_vec, vec_init, vec_step, t;
7207 tree new_name;
7208 gimple *new_stmt;
7209 gphi *induction_phi;
7210 tree induc_def, vec_dest;
7211 tree init_expr, step_expr;
7212 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7213 unsigned i;
7214 tree expr;
7215 gimple_seq stmts;
7216 imm_use_iterator imm_iter;
7217 use_operand_p use_p;
7218 gimple *exit_phi;
7219 edge latch_e;
7220 tree loop_arg;
7221 gimple_stmt_iterator si;
7222
7223 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7224 if (!phi)
7225 return false;
7226
7227 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7228 return false;
7229
7230 /* Make sure it was recognized as induction computation. */
7231 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7232 return false;
7233
7234 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7235 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7236
7237 if (slp_node)
7238 ncopies = 1;
7239 else
7240 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7241 gcc_assert (ncopies >= 1);
7242
7243 /* FORNOW. These restrictions should be relaxed. */
7244 if (nested_in_vect_loop_p (loop, stmt_info))
7245 {
7246 imm_use_iterator imm_iter;
7247 use_operand_p use_p;
7248 gimple *exit_phi;
7249 edge latch_e;
7250 tree loop_arg;
7251
7252 if (ncopies > 1)
7253 {
7254 if (dump_enabled_p ())
7255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7256 "multiple types in nested loop.\n");
7257 return false;
7258 }
7259
7260 /* FORNOW: outer loop induction with SLP not supported. */
7261 if (STMT_SLP_TYPE (stmt_info))
7262 return false;
7263
7264 exit_phi = NULL;
7265 latch_e = loop_latch_edge (loop->inner);
7266 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7267 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7268 {
7269 gimple *use_stmt = USE_STMT (use_p);
7270 if (is_gimple_debug (use_stmt))
7271 continue;
7272
7273 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7274 {
7275 exit_phi = use_stmt;
7276 break;
7277 }
7278 }
7279 if (exit_phi)
7280 {
7281 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7282 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7283 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7284 {
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7287 "inner-loop induction only used outside "
7288 "of the outer vectorized loop.\n");
7289 return false;
7290 }
7291 }
7292
7293 nested_in_vect_loop = true;
7294 iv_loop = loop->inner;
7295 }
7296 else
7297 iv_loop = loop;
7298 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7299
7300 if (slp_node && !nunits.is_constant ())
7301 {
7302 /* The current SLP code creates the initial value element-by-element. */
7303 if (dump_enabled_p ())
7304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7305 "SLP induction not supported for variable-length"
7306 " vectors.\n");
7307 return false;
7308 }
7309
7310 if (!vec_stmt) /* transformation not required. */
7311 {
7312 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7313 DUMP_VECT_SCOPE ("vectorizable_induction");
7314 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7315 return true;
7316 }
7317
7318 /* Transform. */
7319
7320 /* Compute a vector variable, initialized with the first VF values of
7321 the induction variable. E.g., for an iv with IV_PHI='X' and
7322 evolution S, for a vector of 4 units, we want to compute:
7323 [X, X + S, X + 2*S, X + 3*S]. */
7324
7325 if (dump_enabled_p ())
7326 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7327
7328 latch_e = loop_latch_edge (iv_loop);
7329 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7330
7331 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7332 gcc_assert (step_expr != NULL_TREE);
7333
7334 pe = loop_preheader_edge (iv_loop);
7335 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7336 loop_preheader_edge (iv_loop));
7337
7338 stmts = NULL;
7339 if (!nested_in_vect_loop)
7340 {
7341 /* Convert the initial value to the desired type. */
7342 tree new_type = TREE_TYPE (vectype);
7343 init_expr = gimple_convert (&stmts, new_type, init_expr);
7344
7345 /* If we are using the loop mask to "peel" for alignment then we need
7346 to adjust the start value here. */
7347 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7348 if (skip_niters != NULL_TREE)
7349 {
7350 if (FLOAT_TYPE_P (vectype))
7351 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7352 skip_niters);
7353 else
7354 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7355 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7356 skip_niters, step_expr);
7357 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7358 init_expr, skip_step);
7359 }
7360 }
7361
7362 /* Convert the step to the desired type. */
7363 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7364
7365 if (stmts)
7366 {
7367 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7368 gcc_assert (!new_bb);
7369 }
7370
7371 /* Find the first insertion point in the BB. */
7372 basic_block bb = gimple_bb (phi);
7373 si = gsi_after_labels (bb);
7374
7375 /* For SLP induction we have to generate several IVs as for example
7376 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7377 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7378 [VF*S, VF*S, VF*S, VF*S] for all. */
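  /* To make the arithmetic below concrete (same illustrative group size 3
     and 4-lane vectors as above): nivs = least_common_multiple (3, 4) / 4
     = 3, so three induction phis cover lcm (3, 4) = 12 lanes before the
     pattern of group members repeats; any remaining vector stmts are then
     derived from those three by adding a uniform step vector (see the
     "Re-use IVs" code below).  */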
7379 if (slp_node)
7380 {
7381 /* Enforced above. */
7382 unsigned int const_nunits = nunits.to_constant ();
7383
7384 /* Generate [VF*S, VF*S, ... ]. */
7385 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7386 {
7387 expr = build_int_cst (integer_type_node, vf);
7388 expr = fold_convert (TREE_TYPE (step_expr), expr);
7389 }
7390 else
7391 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7392 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7393 expr, step_expr);
7394 if (! CONSTANT_CLASS_P (new_name))
7395 new_name = vect_init_vector (stmt_info, new_name,
7396 TREE_TYPE (step_expr), NULL);
7397 new_vec = build_vector_from_val (vectype, new_name);
7398 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7399
7400 /* Now generate the IVs. */
7401 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7402 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7403 unsigned elts = const_nunits * nvects;
7404 unsigned nivs = least_common_multiple (group_size,
7405 const_nunits) / const_nunits;
7406 gcc_assert (elts % group_size == 0);
7407 tree elt = init_expr;
7408 unsigned ivn;
7409 for (ivn = 0; ivn < nivs; ++ivn)
7410 {
7411 tree_vector_builder elts (vectype, const_nunits, 1);
7412 stmts = NULL;
7413 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7414 {
7415 if (ivn*const_nunits + eltn >= group_size
7416 && (ivn * const_nunits + eltn) % group_size == 0)
7417 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7418 elt, step_expr);
7419 elts.quick_push (elt);
7420 }
7421 vec_init = gimple_build_vector (&stmts, &elts);
7422 if (stmts)
7423 {
7424 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7425 gcc_assert (!new_bb);
7426 }
7427
7428 /* Create the induction-phi that defines the induction-operand. */
7429 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7430 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7431 stmt_vec_info induction_phi_info
7432 = loop_vinfo->add_stmt (induction_phi);
7433 induc_def = PHI_RESULT (induction_phi);
7434
7435 /* Create the iv update inside the loop */
7436 vec_def = make_ssa_name (vec_dest);
7437 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7438 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7439 loop_vinfo->add_stmt (new_stmt);
7440
7441 /* Set the arguments of the phi node: */
7442 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7443 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7444 UNKNOWN_LOCATION);
7445
7446 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7447 }
7448
7449 /* Re-use IVs when we can. */
7450 if (ivn < nvects)
7451 {
7452 unsigned vfp
7453 = least_common_multiple (group_size, const_nunits) / group_size;
7454 /* Generate [VF'*S, VF'*S, ... ]. */
7455 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7456 {
7457 expr = build_int_cst (integer_type_node, vfp);
7458 expr = fold_convert (TREE_TYPE (step_expr), expr);
7459 }
7460 else
7461 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7462 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7463 expr, step_expr);
7464 if (! CONSTANT_CLASS_P (new_name))
7465 new_name = vect_init_vector (stmt_info, new_name,
7466 TREE_TYPE (step_expr), NULL);
7467 new_vec = build_vector_from_val (vectype, new_name);
7468 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7469 for (; ivn < nvects; ++ivn)
7470 {
7471 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7472 tree def;
7473 if (gimple_code (iv) == GIMPLE_PHI)
7474 def = gimple_phi_result (iv);
7475 else
7476 def = gimple_assign_lhs (iv);
7477 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7478 PLUS_EXPR,
7479 def, vec_step);
7480 if (gimple_code (iv) == GIMPLE_PHI)
7481 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7482 else
7483 {
7484 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7485 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7486 }
7487 SLP_TREE_VEC_STMTS (slp_node).quick_push
7488 (loop_vinfo->add_stmt (new_stmt));
7489 }
7490 }
7491
7492 return true;
7493 }
7494
7495 /* Create the vector that holds the initial_value of the induction. */
7496 if (nested_in_vect_loop)
7497 {
7498 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7499 been created during vectorization of previous stmts. We obtain it
7500 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7501 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7502 /* If the initial value is not of proper type, convert it. */
7503 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7504 {
7505 new_stmt
7506 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7507 vect_simple_var,
7508 "vec_iv_"),
7509 VIEW_CONVERT_EXPR,
7510 build1 (VIEW_CONVERT_EXPR, vectype,
7511 vec_init));
7512 vec_init = gimple_assign_lhs (new_stmt);
7513 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7514 new_stmt);
7515 gcc_assert (!new_bb);
7516 loop_vinfo->add_stmt (new_stmt);
7517 }
7518 }
7519 else
7520 {
7521 /* iv_loop is the loop to be vectorized. Create:
7522 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7523 stmts = NULL;
7524 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7525
7526 unsigned HOST_WIDE_INT const_nunits;
7527 if (nunits.is_constant (&const_nunits))
7528 {
7529 tree_vector_builder elts (vectype, const_nunits, 1);
7530 elts.quick_push (new_name);
7531 for (i = 1; i < const_nunits; i++)
7532 {
7533 /* Create: new_name_i = new_name + step_expr */
7534 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7535 new_name, step_expr);
7536 elts.quick_push (new_name);
7537 }
7538 /* Create a vector from [new_name_0, new_name_1, ...,
7539 new_name_nunits-1] */
7540 vec_init = gimple_build_vector (&stmts, &elts);
7541 }
7542 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7543 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7544 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7545 new_name, step_expr);
7546 else
7547 {
7548 /* Build:
7549 [base, base, base, ...]
7550 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
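	  /* E.g. (values illustrative): base 1.0 and step 0.5 with four
	     lanes give
	       { 1.0, 1.0, 1.0, 1.0 } + { 0, 1, 2, 3 } * { 0.5, 0.5, 0.5, 0.5 }
	       = { 1.0, 1.5, 2.0, 2.5 }.  */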
7551 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7552 gcc_assert (flag_associative_math);
7553 tree index = build_index_vector (vectype, 0, 1);
7554 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7555 new_name);
7556 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7557 step_expr);
7558 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7559 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7560 vec_init, step_vec);
7561 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7562 vec_init, base_vec);
7563 }
7564
7565 if (stmts)
7566 {
7567 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7568 gcc_assert (!new_bb);
7569 }
7570 }
7571
7572
7573 /* Create the vector that holds the step of the induction. */
7574 if (nested_in_vect_loop)
7575 /* iv_loop is nested in the loop to be vectorized. Generate:
7576 vec_step = [S, S, S, S] */
7577 new_name = step_expr;
7578 else
7579 {
7580 /* iv_loop is the loop to be vectorized. Generate:
7581 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7582 gimple_seq seq = NULL;
7583 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7584 {
7585 expr = build_int_cst (integer_type_node, vf);
7586 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7587 }
7588 else
7589 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7590 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7591 expr, step_expr);
7592 if (seq)
7593 {
7594 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7595 gcc_assert (!new_bb);
7596 }
7597 }
7598
7599 t = unshare_expr (new_name);
7600 gcc_assert (CONSTANT_CLASS_P (new_name)
7601 || TREE_CODE (new_name) == SSA_NAME);
7602 new_vec = build_vector_from_val (vectype, t);
7603 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7604
7605
7606 /* Create the following def-use cycle:
7607 loop prolog:
7608 vec_init = ...
7609 vec_step = ...
7610 loop:
7611 vec_iv = PHI <vec_init, vec_loop>
7612 ...
7613 STMT
7614 ...
7615 vec_loop = vec_iv + vec_step; */
7616
7617 /* Create the induction-phi that defines the induction-operand. */
7618 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7619 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7620 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7621 induc_def = PHI_RESULT (induction_phi);
7622
7623 /* Create the iv update inside the loop */
7624 vec_def = make_ssa_name (vec_dest);
7625 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7626 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7627 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7628
7629 /* Set the arguments of the phi node: */
7630 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7631 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7632 UNKNOWN_LOCATION);
7633
7634 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7635
7636 /* In case the vectorization factor (VF) is bigger than the number
7637 of elements that we can fit in a vectype (nunits), we have to generate
7638 more than one vector stmt, i.e. we need to "unroll" the
7639 vector stmt by a factor VF/nunits. For more details see documentation
7640 in vectorizable_operation. */
7641
7642 if (ncopies > 1)
7643 {
7644 gimple_seq seq = NULL;
7645 stmt_vec_info prev_stmt_vinfo;
7646 /* FORNOW. This restriction should be relaxed. */
7647 gcc_assert (!nested_in_vect_loop);
7648
7649 /* Create the vector that holds the step of the induction. */
7650 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7651 {
7652 expr = build_int_cst (integer_type_node, nunits);
7653 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7654 }
7655 else
7656 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7657 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7658 expr, step_expr);
7659 if (seq)
7660 {
7661 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7662 gcc_assert (!new_bb);
7663 }
7664
7665 t = unshare_expr (new_name);
7666 gcc_assert (CONSTANT_CLASS_P (new_name)
7667 || TREE_CODE (new_name) == SSA_NAME);
7668 new_vec = build_vector_from_val (vectype, t);
7669 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7670
7671 vec_def = induc_def;
7672 prev_stmt_vinfo = induction_phi_info;
7673 for (i = 1; i < ncopies; i++)
7674 {
7675 /* vec_i = vec_prev + vec_step */
7676 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7677 vec_def, vec_step);
7678 vec_def = make_ssa_name (vec_dest, new_stmt);
7679 gimple_assign_set_lhs (new_stmt, vec_def);
7680
7681 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7682 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7683 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7684 prev_stmt_vinfo = new_stmt_info;
7685 }
7686 }
7687
7688 if (nested_in_vect_loop)
7689 {
7690 /* Find the loop-closed exit-phi of the induction, and record
7691 the final vector of induction results: */
7692 exit_phi = NULL;
7693 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7694 {
7695 gimple *use_stmt = USE_STMT (use_p);
7696 if (is_gimple_debug (use_stmt))
7697 continue;
7698
7699 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7700 {
7701 exit_phi = use_stmt;
7702 break;
7703 }
7704 }
7705 if (exit_phi)
7706 {
7707 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7708 /* FORNOW. Currently not supporting the case that an inner-loop induction
7709 is not used in the outer-loop (i.e. only outside the outer-loop). */
7710 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7711 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7712
7713 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7714 if (dump_enabled_p ())
7715 dump_printf_loc (MSG_NOTE, vect_location,
7716 "vector of inductions after inner-loop:%G",
7717 new_stmt);
7718 }
7719 }
7720
7721
7722 if (dump_enabled_p ())
7723 dump_printf_loc (MSG_NOTE, vect_location,
7724 "transform induction: created def-use cycle: %G%G",
7725 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7726
7727 return true;
7728 }
7729
7730 /* Function vectorizable_live_operation.
7731
7732 STMT_INFO computes a value that is used outside the loop. Check if
7733 it can be supported. */
7734
7735 bool
7736 vectorizable_live_operation (stmt_vec_info stmt_info,
7737 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7738 slp_tree slp_node, int slp_index,
7739 stmt_vec_info *vec_stmt,
7740 stmt_vector_for_cost *)
7741 {
7742 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7743 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7744 imm_use_iterator imm_iter;
7745 tree lhs, lhs_type, bitsize, vec_bitsize;
7746 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7747 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7748 int ncopies;
7749 gimple *use_stmt;
7750 auto_vec<tree> vec_oprnds;
7751 int vec_entry = 0;
7752 poly_uint64 vec_index = 0;
7753
7754 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7755
7756 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7757 return false;
7758
7759 /* FORNOW. CHECKME. */
7760 if (nested_in_vect_loop_p (loop, stmt_info))
7761 return false;
7762
7763 /* If STMT is not relevant and it is a simple assignment and its inputs are
7764 invariant then it can remain in place, unvectorized. The original last
7765 scalar value that it computes will be used. */
7766 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7767 {
7768 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7769 if (dump_enabled_p ())
7770 dump_printf_loc (MSG_NOTE, vect_location,
7771 "statement is simple and uses invariant. Leaving in "
7772 "place.\n");
7773 return true;
7774 }
7775
7776 if (slp_node)
7777 ncopies = 1;
7778 else
7779 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7780
7781 if (slp_node)
7782 {
7783 gcc_assert (slp_index >= 0);
7784
7785 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7786 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7787
7788 /* Get the last occurrence of the scalar index from the concatenation of
7789 all the slp vectors. Calculate which slp vector it is and the index
7790 within. */
7791 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
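      /* Worked example (numbers illustrative): with nunits == 4,
	 num_vec == 1, num_scalar == 2 and slp_index == 1, the last
	 occurrence is at pos = 1 * 4 - 2 + 1 = 3, i.e. the final lane of
	 the single vector (vec_entry 0, vec_index 3 below).  */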
7792
7793 /* Calculate which vector contains the result, and which lane of
7794 that vector we need. */
7795 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7796 {
7797 if (dump_enabled_p ())
7798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7799 "Cannot determine which vector holds the"
7800 " final result.\n");
7801 return false;
7802 }
7803 }
7804
7805 if (!vec_stmt)
7806 {
7807 /* No transformation required. */
7808 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7809 {
7810 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7811 OPTIMIZE_FOR_SPEED))
7812 {
7813 if (dump_enabled_p ())
7814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7815 "can't use a fully-masked loop because "
7816 "the target doesn't support extract last "
7817 "reduction.\n");
7818 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7819 }
7820 else if (slp_node)
7821 {
7822 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7824 "can't use a fully-masked loop because an "
7825 "SLP statement is live after the loop.\n");
7826 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7827 }
7828 else if (ncopies > 1)
7829 {
7830 if (dump_enabled_p ())
7831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7832 "can't use a fully-masked loop because"
7833 " ncopies is greater than 1.\n");
7834 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7835 }
7836 else
7837 {
7838 gcc_assert (ncopies == 1 && !slp_node);
7839 vect_record_loop_mask (loop_vinfo,
7840 &LOOP_VINFO_MASKS (loop_vinfo),
7841 1, vectype);
7842 }
7843 }
7844 return true;
7845 }
7846
7847 /* Use the lhs of the original scalar statement. */
7848 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7849
7850 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7851 : gimple_get_lhs (stmt);
7852 lhs_type = TREE_TYPE (lhs);
7853
7854 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7855 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7856 : TYPE_SIZE (TREE_TYPE (vectype)));
7857 vec_bitsize = TYPE_SIZE (vectype);
7858
7859 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7860 tree vec_lhs, bitstart;
7861 if (slp_node)
7862 {
7863 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7864
7865 /* Get the correct slp vectorized stmt. */
7866 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7867 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7868 vec_lhs = gimple_phi_result (phi);
7869 else
7870 vec_lhs = gimple_get_lhs (vec_stmt);
7871
7872 /* Get entry to use. */
7873 bitstart = bitsize_int (vec_index);
7874 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7875 }
7876 else
7877 {
7878 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7879 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7880 gcc_checking_assert (ncopies == 1
7881 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7882
7883 /* For multiple copies, get the last copy. */
7884 for (int i = 1; i < ncopies; ++i)
7885 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7886
7887 /* Get the last lane in the vector. */
7888 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7889 }
7890
7891 gimple_seq stmts = NULL;
7892 tree new_tree;
7893 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7894 {
7895 /* Emit:
7896
7897 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7898
7899 where VEC_LHS is the vectorized live-out result and MASK is
7900 the loop mask for the final iteration. */
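      /* As a sketch: if the live value is the scalar computed in the last
	 iteration that actually executes, the mask for the final vector
	 iteration is set exactly for the lanes whose iterations ran, so
	 EXTRACT_LAST returns the lane of VEC_LHS corresponding to that
	 last scalar iteration.  */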
7901 gcc_assert (ncopies == 1 && !slp_node);
7902 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7903 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7904 1, vectype, 0);
7905 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7906 scalar_type, mask, vec_lhs);
7907
7908 /* Convert the extracted vector element to the required scalar type. */
7909 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7910 }
7911 else
7912 {
7913 tree bftype = TREE_TYPE (vectype);
7914 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7915 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7916 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7917 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7918 &stmts, true, NULL_TREE);
7919 }
7920
7921 if (stmts)
7922 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7923
7924 /* Replace uses of lhs with the newly computed result. If the use stmt is
7925 a single-argument PHI, just replace all uses of the PHI result. This is
7926 necessary because the lcssa PHI defining lhs may precede the new stmt. */
7927 use_operand_p use_p;
7928 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7929 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7930 && !is_gimple_debug (use_stmt))
7931 {
7932 if (gimple_code (use_stmt) == GIMPLE_PHI
7933 && gimple_phi_num_args (use_stmt) == 1)
7934 {
7935 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7936 }
7937 else
7938 {
7939 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7940 SET_USE (use_p, new_tree);
7941 }
7942 update_stmt (use_stmt);
7943 }
7944
7945 return true;
7946 }
7947
7948 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7949
7950 static void
7951 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7952 {
7953 ssa_op_iter op_iter;
7954 imm_use_iterator imm_iter;
7955 def_operand_p def_p;
7956 gimple *ustmt;
7957
7958 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7959 {
7960 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7961 {
7962 basic_block bb;
7963
7964 if (!is_gimple_debug (ustmt))
7965 continue;
7966
7967 bb = gimple_bb (ustmt);
7968
7969 if (!flow_bb_inside_loop_p (loop, bb))
7970 {
7971 if (gimple_debug_bind_p (ustmt))
7972 {
7973 if (dump_enabled_p ())
7974 dump_printf_loc (MSG_NOTE, vect_location,
7975 "killing debug use\n");
7976
7977 gimple_debug_bind_reset_value (ustmt);
7978 update_stmt (ustmt);
7979 }
7980 else
7981 gcc_unreachable ();
7982 }
7983 }
7984 }
7985 }
7986
7987 /* Given the loop represented by LOOP_VINFO, return true if the computation of
7988 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7989 otherwise. */
7990
7991 static bool
7992 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7993 {
7994 /* Constant case. */
7995 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7996 {
7997 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7998 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7999
8000 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8001 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8002 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8003 return true;
8004 }
8005
8006 widest_int max;
8007 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8008 /* Check the upper bound of loop niters. */
8009 if (get_max_loop_iterations (loop, &max))
8010 {
8011 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8012 signop sgn = TYPE_SIGN (type);
8013 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8014 if (max < type_max)
8015 return true;
8016 }
8017 return false;
8018 }
8019
8020 /* Return a mask type with half as many elements as TYPE. */
8021
8022 tree
8023 vect_halve_mask_nunits (tree type)
8024 {
8025 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8026 return build_truth_vector_type (nunits, current_vector_size);
8027 }
8028
8029 /* Return a mask type with twice as many elements as TYPE. */
8030
8031 tree
8032 vect_double_mask_nunits (tree type)
8033 {
8034 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8035 return build_truth_vector_type (nunits, current_vector_size);
8036 }
8037
8038 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8039 contain a sequence of NVECTORS masks that each control a vector of type
8040 VECTYPE. */
8041
8042 void
8043 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8044 unsigned int nvectors, tree vectype)
8045 {
8046 gcc_assert (nvectors != 0);
8047 if (masks->length () < nvectors)
8048 masks->safe_grow_cleared (nvectors);
8049 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8050 /* The number of scalars per iteration and the number of vectors are
8051 both compile-time constants. */
8052 unsigned int nscalars_per_iter
8053 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8054 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8055 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8056 {
8057 rgm->max_nscalars_per_iter = nscalars_per_iter;
8058 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8059 }
8060 }
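/* For example (numbers illustrative): recording 2 vectors of a type with
   8 elements in a loop whose vectorization factor is 8 gives
   nscalars_per_iter = 2 * 8 / 8 = 2, i.e. each mask in this rgroup has to
   control two scalar values per scalar iteration.  */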
8061
8062 /* Given a complete set of masks MASKS, extract mask number INDEX
8063 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8064 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8065
8066 See the comment above vec_loop_masks for more details about the mask
8067 arrangement. */
8068
8069 tree
8070 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8071 unsigned int nvectors, tree vectype, unsigned int index)
8072 {
8073 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8074 tree mask_type = rgm->mask_type;
8075
8076 /* Populate the rgroup's mask array, if this is the first time we've
8077 used it. */
8078 if (rgm->masks.is_empty ())
8079 {
8080 rgm->masks.safe_grow_cleared (nvectors);
8081 for (unsigned int i = 0; i < nvectors; ++i)
8082 {
8083 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8084 /* Provide a dummy definition until the real one is available. */
8085 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8086 rgm->masks[i] = mask;
8087 }
8088 }
8089
8090 tree mask = rgm->masks[index];
8091 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8092 TYPE_VECTOR_SUBPARTS (vectype)))
8093 {
8094 /* A loop mask for data type X can be reused for data type Y
8095 if X has N times more elements than Y and if Y's elements
8096 are N times bigger than X's. In this case each sequence
8097 of N elements in the loop mask will be all-zero or all-one.
8098 We can then view-convert the mask so that each sequence of
8099 N elements is replaced by a single element. */
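      /* Concrete sketch (element sizes illustrative): a mask for a vector
	 of eight 16-bit elements can be reused for a vector of four
	 32-bit elements, since each 32-bit lane overlaps two 16-bit lanes
	 whose mask bits are guaranteed to be equal; the VIEW_CONVERT_EXPR
	 below collapses each such pair into a single mask element.  */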
8100 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8101 TYPE_VECTOR_SUBPARTS (vectype)));
8102 gimple_seq seq = NULL;
8103 mask_type = build_same_sized_truth_vector_type (vectype);
8104 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8105 if (seq)
8106 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8107 }
8108 return mask;
8109 }
8110
8111 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
8112 according to the estimated number of iterations of the vector loop. */
8113
8114 static void
8115 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8116 {
8117 edge preheader = loop_preheader_edge (loop);
8118 /* Reduce loop iterations by the vectorization factor. */
8119 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8120 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8121
8122 if (freq_h.nonzero_p ())
8123 {
8124 profile_probability p;
8125
8126 /* Avoid dropping loop body profile counter to 0 because of zero count
8127 in loop's preheader. */
8128 if (!(freq_e == profile_count::zero ()))
8129 freq_e = freq_e.force_nonzero ();
8130 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8131 scale_loop_frequencies (loop, p);
8132 }
8133
8134 edge exit_e = single_exit (loop);
8135 exit_e->probability = profile_probability::always ()
8136 .apply_scale (1, new_est_niter + 1);
8137
8138 edge exit_l = single_pred_edge (loop->latch);
8139 profile_probability prob = exit_l->probability;
8140 exit_l->probability = exit_e->probability.invert ();
8141 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8142 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8143 }
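/* Rough worked example for the scaling above (numbers illustrative): if
   new_est_niter comes out as 3 (e.g. a scalar loop of about 16 iterations
   vectorized with VF 4), the exit edge gets probability 1 / (3 + 1) = 1/4
   and the body counts are scaled so that the header executes about four
   times per entry through the preheader.  */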
8144
8145 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8146 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8147 stmt_vec_info. */
8148
8149 static void
8150 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8151 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8152 {
8153 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8154 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8155
8156 if (dump_enabled_p ())
8157 dump_printf_loc (MSG_NOTE, vect_location,
8158 "------>vectorizing statement: %G", stmt_info->stmt);
8159
8160 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8161 vect_loop_kill_debug_uses (loop, stmt_info);
8162
8163 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8164 && !STMT_VINFO_LIVE_P (stmt_info))
8165 return;
8166
8167 if (STMT_VINFO_VECTYPE (stmt_info))
8168 {
8169 poly_uint64 nunits
8170 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8171 if (!STMT_SLP_TYPE (stmt_info)
8172 && maybe_ne (nunits, vf)
8173 && dump_enabled_p ())
8174 /* For SLP, VF is set according to the unrolling factor, and not
8175 to the vector size, hence this diagnostic is not valid for SLP. */
8176 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8177 }
8178
8179 /* Pure SLP statements have already been vectorized. We still need
8180 to apply loop vectorization to hybrid SLP statements. */
8181 if (PURE_SLP_STMT (stmt_info))
8182 return;
8183
8184 if (dump_enabled_p ())
8185 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8186
8187 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8188 *seen_store = stmt_info;
8189 }
8190
8191 /* Function vect_transform_loop.
8192
8193 The analysis phase has determined that the loop is vectorizable.
8194 Vectorize the loop - create vectorized stmts to replace the scalar
8195 stmts in the loop, and update the loop exit condition.
8196 Returns the scalar epilogue loop, if any. */
8197
8198 struct loop *
8199 vect_transform_loop (loop_vec_info loop_vinfo)
8200 {
8201 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8202 struct loop *epilogue = NULL;
8203 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8204 int nbbs = loop->num_nodes;
8205 int i;
8206 tree niters_vector = NULL_TREE;
8207 tree step_vector = NULL_TREE;
8208 tree niters_vector_mult_vf = NULL_TREE;
8209 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8210 unsigned int lowest_vf = constant_lower_bound (vf);
8211 gimple *stmt;
8212 bool check_profitability = false;
8213 unsigned int th;
8214
8215 DUMP_VECT_SCOPE ("vec_transform_loop");
8216
8217 loop_vinfo->shared->check_datarefs ();
8218
8219 /* Use the more conservative vectorization threshold. If the number
8220 of iterations is constant, assume the cost check has been performed
8221 by our caller. If the threshold makes all loops profitable that
8222 run at least the (estimated) vectorization factor number of times,
8223 checking is pointless, too. */
8224 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8225 if (th >= vect_vf_for_cost (loop_vinfo)
8226 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8227 {
8228 if (dump_enabled_p ())
8229 dump_printf_loc (MSG_NOTE, vect_location,
8230 "Profitability threshold is %d loop iterations.\n",
8231 th);
8232 check_profitability = true;
8233 }
8234
8235 /* Make sure there exists a single-predecessor exit bb. Do this before
8236 versioning. */
8237 edge e = single_exit (loop);
8238 if (! single_pred_p (e->dest))
8239 {
8240 split_loop_exit_edge (e);
8241 if (dump_enabled_p ())
8242 dump_printf (MSG_NOTE, "split exit edge\n");
8243 }
8244
8245 /* Version the loop first, if required, so the profitability check
8246 comes first. */
8247
8248 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8249 {
8250 poly_uint64 versioning_threshold
8251 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8252 if (check_profitability
8253 && ordered_p (poly_uint64 (th), versioning_threshold))
8254 {
8255 versioning_threshold = ordered_max (poly_uint64 (th),
8256 versioning_threshold);
8257 check_profitability = false;
8258 }
8259 vect_loop_versioning (loop_vinfo, th, check_profitability,
8260 versioning_threshold);
8261 check_profitability = false;
8262 }
8263
8264 /* Make sure there exists a single-predecessor exit bb also on the
8265 scalar loop copy. Do this after versioning but before peeling
8266 so that the CFG structure is fine for both the scalar and the
8267 if-converted loop, and slpeel_duplicate_current_defs_from_edges
8268 sees matched loop-closed PHI nodes on the exit. */
8269 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8270 {
8271 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8272 if (! single_pred_p (e->dest))
8273 {
8274 split_loop_exit_edge (e);
8275 if (dump_enabled_p ())
8276 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8277 }
8278 }
8279
8280 tree niters = vect_build_loop_niters (loop_vinfo);
8281 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8282 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8283 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8284 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8285 &step_vector, &niters_vector_mult_vf, th,
8286 check_profitability, niters_no_overflow);
8287
8288 if (niters_vector == NULL_TREE)
8289 {
8290 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8291 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8292 && known_eq (lowest_vf, vf))
8293 {
8294 niters_vector
8295 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8296 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8297 step_vector = build_one_cst (TREE_TYPE (niters));
8298 }
8299 else
8300 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8301 &step_vector, niters_no_overflow);
8302 }
8303
8304 /* 1) Make sure the loop header has exactly two entries
8305 2) Make sure we have a preheader basic block. */
8306
8307 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8308
8309 split_edge (loop_preheader_edge (loop));
8310
8311 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8312 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8313 /* This will deal with any possible peeling. */
8314 vect_prepare_for_masked_peels (loop_vinfo);
8315
8316 /* Schedule the SLP instances first, then handle loop vectorization
8317 below. */
8318 if (!loop_vinfo->slp_instances.is_empty ())
8319 {
8320 DUMP_VECT_SCOPE ("scheduling SLP instances");
8321 vect_schedule_slp (loop_vinfo);
8322 }
8323
8324 /* FORNOW: the vectorizer supports only loops whose body consists
8325 of one basic block (header + empty latch). When the vectorizer
8326 supports more involved loop forms, the order in which the BBs are
8327 traversed will need to be reconsidered. */
8328
8329 for (i = 0; i < nbbs; i++)
8330 {
8331 basic_block bb = bbs[i];
8332 stmt_vec_info stmt_info;
8333
8334 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8335 gsi_next (&si))
8336 {
8337 gphi *phi = si.phi ();
8338 if (dump_enabled_p ())
8339 dump_printf_loc (MSG_NOTE, vect_location,
8340 "------>vectorizing phi: %G", phi);
8341 stmt_info = loop_vinfo->lookup_stmt (phi);
8342 if (!stmt_info)
8343 continue;
8344
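/* If the scalar result of this PHI will not be live after vectorization,
   kill any debug uses of it outside the loop so that stale scalar values
   do not survive into the debug info.  */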
8345 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8346 vect_loop_kill_debug_uses (loop, stmt_info);
8347
8348 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8349 && !STMT_VINFO_LIVE_P (stmt_info))
8350 continue;
8351
8352 if (STMT_VINFO_VECTYPE (stmt_info)
8353 && (maybe_ne
8354 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8355 && dump_enabled_p ())
8356 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8357
8358 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8359 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8360 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8361 && ! PURE_SLP_STMT (stmt_info))
8362 {
8363 if (dump_enabled_p ())
8364 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8365 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8366 }
8367 }
8368
8369 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8370 !gsi_end_p (si);)
8371 {
8372 stmt = gsi_stmt (si);
8373 /* During vectorization remove existing clobber stmts. */
8374 if (gimple_clobber_p (stmt))
8375 {
8376 unlink_stmt_vdef (stmt);
8377 gsi_remove (&si, true);
8378 release_defs (stmt);
8379 }
8380 else
8381 {
8382 stmt_info = loop_vinfo->lookup_stmt (stmt);
8383
8384 /* vector stmts created in the outer-loop during vectorization of
8385 stmts in an inner-loop may not have a stmt_info, and do not
8386 need to be vectorized. */
8387 stmt_vec_info seen_store = NULL;
8388 if (stmt_info)
8389 {
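/* If this statement was replaced by a pattern, vectorize the statements
   of the pattern definition sequence first and then the main pattern
   statement; the original statement is handled by the call to
   vect_transform_loop_stmt below.  */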
8390 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8391 {
8392 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8393 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8394 !gsi_end_p (subsi); gsi_next (&subsi))
8395 {
8396 stmt_vec_info pat_stmt_info
8397 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8398 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8399 &si, &seen_store);
8400 }
8401 stmt_vec_info pat_stmt_info
8402 = STMT_VINFO_RELATED_STMT (stmt_info);
8403 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8404 &seen_store);
8405 }
8406 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8407 &seen_store);
8408 }
8409 gsi_next (&si);
8410 if (seen_store)
8411 {
8412 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8413 /* Interleaving. The vectorization of the
8414 interleaving chain was completed - free all
8415 the stores in the chain. */
8416 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8417 else
8418 /* Free the attached stmt_vec_info and remove the stmt. */
8419 loop_vinfo->remove_stmt (stmt_info);
8420 }
8421 }
8422 }
8423
8424 /* Stub out scalar statements that must not survive vectorization.
8425 Doing this here helps with grouped statements, or statements that
8426 are involved in patterns. */
8427 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8428 !gsi_end_p (gsi); gsi_next (&gsi))
8429 {
8430 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8431 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8432 {
8433 tree lhs = gimple_get_lhs (call);
8434 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8435 {
8436 tree zero = build_zero_cst (TREE_TYPE (lhs));
8437 gimple *new_stmt = gimple_build_assign (lhs, zero);
8438 gsi_replace (&gsi, new_stmt, true);
8439 }
8440 }
8441 }
8442 } /* BBs in loop */
8443
8444 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8445 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8446 if (integer_onep (step_vector))
8447 niters_no_overflow = true;
8448 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8449 niters_vector_mult_vf, !niters_no_overflow);
8450
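/* Each iteration of the vector loop now covers roughly ASSUMED_VF scalar
   iterations, so scale the loop's profile information accordingly.  */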
8451 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8452 scale_profile_for_vect_loop (loop, assumed_vf);
8453
8454 /* True if the final iteration might not handle a full vector's
8455 worth of scalar iterations. */
8456 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8457 /* The minimum number of iterations performed by the epilogue. This
8458 is 1 when peeling for gaps because we always need a final scalar
8459 iteration. */
8460 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8461 /* +1 to convert latch counts to loop iteration counts,
8462 -min_epilogue_iters to remove iterations that cannot be performed
8463 by the vector code. */
8464 int bias_for_lowest = 1 - min_epilogue_iters;
8465 int bias_for_assumed = bias_for_lowest;
8466 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8467 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8468 {
8469 /* When the amount of peeling is known at compile time, the first
8470 iteration will have exactly alignment_npeels active elements.
8471 In the worst case it will have at least one. */
8472 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8473 bias_for_lowest += lowest_vf - min_first_active;
8474 bias_for_assumed += assumed_vf - min_first_active;
8475 }
8476 /* In these calculations the "- 1" converts loop iteration counts
8477 back to latch counts. */
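/* As a purely illustrative example (numbers made up): with a latch-count
   upper bound of 11 (i.e. 12 iterations), VF 4, no epilogue iterations
   and no masking, bias_for_lowest is 1 and the new bound is
   floor ((11 + 1) / 4) - 1 = 2 latch iterations of the vector loop.  */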
8478 if (loop->any_upper_bound)
8479 loop->nb_iterations_upper_bound
8480 = (final_iter_may_be_partial
8481 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8482 lowest_vf) - 1
8483 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8484 lowest_vf) - 1);
8485 if (loop->any_likely_upper_bound)
8486 loop->nb_iterations_likely_upper_bound
8487 = (final_iter_may_be_partial
8488 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8489 + bias_for_lowest, lowest_vf) - 1
8490 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8491 + bias_for_lowest, lowest_vf) - 1);
8492 if (loop->any_estimate)
8493 loop->nb_iterations_estimate
8494 = (final_iter_may_be_partial
8495 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8496 assumed_vf) - 1
8497 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8498 assumed_vf) - 1);
8499
8500 if (dump_enabled_p ())
8501 {
8502 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8503 {
8504 dump_printf_loc (MSG_NOTE, vect_location,
8505 "LOOP VECTORIZED\n");
8506 if (loop->inner)
8507 dump_printf_loc (MSG_NOTE, vect_location,
8508 "OUTER LOOP VECTORIZED\n");
8509 dump_printf (MSG_NOTE, "\n");
8510 }
8511 else
8512 {
8513 dump_printf_loc (MSG_NOTE, vect_location,
8514 "LOOP EPILOGUE VECTORIZED (VS=");
8515 dump_dec (MSG_NOTE, current_vector_size);
8516 dump_printf (MSG_NOTE, ")\n");
8517 }
8518 }
8519
8520 /* Free SLP instances here because otherwise stmt reference counting
8521 won't work. */
8522 slp_instance instance;
8523 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8524 vect_free_slp_instance (instance, true);
8525 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8526 /* Clear the safelen field since its value is invalid after vectorization,
8527 as the vectorized loop can have loop-carried dependencies. */
8528 loop->safelen = 0;
8529
8530 /* Don't vectorize the epilogue of an epilogue loop. */
8531 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8532 epilogue = NULL;
8533
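/* Also honor --param vect-epilogues-nomask: if it is zero, do not
   vectorize the epilogue loop.  */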
8534 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8535 epilogue = NULL;
8536
8537 if (epilogue)
8538 {
8539 auto_vector_sizes vector_sizes;
8540 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8541 unsigned int next_size = 0;
8542
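/* Choose the vector size to try for the epilogue loop: if the trip count
   is known, pick the first supported size that evenly divides the current
   one and for which the remaining scalar iterations roughly fill at least
   one vector; otherwise pick the first supported size that is no larger
   than the current vector size.  If no such size exists, drop the
   epilogue.  */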
8543 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8544 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8545 && known_eq (vf, lowest_vf))
8546 {
8547 unsigned int eiters
8548 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8549 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8550 eiters = eiters % lowest_vf;
8551 epilogue->nb_iterations_upper_bound = eiters - 1;
8552
8553 unsigned int ratio;
8554 while (next_size < vector_sizes.length ()
8555 && !(constant_multiple_p (current_vector_size,
8556 vector_sizes[next_size], &ratio)
8557 && eiters >= lowest_vf / ratio))
8558 next_size += 1;
8559 }
8560 else
8561 while (next_size < vector_sizes.length ()
8562 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8563 next_size += 1;
8564
8565 if (next_size == vector_sizes.length ())
8566 epilogue = NULL;
8567 }
8568
8569 if (epilogue)
8570 {
8571 epilogue->force_vectorize = loop->force_vectorize;
8572 epilogue->safelen = loop->safelen;
8573 epilogue->dont_vectorize = false;
8574
8575 /* We may need to if-convert the epilogue to vectorize it. */
8576 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8577 tree_if_conversion (epilogue);
8578 }
8579
8580 return epilogue;
8581 }
8582
8583 /* The code below performs a simple optimization - it reverts
8584 if-conversion for masked stores: if the mask of a store is zero,
8585 the store is not performed, and neither, if possible, are the
8586 statements producing the stored values. For example,
8587 for (i=0; i<n; i++)
8588 if (c[i])
8589 {
8590 p1[i] += 1;
8591 p2[i] = p3[i] + 2;
8592 }
8593 this transformation will produce the following semi-hammock:
8594
8595 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8596 {
8597 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8598 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8599 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8600 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8601 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8602 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8603 }
8604 */
8605
8606 void
8607 optimize_mask_stores (struct loop *loop)
8608 {
8609 basic_block *bbs = get_loop_body (loop);
8610 unsigned nbbs = loop->num_nodes;
8611 unsigned i;
8612 basic_block bb;
8613 struct loop *bb_loop;
8614 gimple_stmt_iterator gsi;
8615 gimple *stmt;
8616 auto_vec<gimple *> worklist;
8617
8618 vect_location = find_loop_location (loop);
8619 /* Collect all masked stores in the loop, if any. */
8620 for (i = 0; i < nbbs; i++)
8621 {
8622 bb = bbs[i];
8623 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8624 gsi_next (&gsi))
8625 {
8626 stmt = gsi_stmt (gsi);
8627 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8628 worklist.safe_push (stmt);
8629 }
8630 }
8631
8632 free (bbs);
8633 if (worklist.is_empty ())
8634 return;
8635
8636 /* Loop has masked stores. */
8637 while (!worklist.is_empty ())
8638 {
8639 gimple *last, *last_store;
8640 edge e, efalse;
8641 tree mask;
8642 basic_block store_bb, join_bb;
8643 gimple_stmt_iterator gsi_to;
8644 tree vdef, new_vdef;
8645 gphi *phi;
8646 tree vectype;
8647 tree zero;
8648
8649 last = worklist.pop ();
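/* The mask is argument 2 of the IFN_MASK_STORE call.  */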
8650 mask = gimple_call_arg (last, 2);
8651 bb = gimple_bb (last);
8652 /* Create then_bb and if-then structure in CFG, then_bb belongs to
8653 the same loop as if_bb. It could be different from LOOP when a
8654 two-level loop nest is vectorized and the mask_store belongs to the
8655 inner one. */
8656 e = split_block (bb, last);
8657 bb_loop = bb->loop_father;
8658 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8659 join_bb = e->dest;
8660 store_bb = create_empty_bb (bb);
8661 add_bb_to_loop (store_bb, bb_loop);
8662 e->flags = EDGE_TRUE_VALUE;
8663 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8664 /* Put STORE_BB on the unlikely path. */
8665 efalse->probability = profile_probability::unlikely ();
8666 store_bb->count = efalse->count ();
8667 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8668 if (dom_info_available_p (CDI_DOMINATORS))
8669 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8670 if (dump_enabled_p ())
8671 dump_printf_loc (MSG_NOTE, vect_location,
8672 "Create new block %d to sink mask stores.",
8673 store_bb->index);
8674 /* Create vector comparison with boolean result. */
8675 vectype = TREE_TYPE (mask);
8676 zero = build_zero_cst (vectype);
8677 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8678 gsi = gsi_last_bb (bb);
8679 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8680 /* Create new PHI node for vdef of the last masked store:
8681 .MEM_2 = VDEF <.MEM_1>
8682 will be converted to
8683 .MEM.3 = VDEF <.MEM_1>
8684 and new PHI node will be created in join bb
8685 .MEM_2 = PHI <.MEM_1, .MEM_3>
8686 */
8687 vdef = gimple_vdef (last);
8688 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8689 gimple_set_vdef (last, new_vdef);
8690 phi = create_phi_node (vdef, join_bb);
8691 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8692
8693 /* Put all masked stores with the same mask into STORE_BB if possible. */
8694 while (true)
8695 {
8696 gimple_stmt_iterator gsi_from;
8697 gimple *stmt1 = NULL;
8698
8699 /* Move masked store to STORE_BB. */
8700 last_store = last;
8701 gsi = gsi_for_stmt (last);
8702 gsi_from = gsi;
8703 /* Shift GSI to the previous stmt for further traversal. */
8704 gsi_prev (&gsi);
8705 gsi_to = gsi_start_bb (store_bb);
8706 gsi_move_before (&gsi_from, &gsi_to);
8707 /* Set up GSI_TO at the start of the now non-empty block. */
8708 gsi_to = gsi_start_bb (store_bb);
8709 if (dump_enabled_p ())
8710 dump_printf_loc (MSG_NOTE, vect_location,
8711 "Move stmt to created bb\n%G", last);
8712 /* Move all stored value producers if possible. */
8713 while (!gsi_end_p (gsi))
8714 {
8715 tree lhs;
8716 imm_use_iterator imm_iter;
8717 use_operand_p use_p;
8718 bool res;
8719
8720 /* Skip debug statements. */
8721 if (is_gimple_debug (gsi_stmt (gsi)))
8722 {
8723 gsi_prev (&gsi);
8724 continue;
8725 }
8726 stmt1 = gsi_stmt (gsi);
8727 /* Do not consider statements writing to memory or having
8728 a volatile operand. */
8729 if (gimple_vdef (stmt1)
8730 || gimple_has_volatile_ops (stmt1))
8731 break;
8732 gsi_from = gsi;
8733 gsi_prev (&gsi);
8734 lhs = gimple_get_lhs (stmt1);
8735 if (!lhs)
8736 break;
8737
8738 /* The LHS of a vectorized stmt must be an SSA_NAME. */
8739 if (TREE_CODE (lhs) != SSA_NAME)
8740 break;
8741
8742 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8743 {
8744 /* Remove dead scalar statement. */
8745 if (has_zero_uses (lhs))
8746 {
8747 gsi_remove (&gsi_from, true);
8748 continue;
8749 }
8750 }
8751
8752 /* Check that LHS does not have uses outside of STORE_BB. */
8753 res = true;
8754 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8755 {
8756 gimple *use_stmt;
8757 use_stmt = USE_STMT (use_p);
8758 if (is_gimple_debug (use_stmt))
8759 continue;
8760 if (gimple_bb (use_stmt) != store_bb)
8761 {
8762 res = false;
8763 break;
8764 }
8765 }
8766 if (!res)
8767 break;
8768
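/* Only sink statements whose memory state (VUSE) matches that of the
   masked store being moved.  */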
8769 if (gimple_vuse (stmt1)
8770 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8771 break;
8772
8773 /* Can move STMT1 to STORE_BB. */
8774 if (dump_enabled_p ())
8775 dump_printf_loc (MSG_NOTE, vect_location,
8776 "Move stmt to created bb\n%G", stmt1);
8777 gsi_move_before (&gsi_from, &gsi_to);
8778 /* Shift GSI_TO for further insertion. */
8779 gsi_prev (&gsi_to);
8780 }
8781 /* Put other masked stores with the same mask into STORE_BB. */
8782 if (worklist.is_empty ()
8783 || gimple_call_arg (worklist.last (), 2) != mask
8784 || worklist.last () != stmt1)
8785 break;
8786 last = worklist.pop ();
8787 }
8788 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8789 }
8790 }