1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1) and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case in which a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
221 return false;
222
223 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
224 && STMT_VINFO_RELATED_STMT (stmt_info))
225 {
226 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
227 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228
229 /* If a pattern statement has def stmts, analyze them too. */
230 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
231 !gsi_end_p (si); gsi_next (&si))
232 {
233 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "==> examining pattern def stmt: %G",
237 def_stmt_info->stmt);
238 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
239 vf, mask_producers))
240 return false;
241 }
242
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location,
245 "==> examining pattern statement: %G",
246 stmt_info->stmt);
247 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
248 return false;
249 }
250
251 return true;
252 }
253
254 /* Function vect_determine_vectorization_factor
255
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
261
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
266
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
271 }
272
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
276 }
277 */
278
279 static bool
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
291 auto_vec<stmt_vec_info> mask_producers;
292
293 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294
295 for (i = 0; i < nbbs; i++)
296 {
297 basic_block bb = bbs[i];
298
299 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
300 gsi_next (&si))
301 {
302 phi = si.phi ();
303 stmt_info = loop_vinfo->lookup_stmt (phi);
304 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
306 phi);
307
308 gcc_assert (stmt_info);
309
310 if (STMT_VINFO_RELEVANT_P (stmt_info)
311 || STMT_VINFO_LIVE_P (stmt_info))
312 {
313 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
314 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315
316 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "get vectype for scalar type: %T\n",
319 scalar_type);
320
321 vectype = get_vectype_for_scalar_type (scalar_type);
322 if (!vectype)
323 {
324 if (dump_enabled_p ())
325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 return false;
330 }
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
332
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
336
337 if (dump_enabled_p ())
338 {
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
342 }
343
344 vect_update_max_nunits (&vectorization_factor, vectype);
345 }
346 }
347
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
350 {
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
353 &mask_producers))
354 return false;
355 }
356 }
357
358 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
359 if (dump_enabled_p ())
360 {
361 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
362 dump_dec (MSG_NOTE, vectorization_factor);
363 dump_printf (MSG_NOTE, "\n");
364 }
365
366 if (known_le (vectorization_factor, 1U))
367 {
368 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: unsupported data-type\n");
371 return false;
372 }
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374
375 for (i = 0; i < mask_producers.length (); i++)
376 {
377 stmt_info = mask_producers[i];
378 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
379 if (!mask_type)
380 return false;
381 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
382 }
383
384 return true;
385 }
386
387
388 /* Function vect_is_simple_iv_evolution.
389
390 FORNOW: A simple evolution of an induction variable in the loop is
391 considered a polynomial evolution. */
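/* For instance (illustrative only): an access function {0, +, 4}_1 is a
   simple IV with INIT 0 and STEP 4, whereas {{0, +, 1}_1, +, 1}_1 has an
   evolution part that is itself a chrec, so it is rejected below as not
   "simple".  */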
392
393 static bool
394 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
395 tree * step)
396 {
397 tree init_expr;
398 tree step_expr;
399 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
400 basic_block bb;
401
402 /* When there is no evolution in this loop, the evolution function
403 is not "simple". */
404 if (evolution_part == NULL_TREE)
405 return false;
406
407 /* When the evolution is a polynomial of degree >= 2
408 the evolution function is not "simple". */
409 if (tree_is_chrec (evolution_part))
410 return false;
411
412 step_expr = evolution_part;
413 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
414
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
417 step_expr, init_expr);
418
419 *init = init_expr;
420 *step = step_expr;
421
422 if (TREE_CODE (step_expr) != INTEGER_CST
423 && (TREE_CODE (step_expr) != SSA_NAME
424 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
425 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
426 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
427 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
428 || !flag_associative_math)))
429 && (TREE_CODE (step_expr) != REAL_CST
430 || !flag_associative_math))
431 {
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
434 "step unknown.\n");
435 return false;
436 }
437
438 return true;
439 }
440
441 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
442 what we are assuming is a double reduction. For example, given
443 a structure like this:
444
445 outer1:
446 x_1 = PHI <x_4(outer2), ...>;
447 ...
448
449 inner:
450 x_2 = PHI <x_1(outer1), ...>;
451 ...
452 x_3 = ...;
453 ...
454
455 outer2:
456 x_4 = PHI <x_3(inner)>;
457 ...
458
459 outer loop analysis would treat x_1 as a double reduction phi and
460 this function would then return true for x_2. */
461
462 static bool
463 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
464 {
465 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
466 use_operand_p use_p;
467 ssa_op_iter op_iter;
468 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
469 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
470 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
471 return true;
472 return false;
473 }
474
475 /* Function vect_analyze_scalar_cycles_1.
476
477 Examine the cross iteration def-use cycles of scalar variables
478 in LOOP. LOOP_VINFO represents the loop that is now being
479 considered for vectorization (can be LOOP, or an outer-loop
480 enclosing LOOP). */
481
482 static void
483 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
484 {
485 basic_block bb = loop->header;
486 tree init, step;
487 auto_vec<stmt_vec_info, 64> worklist;
488 gphi_iterator gsi;
489 bool double_reduc;
490
491 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
492
493 /* First - identify all inductions. Reduction detection assumes that all the
494 inductions have been identified, therefore, this order must not be
495 changed. */
496 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
497 {
498 gphi *phi = gsi.phi ();
499 tree access_fn = NULL;
500 tree def = PHI_RESULT (phi);
501 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
502
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
505
506 /* Skip virtual phi's. The data dependences that are associated with
507 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
508 if (virtual_operand_p (def))
509 continue;
510
511 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
512
513 /* Analyze the evolution function. */
514 access_fn = analyze_scalar_evolution (loop, def);
515 if (access_fn)
516 {
517 STRIP_NOPS (access_fn);
518 if (dump_enabled_p ())
519 dump_printf_loc (MSG_NOTE, vect_location,
520 "Access function of PHI: %T\n", access_fn);
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
525 }
526
527 if (!access_fn
528 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
529 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
530 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
531 && TREE_CODE (step) != INTEGER_CST))
532 {
533 worklist.safe_push (stmt_vinfo);
534 continue;
535 }
536
537 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
538 != NULL_TREE);
539 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540
541 if (dump_enabled_p ())
542 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
543 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
544 }
545
546
547 /* Second - identify all reductions and nested cycles. */
548 while (worklist.length () > 0)
549 {
550 stmt_vec_info stmt_vinfo = worklist.pop ();
551 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
552 tree def = PHI_RESULT (phi);
553
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
556
557 gcc_assert (!virtual_operand_p (def)
558 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559
560 stmt_vec_info reduc_stmt_info
561 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
562 &double_reduc, false);
563 if (reduc_stmt_info)
564 {
565 if (double_reduc)
566 {
567 if (dump_enabled_p ())
568 dump_printf_loc (MSG_NOTE, vect_location,
569 "Detected double reduction.\n");
570
571 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
572 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
573 = vect_double_reduction_def;
574 }
575 else
576 {
577 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
578 {
579 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location,
581 "Detected vectorizable nested cycle.\n");
582
583 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
584 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
585 }
586 else
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_NOTE, vect_location,
590 "Detected reduction.\n");
591
592 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
593 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
594 /* Store the reduction cycles for possible vectorization in
595 loop-aware SLP if it was not detected as a reduction
596 chain. */
597 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
598 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
599 (reduc_stmt_info);
600 }
601 }
602 }
603 else
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Unknown def-use cycle pattern.\n");
607 }
608 }
609
610
611 /* Function vect_analyze_scalar_cycles.
612
613 Examine the cross iteration def-use cycles of scalar variables, by
614 analyzing the loop-header PHIs of scalar variables. Classify each
615 cycle as one of the following: invariant, induction, reduction, unknown.
616 We do that for the loop represented by LOOP_VINFO, and also for its
617 inner loop, if it exists.
618 Examples for scalar cycles:
619
620 Example1: reduction:
621
622 loop1:
623 for (i=0; i<N; i++)
624 sum += a[i];
625
626 Example2: induction:
627
628 loop2:
629 for (i=0; i<N; i++)
630 a[i] = i; */
631
632 static void
633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 {
635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636
637 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638
639 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
640 Reductions in such an inner loop therefore have different properties than
641 the reductions in the nest that gets vectorized:
642 1. When vectorized, they are executed in the same order as in the original
643 scalar loop, so we can't change the order of computation when
644 vectorizing them.
645 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
646 current checks are too strict. */
647
648 if (loop->inner)
649 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
650 }
651
652 /* Transfer group and reduction information from STMT_INFO to its
653 pattern stmt. */
654
655 static void
656 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
657 {
658 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
659 stmt_vec_info stmtp;
660 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
661 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
662 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
663 do
664 {
665 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
666 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
667 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
668 if (stmt_info)
669 REDUC_GROUP_NEXT_ELEMENT (stmtp)
670 = STMT_VINFO_RELATED_STMT (stmt_info);
671 }
672 while (stmt_info);
673 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
674 }
675
676 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677
678 static void
679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 {
681 stmt_vec_info first;
682 unsigned i;
683
684 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
685 if (STMT_VINFO_IN_PATTERN_P (first))
686 {
687 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
688 while (next)
689 {
690 if (! STMT_VINFO_IN_PATTERN_P (next))
691 break;
692 next = REDUC_GROUP_NEXT_ELEMENT (next);
693 }
694 /* If not all stmts in the chain are patterns, try to handle
695 the chain without patterns. */
696 if (! next)
697 {
698 vect_fixup_reduc_chain (first);
699 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
700 = STMT_VINFO_RELATED_STMT (first);
701 }
702 }
703 }
704
705 /* Function vect_get_loop_niters.
706
707 Determine how many iterations the loop executes and place it
708 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
709 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
710 niter information holds in ASSUMPTIONS.
711
712 Return the loop exit condition. */
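/* Illustrative example: for a loop such as

     for (i = 0; i < n; i++)
       a[i] = 0;

   with n > 0, the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 while NUMBER_OF_ITERATIONS (the number of header executions)
   is n.  */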
713
714
715 static gcond *
716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
717 tree *number_of_iterations, tree *number_of_iterationsm1)
718 {
719 edge exit = single_exit (loop);
720 struct tree_niter_desc niter_desc;
721 tree niter_assumptions, niter, may_be_zero;
722 gcond *cond = get_loop_exit_condition (loop);
723
724 *assumptions = boolean_true_node;
725 *number_of_iterationsm1 = chrec_dont_know;
726 *number_of_iterations = chrec_dont_know;
727 DUMP_VECT_SCOPE ("get_loop_niters");
728
729 if (!exit)
730 return cond;
731
732 niter = chrec_dont_know;
733 may_be_zero = NULL_TREE;
734 niter_assumptions = boolean_true_node;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const struct loop *const loop = (const struct loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 slp_unrolling_factor (1),
826 single_scalar_iteration_cost (0),
827 vectorizable (false),
828 can_fully_mask_p (true),
829 fully_masked_p (false),
830 peeling_for_gaps (false),
831 peeling_for_niter (false),
832 operands_swapped (false),
833 no_data_dependencies (false),
834 has_mask_store (false),
835 scalar_loop (NULL),
836 orig_loop_info (NULL)
837 {
838 /* CHECKME: We want to visit all BBs before their successors (except for
839 latch blocks, for which this assertion wouldn't hold). In the simple
840 case of the loop forms we allow, a dfs order of the BBs would be the same
841 as reversed postorder traversal, so we are safe. */
842
843 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
844 bbs, loop->num_nodes, loop);
845 gcc_assert (nbbs == loop->num_nodes);
846
847 for (unsigned int i = 0; i < nbbs; i++)
848 {
849 basic_block bb = bbs[i];
850 gimple_stmt_iterator si;
851
852 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
853 {
854 gimple *phi = gsi_stmt (si);
855 gimple_set_uid (phi, 0);
856 add_stmt (phi);
857 }
858
859 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
860 {
861 gimple *stmt = gsi_stmt (si);
862 gimple_set_uid (stmt, 0);
863 add_stmt (stmt);
864 }
865 }
866 }
867
868 /* Free all levels of MASKS. */
869
870 void
871 release_vec_loop_masks (vec_loop_masks *masks)
872 {
873 rgroup_masks *rgm;
874 unsigned int i;
875 FOR_EACH_VEC_ELT (*masks, i, rgm)
876 rgm->masks.release ();
877 masks->release ();
878 }
879
880 /* Free all memory used by the _loop_vec_info, as well as all the
881 stmt_vec_info structs of all the stmts in the loop. */
882
883 _loop_vec_info::~_loop_vec_info ()
884 {
885 int nbbs;
886 gimple_stmt_iterator si;
887 int j;
888
889 nbbs = loop->num_nodes;
890 for (j = 0; j < nbbs; j++)
891 {
892 basic_block bb = bbs[j];
893 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
894 {
895 gimple *stmt = gsi_stmt (si);
896
897 /* We may have broken canonical form by moving a constant
898 into RHS1 of a commutative op. Fix such occurrences. */
899 if (operands_swapped && is_gimple_assign (stmt))
900 {
901 enum tree_code code = gimple_assign_rhs_code (stmt);
902
903 if ((code == PLUS_EXPR
904 || code == POINTER_PLUS_EXPR
905 || code == MULT_EXPR)
906 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
907 swap_ssa_operands (stmt,
908 gimple_assign_rhs1_ptr (stmt),
909 gimple_assign_rhs2_ptr (stmt));
910 else if (code == COND_EXPR
911 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
912 {
913 tree cond_expr = gimple_assign_rhs1 (stmt);
914 enum tree_code cond_code = TREE_CODE (cond_expr);
915
916 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
917 {
918 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
919 0));
920 cond_code = invert_tree_comparison (cond_code,
921 honor_nans);
922 if (cond_code != ERROR_MARK)
923 {
924 TREE_SET_CODE (cond_expr, cond_code);
925 swap_ssa_operands (stmt,
926 gimple_assign_rhs2_ptr (stmt),
927 gimple_assign_rhs3_ptr (stmt));
928 }
929 }
930 }
931 }
932 gsi_next (&si);
933 }
934 }
935
936 free (bbs);
937
938 release_vec_loop_masks (&masks);
939 delete ivexpr_map;
940
941 loop->aux = NULL;
942 }
943
944 /* Return an invariant or register for EXPR and emit necessary
945 computations in the LOOP_VINFO loop preheader. */
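/* Illustrative sketch of the behaviour (the expression below is
   hypothetical): asked for something like n_5 * 16, this gimplifies the
   computation into a fresh SSA name, inserts the defining statements on
   the preheader edge, and caches the result so that a later request for
   the same expression reuses the same SSA name.  */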
946
947 tree
948 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 {
950 if (is_gimple_reg (expr)
951 || is_gimple_min_invariant (expr))
952 return expr;
953
954 if (! loop_vinfo->ivexpr_map)
955 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
956 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
957 if (! cached)
958 {
959 gimple_seq stmts = NULL;
960 cached = force_gimple_operand (unshare_expr (expr),
961 &stmts, true, NULL_TREE);
962 if (stmts)
963 {
964 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
965 gsi_insert_seq_on_edge_immediate (e, stmts);
966 }
967 }
968 return cached;
969 }
970
971 /* Return true if we can use CMP_TYPE as the comparison type to produce
972 all masks required to mask LOOP_VINFO. */
973
974 static bool
975 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 {
977 rgroup_masks *rgm;
978 unsigned int i;
979 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
980 if (rgm->mask_type != NULL_TREE
981 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
982 cmp_type, rgm->mask_type,
983 OPTIMIZE_FOR_SPEED))
984 return false;
985 return true;
986 }
987
988 /* Return the maximum number of scalars per iteration over all the
989 rgroups in LOOP_VINFO. */
990
991 static unsigned int
992 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 {
994 unsigned int res = 1;
995 unsigned int i;
996 rgroup_masks *rgm;
997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
998 res = MAX (res, rgm->max_nscalars_per_iter);
999 return res;
1000 }
1001
1002 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1003 whether we can actually generate the masks required. Return true if so,
1004 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
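/* Worked example (hypothetical numbers): if the loop runs at most 1000
   iterations and the largest rgroup needs 2 scalars per iteration, then
   max_ni is 2000 and min_ni_width is wi::min_precision (2000, UNSIGNED),
   i.e. 11 bits, so the first integer mode of at least 11 bits for which
   WHILE_ULT is supported (e.g. a 16-bit type) can serve as the comparison
   type, although the search continues up to Pmode as noted below.  */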
1005
1006 static bool
1007 vect_verify_full_masking (loop_vec_info loop_vinfo)
1008 {
1009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1010 unsigned int min_ni_width;
1011
1012 /* Use a normal loop if there are no statements that need masking.
1013 This only happens in rare degenerate cases: it means that the loop
1014 has no loads, no stores, and no live-out values. */
1015 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1016 return false;
1017
1018 /* Get the maximum number of iterations that is representable
1019 in the counter type. */
1020 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022
1023 /* Get a more refined estimate for the number of iterations. */
1024 widest_int max_back_edges;
1025 if (max_loop_iterations (loop, &max_back_edges))
1026 max_ni = wi::smin (max_ni, max_back_edges + 1);
1027
1028 /* Account for rgroup masks, in which each bit is replicated N times. */
1029 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1030
1031 /* Work out how many bits we need to represent the limit. */
1032 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1033
1034 /* Find a scalar mode for which WHILE_ULT is supported. */
1035 opt_scalar_int_mode cmp_mode_iter;
1036 tree cmp_type = NULL_TREE;
1037 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1038 {
1039 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1040 if (cmp_bits >= min_ni_width
1041 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1042 {
1043 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1044 if (this_type
1045 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1046 {
1047 /* Although we could stop as soon as we find a valid mode,
1048 it's often better to continue until we hit Pmode, since the
1049 operands to the WHILE are more likely to be reusable in
1050 address calculations. */
1051 cmp_type = this_type;
1052 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1053 break;
1054 }
1055 }
1056 }
1057
1058 if (!cmp_type)
1059 return false;
1060
1061 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1062 return true;
1063 }
1064
1065 /* Calculate the cost of one scalar iteration of the loop. */
1066 static void
1067 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1068 {
1069 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1070 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1071 int nbbs = loop->num_nodes, factor;
1072 int innerloop_iters, i;
1073
1074 /* Gather costs for statements in the scalar loop. */
1075
1076 /* FORNOW. */
1077 innerloop_iters = 1;
1078 if (loop->inner)
1079 innerloop_iters = 50; /* FIXME */
1080
1081 for (i = 0; i < nbbs; i++)
1082 {
1083 gimple_stmt_iterator si;
1084 basic_block bb = bbs[i];
1085
1086 if (bb->loop_father == loop->inner)
1087 factor = innerloop_iters;
1088 else
1089 factor = 1;
1090
1091 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1092 {
1093 gimple *stmt = gsi_stmt (si);
1094 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1095
1096 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1097 continue;
1098
1099 /* Skip stmts that are not vectorized inside the loop. */
1100 if (stmt_info
1101 && !STMT_VINFO_RELEVANT_P (stmt_info)
1102 && (!STMT_VINFO_LIVE_P (stmt_info)
1103 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1104 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1105 continue;
1106
1107 vect_cost_for_stmt kind;
1108 if (STMT_VINFO_DATA_REF (stmt_info))
1109 {
1110 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1111 kind = scalar_load;
1112 else
1113 kind = scalar_store;
1114 }
1115 else
1116 kind = scalar_stmt;
1117
1118 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1119 factor, kind, stmt_info, 0, vect_prologue);
1120 }
1121 }
1122
1123 /* Now accumulate cost. */
1124 void *target_cost_data = init_cost (loop);
1125 stmt_info_for_cost *si;
1126 int j;
1127 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1128 j, si)
1129 (void) add_stmt_cost (target_cost_data, si->count,
1130 si->kind, si->stmt_info, si->misalign,
1131 vect_body);
1132 unsigned dummy, body_cost = 0;
1133 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1134 destroy_cost_data (target_cost_data);
1135 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1136 }
1137
1138
1139 /* Function vect_analyze_loop_form_1.
1140
1141 Verify that certain CFG restrictions hold, including:
1142 - the loop has a pre-header
1143 - the loop has a single entry and exit
1144 - the loop exit condition is simple enough
1145 - the number of iterations can be analyzed, i.e., a countable loop. The
1146 niter could be analyzed under some assumptions. */
1147
1148 bool
1149 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1150 tree *assumptions, tree *number_of_iterationsm1,
1151 tree *number_of_iterations, gcond **inner_loop_cond)
1152 {
1153 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1154
1155 /* Different restrictions apply when we are considering an inner-most loop,
1156 vs. an outer (nested) loop.
1157 (FORNOW. May want to relax some of these restrictions in the future). */
1158
1159 if (!loop->inner)
1160 {
1161 /* Inner-most loop. We currently require that the number of BBs is
1162 exactly 2 (the header and latch). Vectorizable inner-most loops
1163 look like this:
1164
1165 (pre-header)
1166 |
1167 header <--------+
1168 | | |
1169 | +--> latch --+
1170 |
1171 (exit-bb) */
1172
1173 if (loop->num_nodes != 2)
1174 {
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 "not vectorized: control flow in loop.\n");
1178 return false;
1179 }
1180
1181 if (empty_block_p (loop->header))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "not vectorized: empty loop.\n");
1186 return false;
1187 }
1188 }
1189 else
1190 {
1191 struct loop *innerloop = loop->inner;
1192 edge entryedge;
1193
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1197
1198 (pre-header)
1199 |
1200 header <---+
1201 | |
1202 inner-loop |
1203 | |
1204 tail ------+
1205 |
1206 (exit-bb)
1207
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1210
1211 if ((loop->inner)->inner || (loop->inner)->next)
1212 {
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "not vectorized: multiple nested loops.\n");
1216 return false;
1217 }
1218
1219 if (loop->num_nodes != 5)
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "not vectorized: control flow in loop.\n");
1224 return false;
1225 }
1226
1227 entryedge = loop_preheader_edge (innerloop);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1231 {
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 return false;
1236 }
1237
1238 /* Analyze the inner-loop. */
1239 tree inner_niterm1, inner_niter, inner_assumptions;
1240 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1241 &inner_assumptions, &inner_niterm1,
1242 &inner_niter, NULL)
1243 /* Don't support analyzing niter under assumptions for inner
1244 loop. */
1245 || !integer_onep (inner_assumptions))
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: Bad inner loop.\n");
1250 return false;
1251 }
1252
1253 if (!expr_invariant_in_loop_p (loop, inner_niter))
1254 {
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257 "not vectorized: inner-loop count not"
1258 " invariant.\n");
1259 return false;
1260 }
1261
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_NOTE, vect_location,
1264 "Considering outer-loop vectorization.\n");
1265 }
1266
1267 if (!single_exit (loop)
1268 || EDGE_COUNT (loop->header->preds) != 2)
1269 {
1270 if (dump_enabled_p ())
1271 {
1272 if (!single_exit (loop))
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274 "not vectorized: multiple exits.\n");
1275 else if (EDGE_COUNT (loop->header->preds) != 2)
1276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277 "not vectorized: too many incoming edges.\n");
1278 }
1279 return false;
1280 }
1281
1282 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1283 that the loop is represented as a do-while (with a proper if-guard
1284 before the loop if needed), where the loop header contains all the
1285 executable statements, and the latch is empty. */
1286 if (!empty_block_p (loop->latch)
1287 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1288 {
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: latch block not empty.\n");
1292 return false;
1293 }
1294
1295 /* Make sure the exit is not abnormal. */
1296 edge e = single_exit (loop);
1297 if (e->flags & EDGE_ABNORMAL)
1298 {
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "not vectorized: abnormal loop exit edge.\n");
1302 return false;
1303 }
1304
1305 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1306 number_of_iterationsm1);
1307 if (!*loop_cond)
1308 {
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "not vectorized: complicated exit condition.\n");
1312 return false;
1313 }
1314
1315 if (integer_zerop (*assumptions)
1316 || !*number_of_iterations
1317 || chrec_contains_undetermined (*number_of_iterations))
1318 {
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321 "not vectorized: number of iterations cannot be "
1322 "computed.\n");
1323 return false;
1324 }
1325
1326 if (integer_zerop (*number_of_iterations))
1327 {
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "not vectorized: number of iterations = 0.\n");
1331 return false;
1332 }
1333
1334 return true;
1335 }
1336
1337 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1338
1339 loop_vec_info
1340 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1341 {
1342 tree assumptions, number_of_iterations, number_of_iterationsm1;
1343 gcond *loop_cond, *inner_loop_cond = NULL;
1344
1345 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1346 &assumptions, &number_of_iterationsm1,
1347 &number_of_iterations, &inner_loop_cond))
1348 return NULL;
1349
1350 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1351 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1352 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1353 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1354 if (!integer_onep (assumptions))
1355 {
1356 /* We consider vectorizing this loop by versioning it under
1357 some assumptions. In order to do this, we need to clear
1358 existing information computed by scev and niter analyzer. */
1359 scev_reset_htab ();
1360 free_numbers_of_iterations_estimates (loop);
1361 /* Also set a flag for this loop so that subsequent scev and niter
1362 analyses are done under the assumptions. */
1363 loop_constraint_set (loop, LOOP_C_FINITE);
1364 /* Also record the assumptions for versioning. */
1365 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1366 }
1367
1368 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1369 {
1370 if (dump_enabled_p ())
1371 {
1372 dump_printf_loc (MSG_NOTE, vect_location,
1373 "Symbolic number of iterations is ");
1374 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1375 dump_printf (MSG_NOTE, "\n");
1376 }
1377 }
1378
1379 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1380 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1381 if (inner_loop_cond)
1382 {
1383 stmt_vec_info inner_loop_cond_info
1384 = loop_vinfo->lookup_stmt (inner_loop_cond);
1385 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1386 }
1387
1388 gcc_assert (!loop->aux);
1389 loop->aux = loop_vinfo;
1390 return loop_vinfo;
1391 }
1392
1393
1394
1395 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1396 statements, update the vectorization factor. */
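/* Illustrative example (hypothetical factors): if the loop-based
   vectorization factor is 4 but the SLP instances need an unrolling
   factor of 8, the updated vectorization factor is their least common
   multiple, 8; if instead every relevant stmt is covered by SLP, the SLP
   unrolling factor is used directly.  */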
1397
1398 static void
1399 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1400 {
1401 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1403 int nbbs = loop->num_nodes;
1404 poly_uint64 vectorization_factor;
1405 int i;
1406
1407 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1408
1409 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1410 gcc_assert (known_ne (vectorization_factor, 0U));
1411
1412 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1413 vectorization factor of the loop is the unrolling factor required by
1414 the SLP instances. If that unrolling factor is 1, we say that we
1415 perform pure SLP on the loop - cross-iteration parallelism is not
1416 exploited. */
1417 bool only_slp_in_loop = true;
1418 for (i = 0; i < nbbs; i++)
1419 {
1420 basic_block bb = bbs[i];
1421 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1422 gsi_next (&si))
1423 {
1424 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1425 stmt_info = vect_stmt_to_vectorize (stmt_info);
1426 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1427 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1428 && !PURE_SLP_STMT (stmt_info))
1429 /* STMT needs both SLP and loop-based vectorization. */
1430 only_slp_in_loop = false;
1431 }
1432 }
1433
1434 if (only_slp_in_loop)
1435 {
1436 dump_printf_loc (MSG_NOTE, vect_location,
1437 "Loop contains only SLP stmts\n");
1438 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1439 }
1440 else
1441 {
1442 dump_printf_loc (MSG_NOTE, vect_location,
1443 "Loop contains SLP and non-SLP stmts\n");
1444 /* Both the vectorization factor and unroll factor have the form
1445 current_vector_size * X for some rational X, so they must have
1446 a common multiple. */
1447 vectorization_factor
1448 = force_common_multiple (vectorization_factor,
1449 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1450 }
1451
1452 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1453 if (dump_enabled_p ())
1454 {
1455 dump_printf_loc (MSG_NOTE, vect_location,
1456 "Updating vectorization factor to ");
1457 dump_dec (MSG_NOTE, vectorization_factor);
1458 dump_printf (MSG_NOTE, ".\n");
1459 }
1460 }
1461
1462 /* Return true if STMT_INFO describes a double reduction phi and if
1463 the other phi in the reduction is also relevant for vectorization.
1464 This rejects cases such as:
1465
1466 outer1:
1467 x_1 = PHI <x_3(outer2), ...>;
1468 ...
1469
1470 inner:
1471 x_2 = ...;
1472 ...
1473
1474 outer2:
1475 x_3 = PHI <x_2(inner)>;
1476
1477 if nothing in x_2 or elsewhere makes x_1 relevant. */
1478
1479 static bool
1480 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1481 {
1482 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1483 return false;
1484
1485 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1486 }
1487
1488 /* Function vect_analyze_loop_operations.
1489
1490 Scan the loop stmts and make sure they are all vectorizable. */
1491
1492 static bool
1493 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1494 {
1495 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1496 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1497 int nbbs = loop->num_nodes;
1498 int i;
1499 stmt_vec_info stmt_info;
1500 bool need_to_vectorize = false;
1501 bool ok;
1502
1503 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1504
1505 stmt_vector_for_cost cost_vec;
1506 cost_vec.create (2);
1507
1508 for (i = 0; i < nbbs; i++)
1509 {
1510 basic_block bb = bbs[i];
1511
1512 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1513 gsi_next (&si))
1514 {
1515 gphi *phi = si.phi ();
1516 ok = true;
1517
1518 stmt_info = loop_vinfo->lookup_stmt (phi);
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1521 if (virtual_operand_p (gimple_phi_result (phi)))
1522 continue;
1523
1524 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1525 (i.e., a phi in the tail of the outer-loop). */
1526 if (! is_loop_header_bb_p (bb))
1527 {
1528 /* FORNOW: we currently don't support the case that these phis
1529 are not used in the outer loop (unless it is a double reduction,
1530 i.e., this phi is vect_reduction_def), because this case
1531 would require us to actually do something here. */
1532 if (STMT_VINFO_LIVE_P (stmt_info)
1533 && !vect_active_double_reduction_p (stmt_info))
1534 {
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "Unsupported loop-closed phi in "
1538 "outer-loop.\n");
1539 return false;
1540 }
1541
1542 /* If PHI is used in the outer loop, we check that its operand
1543 is defined in the inner loop. */
1544 if (STMT_VINFO_RELEVANT_P (stmt_info))
1545 {
1546 tree phi_op;
1547
1548 if (gimple_phi_num_args (phi) != 1)
1549 return false;
1550
1551 phi_op = PHI_ARG_DEF (phi, 0);
1552 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1553 if (!op_def_info)
1554 return false;
1555
1556 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1557 && (STMT_VINFO_RELEVANT (op_def_info)
1558 != vect_used_in_outer_by_reduction))
1559 return false;
1560 }
1561
1562 continue;
1563 }
1564
1565 gcc_assert (stmt_info);
1566
1567 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1568 || STMT_VINFO_LIVE_P (stmt_info))
1569 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1570 {
1571 /* A scalar-dependence cycle that we don't support. */
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 "not vectorized: scalar dependence cycle.\n");
1575 return false;
1576 }
1577
1578 if (STMT_VINFO_RELEVANT_P (stmt_info))
1579 {
1580 need_to_vectorize = true;
1581 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1582 && ! PURE_SLP_STMT (stmt_info))
1583 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1584 &cost_vec);
1585 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1586 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1587 && ! PURE_SLP_STMT (stmt_info))
1588 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1589 &cost_vec);
1590 }
1591
1592 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1593 if (ok
1594 && STMT_VINFO_LIVE_P (stmt_info)
1595 && !PURE_SLP_STMT (stmt_info))
1596 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1597 &cost_vec);
1598
1599 if (!ok)
1600 {
1601 if (dump_enabled_p ())
1602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1603 "not vectorized: relevant phi not "
1604 "supported: %G", phi);
1605 return false;
1606 }
1607 }
1608
1609 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1610 gsi_next (&si))
1611 {
1612 gimple *stmt = gsi_stmt (si);
1613 if (!gimple_clobber_p (stmt)
1614 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1615 &need_to_vectorize,
1616 NULL, NULL, &cost_vec))
1617 return false;
1618 }
1619 } /* bbs */
1620
1621 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1622 cost_vec.release ();
1623
1624 /* All operations in the loop are either irrelevant (deal with loop
1625 control, or dead), or only used outside the loop and can be moved
1626 out of the loop (e.g. invariants, inductions). The loop can be
1627 optimized away by scalar optimizations. We're better off not
1628 touching this loop. */
1629 if (!need_to_vectorize)
1630 {
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_NOTE, vect_location,
1633 "All the computation can be taken out of the loop.\n");
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "not vectorized: redundant loop. no profit to "
1637 "vectorize.\n");
1638 return false;
1639 }
1640
1641 return true;
1642 }
1643
1644 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1645 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1646 definitely no, or -1 if it's worth retrying. */
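/* Worked example (hypothetical numbers): with --param
   min-vect-loop-bound=2, an assumed vectorization factor of 4 and a
   computed min_profitable_iters of 12, the threshold becomes
   MAX (2 * 4, 12) = 12, so a loop whose known iteration count is below
   12 is rejected as not profitable.  */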
1647
1648 static int
1649 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1650 {
1651 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1652 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1653
1654 /* Only fully-masked loops can have iteration counts less than the
1655 vectorization factor. */
1656 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1657 {
1658 HOST_WIDE_INT max_niter;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1661 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1662 else
1663 max_niter = max_stmt_executions_int (loop);
1664
1665 if (max_niter != -1
1666 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1667 {
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "not vectorized: iteration count smaller than "
1671 "vectorization factor.\n");
1672 return 0;
1673 }
1674 }
1675
1676 int min_profitable_iters, min_profitable_estimate;
1677 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1678 &min_profitable_estimate);
1679
1680 if (min_profitable_iters < 0)
1681 {
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1684 "not vectorized: vectorization not profitable.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687 "not vectorized: vector version will never be "
1688 "profitable.\n");
1689 return -1;
1690 }
1691
1692 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1693 * assumed_vf);
1694
1695 /* Use the cost model only if it is more conservative than the
1696 user-specified threshold. */
1697 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1698 min_profitable_iters);
1699
1700 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1701
1702 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1703 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1704 {
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: vectorization not profitable.\n");
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE, vect_location,
1710 "not vectorized: iteration count smaller than user "
1711 "specified loop bound parameter or minimum profitable "
1712 "iterations (whichever is more conservative).\n");
1713 return 0;
1714 }
1715
1716 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1717 if (estimated_niter == -1)
1718 estimated_niter = likely_max_stmt_executions_int (loop);
1719 if (estimated_niter != -1
1720 && ((unsigned HOST_WIDE_INT) estimated_niter
1721 < MAX (th, (unsigned) min_profitable_estimate)))
1722 {
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 "not vectorized: estimated iteration count too "
1726 "small.\n");
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "not vectorized: estimated iteration count smaller "
1730 "than specified loop bound parameter or minimum "
1731 "profitable iterations (whichever is more "
1732 "conservative).\n");
1733 return -1;
1734 }
1735
1736 return 1;
1737 }
1738
1739 static bool
1740 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1741 vec<data_reference_p> *datarefs,
1742 unsigned int *n_stmts)
1743 {
1744 *n_stmts = 0;
1745 for (unsigned i = 0; i < loop->num_nodes; i++)
1746 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1747 !gsi_end_p (gsi); gsi_next (&gsi))
1748 {
1749 gimple *stmt = gsi_stmt (gsi);
1750 if (is_gimple_debug (stmt))
1751 continue;
1752 ++(*n_stmts);
1753 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1754 {
1755 if (is_gimple_call (stmt) && loop->safelen)
1756 {
1757 tree fndecl = gimple_call_fndecl (stmt), op;
1758 if (fndecl != NULL_TREE)
1759 {
1760 cgraph_node *node = cgraph_node::get (fndecl);
1761 if (node != NULL && node->simd_clones != NULL)
1762 {
1763 unsigned int j, n = gimple_call_num_args (stmt);
1764 for (j = 0; j < n; j++)
1765 {
1766 op = gimple_call_arg (stmt, j);
1767 if (DECL_P (op)
1768 || (REFERENCE_CLASS_P (op)
1769 && get_base_address (op)))
1770 break;
1771 }
1772 op = gimple_call_lhs (stmt);
1773 /* Ignore #pragma omp declare simd functions
1774 if they don't have data references in the
1775 call stmt itself. */
1776 if (j == n
1777 && !(op
1778 && (DECL_P (op)
1779 || (REFERENCE_CLASS_P (op)
1780 && get_base_address (op)))))
1781 continue;
1782 }
1783 }
1784 }
1785 return false;
1786 }
1787 /* If dependence analysis will give up due to the limit on the
1788 number of datarefs, stop here and fail fatally. */
1789 if (datarefs->length ()
1790 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1791 return false;
1792 }
1793 return true;
1794 }
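/* Illustrative source-level example (not from the original sources) of
   the "#pragma omp declare simd" special case handled above:

       #pragma omp declare simd
       extern int f (int);

       #pragma omp simd
       for (int i = 0; i < n; i++)
	 a[i] = f (b[i]);

   After gimplification the load from b[i] and the store to a[i] are
   separate statements, so the call statement itself has only SSA
   operands and no data reference; because the simd pragma sets
   loop->safelen and the declare simd pragma typically creates simd
   clones, the call is skipped rather than making the whole loop
   unanalyzable.  */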
1795
1796 /* Function vect_analyze_loop_2.
1797
1798 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1799 for it. The different analyses will record information in the
1800 loop_vec_info struct. */
1801 static bool
1802 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1803 {
1804 bool ok;
1805 int res;
1806 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1807 poly_uint64 min_vf = 2;
1808
1809 /* The first group of checks is independent of the vector size. */
1810 fatal = true;
1811
1812 /* Find all data references in the loop (which correspond to vdefs/vuses)
1813 and analyze their evolution in the loop. */
1814
1815 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1816
1817 /* Gather the data references and count stmts in the loop. */
1818 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1819 {
1820 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1821 &LOOP_VINFO_DATAREFS (loop_vinfo),
1822 n_stmts))
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "not vectorized: loop contains function "
1827 "calls or data references that cannot "
1828 "be analyzed\n");
1829 return false;
1830 }
1831 loop_vinfo->shared->save_datarefs ();
1832 }
1833 else
1834 loop_vinfo->shared->check_datarefs ();
1835
1836 /* Analyze the data references and also adjust the minimal
1837 vectorization factor according to the loads and stores. */
1838
1839 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1840 if (!ok)
1841 {
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data references.\n");
1845 return false;
1846 }
1847
1848 /* Classify all cross-iteration scalar data-flow cycles.
1849 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1850 vect_analyze_scalar_cycles (loop_vinfo);
1851
1852 vect_pattern_recog (loop_vinfo);
1853
1854 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1855
1856 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1857 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1858
1859 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1860 if (!ok)
1861 {
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "bad data access.\n");
1865 return false;
1866 }
1867
1868 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1869
1870 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1871 if (!ok)
1872 {
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "unexpected pattern.\n");
1876 return false;
1877 }
1878
1879 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is not necessarily fatal. */
1880 fatal = false;
1881
1882 /* Analyze data dependences between the data-refs in the loop
1883 and adjust the maximum vectorization factor according to
1884 the dependences.
1885 FORNOW: fail at the first data dependence that we encounter. */
1886
1887 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1888 if (!ok
1889 || (max_vf != MAX_VECTORIZATION_FACTOR
1890 && maybe_lt (max_vf, min_vf)))
1891 {
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894 "bad data dependence.\n");
1895 return false;
1896 }
1897 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1898
1899 ok = vect_determine_vectorization_factor (loop_vinfo);
1900 if (!ok)
1901 {
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "can't determine vectorization factor.\n");
1905 return false;
1906 }
1907 if (max_vf != MAX_VECTORIZATION_FACTOR
1908 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1909 {
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 "bad data dependence.\n");
1913 return false;
1914 }
1915
1916 /* Compute the scalar iteration cost. */
1917 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1918
1919 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1920 unsigned th;
1921
1922 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1923 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1924 if (!ok)
1925 return false;
1926
1927 /* If there are any SLP instances mark them as pure_slp. */
1928 bool slp = vect_make_slp_decision (loop_vinfo);
1929 if (slp)
1930 {
1931 /* Find stmts that need to be both vectorized and SLPed. */
1932 vect_detect_hybrid_slp (loop_vinfo);
1933
1934 /* Update the vectorization factor based on the SLP decision. */
1935 vect_update_vf_for_slp (loop_vinfo);
1936 }
1937
1938 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1939
1940 /* We don't expect to have to roll back to anything other than an empty
1941 set of rgroups. */
1942 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1943
1944 /* This is the point where we can re-start analysis with SLP forced off. */
1945 start_over:
1946
1947 /* Now the vectorization factor is final. */
1948 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1949 gcc_assert (known_ne (vectorization_factor, 0U));
1950
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1952 {
1953 dump_printf_loc (MSG_NOTE, vect_location,
1954 "vectorization_factor = ");
1955 dump_dec (MSG_NOTE, vectorization_factor);
1956 dump_printf (MSG_NOTE, ", niters = %wd\n",
1957 LOOP_VINFO_INT_NITERS (loop_vinfo));
1958 }
1959
1960 HOST_WIDE_INT max_niter
1961 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1962
1963 /* Analyze the alignment of the data-refs in the loop.
1964 Fail if a data reference is found that cannot be vectorized. */
1965
1966 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1967 if (!ok)
1968 {
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 "bad data alignment.\n");
1972 return false;
1973 }
1974
1975 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976 It is important to call pruning after vect_analyze_data_ref_accesses,
1977 since we use grouping information gathered by interleaving analysis. */
1978 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1979 if (!ok)
1980 return false;
1981
1982 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1983 vectorization, since we do not want to add extra peeling or
1984 add versioning for alignment. */
1985 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1986 /* This pass will decide on using loop versioning and/or loop peeling in
1987 order to enhance the alignment of data references in the loop. */
1988 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1989 else
1990 ok = vect_verify_datarefs_alignment (loop_vinfo);
1991 if (!ok)
1992 {
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995 "bad data alignment.\n");
1996 return false;
1997 }
1998
1999 if (slp)
2000 {
2001 /* Analyze operations in the SLP instances. Note this may
2002 remove unsupported SLP instances which makes the above
2003 SLP kind detection invalid. */
2004 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2005 vect_slp_analyze_operations (loop_vinfo);
2006 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2007 goto again;
2008 }
2009
2010 /* Scan all the remaining operations in the loop that are not subject
2011 to SLP and make sure they are vectorizable. */
2012 ok = vect_analyze_loop_operations (loop_vinfo);
2013 if (!ok)
2014 {
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017 "bad operation or unsupported loop bound.\n");
2018 return false;
2019 }
2020
2021 /* Decide whether to use a fully-masked loop for this vectorization
2022 factor. */
2023 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2024 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2025 && vect_verify_full_masking (loop_vinfo));
2026 if (dump_enabled_p ())
2027 {
2028 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "using a fully-masked loop.\n");
2031 else
2032 dump_printf_loc (MSG_NOTE, vect_location,
2033 "not using a fully-masked loop.\n");
2034 }
2035
2036 /* If epilog loop is required because of data accesses with gaps,
2037 one additional iteration needs to be peeled. Check if there is
2038 enough iterations for vectorization. */
2039 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2040 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2041 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2042 {
2043 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2044 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2045
2046 if (known_lt (wi::to_widest (scalar_niters), vf))
2047 {
2048 if (dump_enabled_p ())
2049 dump_printf_loc (MSG_NOTE, vect_location,
2050 "loop has no enough iterations to support"
2051 " peeling for gaps.\n");
2052 return false;
2053 }
2054 }
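 /* Worked example (made-up numbers) for the check above: with VF = 4
    and a known iteration count of 4, NITERSM1 = 3 is less than VF, so
    after peeling one iteration for the gap only three scalar iterations
    would remain, not enough for a single vector iteration, and the
    loop is rejected.  */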
2055
2056 /* Check the costings of the loop make vectorizing worthwhile. */
2057 res = vect_analyze_loop_costing (loop_vinfo);
2058 if (res < 0)
2059 goto again;
2060 if (!res)
2061 {
2062 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2064 "Loop costings not worthwhile.\n");
2065 return false;
2066 }
2067
2068 /* Decide whether we need to create an epilogue loop to handle
2069 remaining scalar iterations. */
2070 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2071
2072 unsigned HOST_WIDE_INT const_vf;
2073 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2074 /* The main loop handles all iterations. */
2075 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2076 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2077 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2078 {
2079 /* Work out the (constant) number of iterations that need to be
2080 peeled for reasons other than niters. */
2081 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2082 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2083 peel_niter += 1;
2084 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2085 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2086 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2087 }
2088 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2089 /* ??? When peeling for gaps but not alignment, we could
2090 try to check whether the (variable) niters is known to be
2091 VF * N + 1. That's something of a niche case though. */
2092 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2093 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2094 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2095 < (unsigned) exact_log2 (const_vf))
2096 /* In case of versioning, check if the maximum number of
2097 iterations is greater than th. If they are identical,
2098 the epilogue is unnecessary. */
2099 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2100 || ((unsigned HOST_WIDE_INT) max_niter
2101 > (th / const_vf) * const_vf))))
2102 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
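 /* Worked example (made-up numbers) for the constant-niters case above:
    with niters = 100, VF = 8, three iterations peeled for alignment and
    none for gaps, peel_niter = 3 and (100 - 3) % 8 != 0, so an epilogue
    is needed and PEELING_FOR_NITER is set; with niters = 67 instead,
    67 - 3 = 64 is a multiple of 8 and no epilogue is required on this
    account.  */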
2103
2104 /* If an epilogue loop is required make sure we can create one. */
2105 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2106 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2107 {
2108 if (dump_enabled_p ())
2109 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2110 if (!vect_can_advance_ivs_p (loop_vinfo)
2111 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2112 single_exit (LOOP_VINFO_LOOP
2113 (loop_vinfo))))
2114 {
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "not vectorized: can't create required "
2118 "epilog loop\n");
2119 goto again;
2120 }
2121 }
2122
2123 /* During peeling, we need to check if number of loop iterations is
2124 enough for both peeled prolog loop and vector loop. This check
2125 can be merged along with threshold check of loop versioning, so
2126 increase threshold for this case if necessary. */
2127 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2128 {
2129 poly_uint64 niters_th = 0;
2130
2131 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2132 {
2133 /* Niters for peeled prolog loop. */
2134 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2135 {
2136 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2138 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2139 }
2140 else
2141 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2142 }
2143
2144 /* Niters for at least one iteration of vectorized loop. */
2145 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2146 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2147 /* One additional iteration because of peeling for gap. */
2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2149 niters_th += 1;
2150 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2151 }
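 /* Worked example (made-up numbers) for the threshold computed above:
    with a VF of 4, unknown peeling for alignment (so up to
    TYPE_VECTOR_SUBPARTS - 1 = 3 prologue iterations) and peeling for
    gaps, niters_th = 3 + 4 + 1 = 8, i.e. roughly eight scalar
    iterations are needed before the versioned vector path pays off.  */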
2152
2153 gcc_assert (known_eq (vectorization_factor,
2154 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2155
2156 /* Ok to vectorize! */
2157 return true;
2158
2159 again:
2160 /* Try again with SLP forced off but if we didn't do any SLP there is
2161 no point in re-trying. */
2162 if (!slp)
2163 return false;
2164
2165 /* If there are reduction chains re-trying will fail anyway. */
2166 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2167 return false;
2168
2169 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2170 via interleaving or lane instructions. */
2171 slp_instance instance;
2172 slp_tree node;
2173 unsigned i, j;
2174 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2175 {
2176 stmt_vec_info vinfo;
2177 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2178 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2179 continue;
2180 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2181 unsigned int size = DR_GROUP_SIZE (vinfo);
2182 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2183 if (! vect_store_lanes_supported (vectype, size, false)
2184 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2185 && ! vect_grouped_store_supported (vectype, size))
2186 return false;
2187 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2188 {
2189 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2190 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2191 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2192 size = DR_GROUP_SIZE (vinfo);
2193 vectype = STMT_VINFO_VECTYPE (vinfo);
2194 if (! vect_load_lanes_supported (vectype, size, false)
2195 && ! vect_grouped_load_supported (vectype, single_element_p,
2196 size))
2197 return false;
2198 }
2199 }
2200
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_NOTE, vect_location,
2203 "re-trying with SLP disabled\n");
2204
2205 /* Roll back state appropriately. No SLP this time. */
2206 slp = false;
2207 /* Restore the vectorization factor to what it was without SLP. */
2208 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2209 /* Free the SLP instances. */
2210 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2211 vect_free_slp_instance (instance, false);
2212 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2213 /* Reset SLP type to loop_vect on all stmts. */
2214 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2215 {
2216 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2217 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2218 !gsi_end_p (si); gsi_next (&si))
2219 {
2220 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2221 STMT_SLP_TYPE (stmt_info) = loop_vect;
2222 }
2223 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2224 !gsi_end_p (si); gsi_next (&si))
2225 {
2226 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2227 STMT_SLP_TYPE (stmt_info) = loop_vect;
2228 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2229 {
2230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2232 STMT_SLP_TYPE (stmt_info) = loop_vect;
2233 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2234 !gsi_end_p (pi); gsi_next (&pi))
2235 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2236 = loop_vect;
2237 }
2238 }
2239 }
2240 /* Free optimized alias test DDRS. */
2241 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2242 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2243 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2244 /* Reset target cost data. */
2245 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2246 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2247 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2248 /* Reset accumulated rgroup information. */
2249 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2250 /* Reset assorted flags. */
2251 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2252 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2253 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2254 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2255 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2256
2257 goto start_over;
2258 }
2259
2260 /* Function vect_analyze_loop.
2261
2262 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2263 for it. The different analyses will record information in the
2264 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2265 be vectorized. */
2266 loop_vec_info
2267 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2268 vec_info_shared *shared)
2269 {
2270 loop_vec_info loop_vinfo;
2271 auto_vector_sizes vector_sizes;
2272
2273 /* Autodetect first vector size we try. */
2274 current_vector_size = 0;
2275 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
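 /* Illustrative note (target-dependent, not from the original sources):
    on x86_64 with AVX-512 enabled this hook typically reports vector
    sizes of 64, 32 and 16 bytes, so if analysis fails for the first
    (autodetected) size, the loop below retries with each remaining
    size before giving up.  */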
2276 unsigned int next_size = 0;
2277
2278 DUMP_VECT_SCOPE ("analyze_loop_nest");
2279
2280 if (loop_outer (loop)
2281 && loop_vec_info_for_loop (loop_outer (loop))
2282 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2283 {
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "outer-loop already vectorized.\n");
2287 return NULL;
2288 }
2289
2290 if (!find_loop_nest (loop, &shared->loop_nest))
2291 {
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "not vectorized: loop nest containing two "
2295 "or more consecutive inner loops cannot be "
2296 "vectorized\n");
2297 return NULL;
2298 }
2299
2300 unsigned n_stmts = 0;
2301 poly_uint64 autodetected_vector_size = 0;
2302 while (1)
2303 {
2304 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2305 loop_vinfo = vect_analyze_loop_form (loop, shared);
2306 if (!loop_vinfo)
2307 {
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 "bad loop form.\n");
2311 return NULL;
2312 }
2313
2314 bool fatal = false;
2315
2316 if (orig_loop_vinfo)
2317 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2318
2319 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2320 {
2321 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2322
2323 return loop_vinfo;
2324 }
2325
2326 delete loop_vinfo;
2327
2328 if (next_size == 0)
2329 autodetected_vector_size = current_vector_size;
2330
2331 if (next_size < vector_sizes.length ()
2332 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2333 next_size += 1;
2334
2335 if (fatal
2336 || next_size == vector_sizes.length ()
2337 || known_eq (current_vector_size, 0U))
2338 return NULL;
2339
2340 /* Try the next biggest vector size. */
2341 current_vector_size = vector_sizes[next_size++];
2342 if (dump_enabled_p ())
2343 {
2344 dump_printf_loc (MSG_NOTE, vect_location,
2345 "***** Re-trying analysis with "
2346 "vector size ");
2347 dump_dec (MSG_NOTE, current_vector_size);
2348 dump_printf (MSG_NOTE, "\n");
2349 }
2350 }
2351 }
2352
2353 /* Return true if there is an in-order reduction function for CODE, storing
2354 it in *REDUC_FN if so. */
2355
2356 static bool
2357 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2358 {
2359 switch (code)
2360 {
2361 case PLUS_EXPR:
2362 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2363 return true;
2364
2365 default:
2366 return false;
2367 }
2368 }
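/* Illustrative example (not from the original sources): for an
   in-order reduction such as

       double s = 0.0;
       for (int i = 0; i < n; i++)
	 s += a[i];

   compiled without -ffast-math/-fassociative-math, IFN_FOLD_LEFT_PLUS
   accumulates the vector elements strictly in order into the scalar
   accumulator, preserving the original rounding behaviour, instead of
   using a reassociating tree reduction.  */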
2369
2370 /* Function reduction_fn_for_scalar_code
2371
2372 Input:
2373 CODE - tree_code of a reduction operations.
2374
2375 Output:
2376 REDUC_FN - the corresponding internal function to be used to reduce the
2377 vector of partial results into a single scalar result, or IFN_LAST
2378 if the operation is a supported reduction operation, but does not have
2379 such an internal function.
2380
2381 Return FALSE if CODE currently cannot be vectorized as reduction. */
2382
2383 static bool
2384 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2385 {
2386 switch (code)
2387 {
2388 case MAX_EXPR:
2389 *reduc_fn = IFN_REDUC_MAX;
2390 return true;
2391
2392 case MIN_EXPR:
2393 *reduc_fn = IFN_REDUC_MIN;
2394 return true;
2395
2396 case PLUS_EXPR:
2397 *reduc_fn = IFN_REDUC_PLUS;
2398 return true;
2399
2400 case BIT_AND_EXPR:
2401 *reduc_fn = IFN_REDUC_AND;
2402 return true;
2403
2404 case BIT_IOR_EXPR:
2405 *reduc_fn = IFN_REDUC_IOR;
2406 return true;
2407
2408 case BIT_XOR_EXPR:
2409 *reduc_fn = IFN_REDUC_XOR;
2410 return true;
2411
2412 case MULT_EXPR:
2413 case MINUS_EXPR:
2414 *reduc_fn = IFN_LAST;
2415 return true;
2416
2417 default:
2418 return false;
2419 }
2420 }
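/* Illustrative example (not from the original sources): a loop
   computing a maximum, e.g.

       int m = a[0];
       for (int i = 1; i < n; i++)
	 m = m < a[i] ? a[i] : m;

   is typically folded to a MAX_EXPR reduction by earlier passes, and
   the vector of partial maxima produced by the vectorized body is then
   reduced to a single scalar with IFN_REDUC_MAX.  MULT_EXPR and
   MINUS_EXPR return IFN_LAST above: they can still be vectorized as
   reductions, but the final cross-lane reduction is open-coded rather
   than emitted as a single internal function call.  */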
2421
2422 /* If there is a neutral value X such that SLP reduction NODE would not
2423 be affected by the introduction of additional X elements, return that X,
2424 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2425 is true if the SLP statements perform a single reduction, false if each
2426 statement performs an independent reduction. */
2427
2428 static tree
2429 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2430 bool reduc_chain)
2431 {
2432 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2433 stmt_vec_info stmt_vinfo = stmts[0];
2434 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2435 tree scalar_type = TREE_TYPE (vector_type);
2436 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2437 gcc_assert (loop);
2438
2439 switch (code)
2440 {
2441 case WIDEN_SUM_EXPR:
2442 case DOT_PROD_EXPR:
2443 case SAD_EXPR:
2444 case PLUS_EXPR:
2445 case MINUS_EXPR:
2446 case BIT_IOR_EXPR:
2447 case BIT_XOR_EXPR:
2448 return build_zero_cst (scalar_type);
2449
2450 case MULT_EXPR:
2451 return build_one_cst (scalar_type);
2452
2453 case BIT_AND_EXPR:
2454 return build_all_ones_cst (scalar_type);
2455
2456 case MAX_EXPR:
2457 case MIN_EXPR:
2458 /* For MIN/MAX the initial values are neutral. A reduction chain
2459 has only a single initial value, so that value is neutral for
2460 all statements. */
2461 if (reduc_chain)
2462 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2463 loop_preheader_edge (loop));
2464 return NULL_TREE;
2465
2466 default:
2467 return NULL_TREE;
2468 }
2469 }
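/* Illustrative example (not from the original sources): for an SLP
   reduction with three independent accumulators,

       s0 += a[3*i];
       s1 += a[3*i + 1];
       s2 += a[3*i + 2];

   the accumulators may not fill a whole vector; padding the remaining
   lane(s) with the neutral value returned here (0 for PLUS_EXPR, 1 for
   MULT_EXPR, all-ones for BIT_AND_EXPR, ...) guarantees the extra
   elements do not change any of the final results.  */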
2470
2471 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2472 STMT is printed with a message MSG. */
2473
2474 static void
2475 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2476 {
2477 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2478 }
2479
2480 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2481 operation. Return true if the results of DEF_STMT_INFO are something
2482 that can be accumulated by such a reduction. */
2483
2484 static bool
2485 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2486 {
2487 return (is_gimple_assign (def_stmt_info->stmt)
2488 || is_gimple_call (def_stmt_info->stmt)
2489 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2490 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2491 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2492 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2493 }
2494
2495 /* Detect SLP reduction of the form:
2496
2497 #a1 = phi <a5, a0>
2498 a2 = operation (a1)
2499 a3 = operation (a2)
2500 a4 = operation (a3)
2501 a5 = operation (a4)
2502
2503 #a = phi <a5>
2504
2505 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2506 FIRST_STMT is the first reduction stmt in the chain
2507 (a2 = operation (a1)).
2508
2509 Return TRUE if a reduction chain was detected. */
2510
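/* Illustrative source-level form of such a chain (not from the
   original sources):

       for (int i = 0; i < n; i++)
	 s = s + a[2*i] + a[2*i + 1];

   which gimplifies to two chained additions per iteration feeding the
   reduction PHI, matching the a2/a3/... pattern above.  */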
2511 static bool
2512 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2513 gimple *first_stmt)
2514 {
2515 struct loop *loop = (gimple_bb (phi))->loop_father;
2516 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2517 enum tree_code code;
2518 gimple *loop_use_stmt = NULL;
2519 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2520 tree lhs;
2521 imm_use_iterator imm_iter;
2522 use_operand_p use_p;
2523 int nloop_uses, size = 0, n_out_of_loop_uses;
2524 bool found = false;
2525
2526 if (loop != vect_loop)
2527 return false;
2528
2529 lhs = PHI_RESULT (phi);
2530 code = gimple_assign_rhs_code (first_stmt);
2531 while (1)
2532 {
2533 nloop_uses = 0;
2534 n_out_of_loop_uses = 0;
2535 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2536 {
2537 gimple *use_stmt = USE_STMT (use_p);
2538 if (is_gimple_debug (use_stmt))
2539 continue;
2540
2541 /* Check if we got back to the reduction phi. */
2542 if (use_stmt == phi)
2543 {
2544 loop_use_stmt = use_stmt;
2545 found = true;
2546 break;
2547 }
2548
2549 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2550 {
2551 loop_use_stmt = use_stmt;
2552 nloop_uses++;
2553 }
2554 else
2555 n_out_of_loop_uses++;
2556
2557 /* There can be either a single use in the loop or two uses in
2558 phi nodes. */
2559 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2560 return false;
2561 }
2562
2563 if (found)
2564 break;
2565
2566 /* We reached a statement with no loop uses. */
2567 if (nloop_uses == 0)
2568 return false;
2569
2570 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2571 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2572 return false;
2573
2574 if (!is_gimple_assign (loop_use_stmt)
2575 || code != gimple_assign_rhs_code (loop_use_stmt)
2576 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2577 return false;
2578
2579 /* Insert USE_STMT into reduction chain. */
2580 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2581 if (current_stmt_info)
2582 {
2583 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2584 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2585 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2586 }
2587 else
2588 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2589
2590 lhs = gimple_assign_lhs (loop_use_stmt);
2591 current_stmt_info = use_stmt_info;
2592 size++;
2593 }
2594
2595 if (!found || loop_use_stmt != phi || size < 2)
2596 return false;
2597
2598 /* Swap the operands, if needed, to make the reduction operand be the second
2599 operand. */
2600 lhs = PHI_RESULT (phi);
2601 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2602 while (next_stmt_info)
2603 {
2604 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2605 if (gimple_assign_rhs2 (next_stmt) == lhs)
2606 {
2607 tree op = gimple_assign_rhs1 (next_stmt);
2608 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2609
2610 /* Check that the other def is either defined in the loop
2611 ("vect_internal_def"), or it's an induction (defined by a
2612 loop-header phi-node). */
2613 if (def_stmt_info
2614 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2615 && vect_valid_reduction_input_p (def_stmt_info))
2616 {
2617 lhs = gimple_assign_lhs (next_stmt);
2618 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2619 continue;
2620 }
2621
2622 return false;
2623 }
2624 else
2625 {
2626 tree op = gimple_assign_rhs2 (next_stmt);
2627 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2628
2629 /* Check that the other def is either defined in the loop
2630 ("vect_internal_def"), or it's an induction (defined by a
2631 loop-header phi-node). */
2632 if (def_stmt_info
2633 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2634 && vect_valid_reduction_input_p (def_stmt_info))
2635 {
2636 if (dump_enabled_p ())
2637 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2638 next_stmt);
2639
2640 swap_ssa_operands (next_stmt,
2641 gimple_assign_rhs1_ptr (next_stmt),
2642 gimple_assign_rhs2_ptr (next_stmt));
2643 update_stmt (next_stmt);
2644
2645 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2646 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2647 }
2648 else
2649 return false;
2650 }
2651
2652 lhs = gimple_assign_lhs (next_stmt);
2653 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2654 }
2655
2656 /* Save the chain for further analysis in SLP detection. */
2657 stmt_vec_info first_stmt_info
2658 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2659 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2660 REDUC_GROUP_SIZE (first_stmt_info) = size;
2661
2662 return true;
2663 }
2664
2665 /* Return true if we need an in-order reduction for operation CODE
2666 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2667 overflow must wrap. */
2668
2669 static bool
2670 needs_fold_left_reduction_p (tree type, tree_code code,
2671 bool need_wrapping_integral_overflow)
2672 {
2673 /* CHECKME: check for !flag_finite_math_only too? */
2674 if (SCALAR_FLOAT_TYPE_P (type))
2675 switch (code)
2676 {
2677 case MIN_EXPR:
2678 case MAX_EXPR:
2679 return false;
2680
2681 default:
2682 return !flag_associative_math;
2683 }
2684
2685 if (INTEGRAL_TYPE_P (type))
2686 {
2687 if (!operation_no_trapping_overflow (type, code))
2688 return true;
2689 if (need_wrapping_integral_overflow
2690 && !TYPE_OVERFLOW_WRAPS (type)
2691 && operation_can_overflow (code))
2692 return true;
2693 return false;
2694 }
2695
2696 if (SAT_FIXED_POINT_TYPE_P (type))
2697 return true;
2698
2699 return false;
2700 }
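/* Illustrative examples (not from the original sources) of the
   decision above: a float accumulation

       float s = 0.0f;
       for (int i = 0; i < n; i++)
	 s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math
   allows reassociation; a signed sum compiled with -ftrapv likewise
   does, because overflow must trap in the original evaluation order,
   whereas an ordinary unsigned (wrapping) sum does not.  */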
2701
2702 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2703 reduction operation CODE has a handled computation expression. */
2704
2705 bool
2706 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2707 tree loop_arg, enum tree_code code)
2708 {
2709 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2710 auto_bitmap visited;
2711 tree lookfor = PHI_RESULT (phi);
2712 ssa_op_iter curri;
2713 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2714 while (USE_FROM_PTR (curr) != loop_arg)
2715 curr = op_iter_next_use (&curri);
2716 curri.i = curri.numops;
2717 do
2718 {
2719 path.safe_push (std::make_pair (curri, curr));
2720 tree use = USE_FROM_PTR (curr);
2721 if (use == lookfor)
2722 break;
2723 gimple *def = SSA_NAME_DEF_STMT (use);
2724 if (gimple_nop_p (def)
2725 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2726 {
2727 pop:
2728 do
2729 {
2730 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2731 curri = x.first;
2732 curr = x.second;
2733 do
2734 curr = op_iter_next_use (&curri);
2735 /* Skip already visited or non-SSA operands (from iterating
2736 over PHI args). */
2737 while (curr != NULL_USE_OPERAND_P
2738 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2739 || ! bitmap_set_bit (visited,
2740 SSA_NAME_VERSION
2741 (USE_FROM_PTR (curr)))));
2742 }
2743 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2744 if (curr == NULL_USE_OPERAND_P)
2745 break;
2746 }
2747 else
2748 {
2749 if (gimple_code (def) == GIMPLE_PHI)
2750 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2751 else
2752 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2753 while (curr != NULL_USE_OPERAND_P
2754 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2755 || ! bitmap_set_bit (visited,
2756 SSA_NAME_VERSION
2757 (USE_FROM_PTR (curr)))))
2758 curr = op_iter_next_use (&curri);
2759 if (curr == NULL_USE_OPERAND_P)
2760 goto pop;
2761 }
2762 }
2763 while (1);
2764 if (dump_file && (dump_flags & TDF_DETAILS))
2765 {
2766 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2767 unsigned i;
2768 std::pair<ssa_op_iter, use_operand_p> *x;
2769 FOR_EACH_VEC_ELT (path, i, x)
2770 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2771 dump_printf (MSG_NOTE, "\n");
2772 }
2773
2774 /* Check whether the reduction path detected is valid. */
2775 bool fail = path.length () == 0;
2776 bool neg = false;
2777 for (unsigned i = 1; i < path.length (); ++i)
2778 {
2779 gimple *use_stmt = USE_STMT (path[i].second);
2780 tree op = USE_FROM_PTR (path[i].second);
2781 if (! has_single_use (op)
2782 || ! is_gimple_assign (use_stmt))
2783 {
2784 fail = true;
2785 break;
2786 }
2787 if (gimple_assign_rhs_code (use_stmt) != code)
2788 {
2789 if (code == PLUS_EXPR
2790 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2791 {
2792 /* Track whether we negate the reduction value each iteration. */
2793 if (gimple_assign_rhs2 (use_stmt) == op)
2794 neg = ! neg;
2795 }
2796 else
2797 {
2798 fail = true;
2799 break;
2800 }
2801 }
2802 }
2803 return ! fail && ! neg;
2804 }
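/* Illustrative example (not from the original sources) of a path the
   function above accepts for code == PLUS_EXPR:

       for (int i = 0; i < n; i++)
	 res = res + a[i] - b[i];

   The MINUS_EXPR on the path is allowed because subtracting b[i] does
   not negate the running reduction value itself; a statement such as
   "res = a[i] - res", which negates it every iteration, makes the
   path invalid.  */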
2805
2806
2807 /* Function vect_is_simple_reduction
2808
2809 (1) Detect a cross-iteration def-use cycle that represents a simple
2810 reduction computation. We look for the following pattern:
2811
2812 loop_header:
2813 a1 = phi < a0, a2 >
2814 a3 = ...
2815 a2 = operation (a3, a1)
2816
2817 or
2818
2819 a3 = ...
2820 loop_header:
2821 a1 = phi < a0, a2 >
2822 a2 = operation (a3, a1)
2823
2824 such that:
2825 1. operation is commutative and associative and it is safe to
2826 change the order of the computation
2827 2. no uses for a2 in the loop (a2 is used out of the loop)
2828 3. no uses of a1 in the loop besides the reduction operation
2829 4. no uses of a1 outside the loop.
2830
2831 Conditions 1,4 are tested here.
2832 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2833
2834 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2835 nested cycles.
2836
2837 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2838 reductions:
2839
2840 a1 = phi < a0, a2 >
2841 inner loop (def of a3)
2842 a2 = phi < a3 >
2843
2844 (4) Detect condition expressions, i.e.:
2845 for (int i = 0; i < N; i++)
2846 if (a[i] < val)
2847 ret_val = a[i];
2848
2849 */
2850
2851 static stmt_vec_info
2852 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2853 bool *double_reduc,
2854 bool need_wrapping_integral_overflow,
2855 enum vect_reduction_type *v_reduc_type)
2856 {
2857 gphi *phi = as_a <gphi *> (phi_info->stmt);
2858 struct loop *loop = (gimple_bb (phi))->loop_father;
2859 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2860 gimple *phi_use_stmt = NULL;
2861 enum tree_code orig_code, code;
2862 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2863 tree type;
2864 int nloop_uses;
2865 tree name;
2866 imm_use_iterator imm_iter;
2867 use_operand_p use_p;
2868 bool phi_def;
2869
2870 *double_reduc = false;
2871 *v_reduc_type = TREE_CODE_REDUCTION;
2872
2873 tree phi_name = PHI_RESULT (phi);
2874 /* ??? If there are no uses of the PHI result the inner loop reduction
2875 won't be detected as possibly double-reduction by vectorizable_reduction
2876 because that tries to walk the PHI arg from the preheader edge which
2877 can be constant. See PR60382. */
2878 if (has_zero_uses (phi_name))
2879 return NULL;
2880 nloop_uses = 0;
2881 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2882 {
2883 gimple *use_stmt = USE_STMT (use_p);
2884 if (is_gimple_debug (use_stmt))
2885 continue;
2886
2887 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2888 {
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "intermediate value used outside loop.\n");
2892
2893 return NULL;
2894 }
2895
2896 nloop_uses++;
2897 if (nloop_uses > 1)
2898 {
2899 if (dump_enabled_p ())
2900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2901 "reduction value used in loop.\n");
2902 return NULL;
2903 }
2904
2905 phi_use_stmt = use_stmt;
2906 }
2907
2908 edge latch_e = loop_latch_edge (loop);
2909 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2910 if (TREE_CODE (loop_arg) != SSA_NAME)
2911 {
2912 if (dump_enabled_p ())
2913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914 "reduction: not ssa_name: %T\n", loop_arg);
2915 return NULL;
2916 }
2917
2918 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2919 if (!def_stmt_info
2920 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2921 return NULL;
2922
2923 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2924 {
2925 name = gimple_assign_lhs (def_stmt);
2926 phi_def = false;
2927 }
2928 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2929 {
2930 name = PHI_RESULT (def_stmt);
2931 phi_def = true;
2932 }
2933 else
2934 {
2935 if (dump_enabled_p ())
2936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937 "reduction: unhandled reduction operation: %G",
2938 def_stmt_info->stmt);
2939 return NULL;
2940 }
2941
2942 nloop_uses = 0;
2943 auto_vec<gphi *, 3> lcphis;
2944 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2945 {
2946 gimple *use_stmt = USE_STMT (use_p);
2947 if (is_gimple_debug (use_stmt))
2948 continue;
2949 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2950 nloop_uses++;
2951 else
2952 /* We can have more than one loop-closed PHI. */
2953 lcphis.safe_push (as_a <gphi *> (use_stmt));
2954 if (nloop_uses > 1)
2955 {
2956 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2958 "reduction used in loop.\n");
2959 return NULL;
2960 }
2961 }
2962
2963 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2964 defined in the inner loop. */
2965 if (phi_def)
2966 {
2967 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2968 op1 = PHI_ARG_DEF (def_stmt, 0);
2969
2970 if (gimple_phi_num_args (def_stmt) != 1
2971 || TREE_CODE (op1) != SSA_NAME)
2972 {
2973 if (dump_enabled_p ())
2974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2975 "unsupported phi node definition.\n");
2976
2977 return NULL;
2978 }
2979
2980 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2981 if (gimple_bb (def1)
2982 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2983 && loop->inner
2984 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2985 && is_gimple_assign (def1)
2986 && is_a <gphi *> (phi_use_stmt)
2987 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2988 {
2989 if (dump_enabled_p ())
2990 report_vect_op (MSG_NOTE, def_stmt,
2991 "detected double reduction: ");
2992
2993 *double_reduc = true;
2994 return def_stmt_info;
2995 }
2996
2997 return NULL;
2998 }
2999
3000 /* If we are vectorizing an inner reduction we are executing that
3001 in the original order only in case we are not dealing with a
3002 double reduction. */
3003 bool check_reduction = true;
3004 if (flow_loop_nested_p (vect_loop, loop))
3005 {
3006 gphi *lcphi;
3007 unsigned i;
3008 check_reduction = false;
3009 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3010 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3011 {
3012 gimple *use_stmt = USE_STMT (use_p);
3013 if (is_gimple_debug (use_stmt))
3014 continue;
3015 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3016 check_reduction = true;
3017 }
3018 }
3019
3020 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3021 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3022 code = orig_code = gimple_assign_rhs_code (def_stmt);
3023
3024 /* We can handle "res -= x[i]", which is non-associative, by
3025 simply rewriting this into "res += -x[i]". Avoid changing the
3026 gimple instruction during the first simple tests and only do
3027 this if we're allowed to change the code at all. */
3028 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3029 code = PLUS_EXPR;
3030
3031 if (code == COND_EXPR)
3032 {
3033 if (! nested_in_vect_loop)
3034 *v_reduc_type = COND_REDUCTION;
3035
3036 op3 = gimple_assign_rhs1 (def_stmt);
3037 if (COMPARISON_CLASS_P (op3))
3038 {
3039 op4 = TREE_OPERAND (op3, 1);
3040 op3 = TREE_OPERAND (op3, 0);
3041 }
3042 if (op3 == phi_name || op4 == phi_name)
3043 {
3044 if (dump_enabled_p ())
3045 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3046 "reduction: condition depends on previous"
3047 " iteration: ");
3048 return NULL;
3049 }
3050
3051 op1 = gimple_assign_rhs2 (def_stmt);
3052 op2 = gimple_assign_rhs3 (def_stmt);
3053 }
3054 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3055 {
3056 if (dump_enabled_p ())
3057 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3058 "reduction: not commutative/associative: ");
3059 return NULL;
3060 }
3061 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3062 {
3063 op1 = gimple_assign_rhs1 (def_stmt);
3064 op2 = gimple_assign_rhs2 (def_stmt);
3065 }
3066 else
3067 {
3068 if (dump_enabled_p ())
3069 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3070 "reduction: not handled operation: ");
3071 return NULL;
3072 }
3073
3074 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3075 {
3076 if (dump_enabled_p ())
3077 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3078 "reduction: both uses not ssa_names: ");
3079
3080 return NULL;
3081 }
3082
3083 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3084 if ((TREE_CODE (op1) == SSA_NAME
3085 && !types_compatible_p (type,TREE_TYPE (op1)))
3086 || (TREE_CODE (op2) == SSA_NAME
3087 && !types_compatible_p (type, TREE_TYPE (op2)))
3088 || (op3 && TREE_CODE (op3) == SSA_NAME
3089 && !types_compatible_p (type, TREE_TYPE (op3)))
3090 || (op4 && TREE_CODE (op4) == SSA_NAME
3091 && !types_compatible_p (type, TREE_TYPE (op4))))
3092 {
3093 if (dump_enabled_p ())
3094 {
3095 dump_printf_loc (MSG_NOTE, vect_location,
3096 "reduction: multiple types: operation type: "
3097 "%T, operands types: %T,%T",
3098 type, TREE_TYPE (op1), TREE_TYPE (op2));
3099 if (op3)
3100 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3101
3102 if (op4)
3103 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3104 dump_printf (MSG_NOTE, "\n");
3105 }
3106
3107 return NULL;
3108 }
3109
3110 /* Check whether it's ok to change the order of the computation.
3111 Generally, when vectorizing a reduction we change the order of the
3112 computation. This may change the behavior of the program in some
3113 cases, so we need to check that this is ok. One exception is when
3114 vectorizing an outer-loop: the inner-loop is executed sequentially,
3115 and therefore vectorizing reductions in the inner-loop during
3116 outer-loop vectorization is safe. */
3117 if (check_reduction
3118 && *v_reduc_type == TREE_CODE_REDUCTION
3119 && needs_fold_left_reduction_p (type, code,
3120 need_wrapping_integral_overflow))
3121 *v_reduc_type = FOLD_LEFT_REDUCTION;
3122
3123 /* Reduction is safe. We're dealing with one of the following:
3124 1) integer arithmetic and no trapv
3125 2) floating point arithmetic, and special flags permit this optimization
3126 3) nested cycle (i.e., outer loop vectorization). */
3127 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3128 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3129 if (code != COND_EXPR && !def1_info && !def2_info)
3130 {
3131 if (dump_enabled_p ())
3132 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3133 return NULL;
3134 }
3135
3136 /* Check that one def is the reduction def, defined by PHI,
3137 the other def is either defined in the loop ("vect_internal_def"),
3138 or it's an induction (defined by a loop-header phi-node). */
3139
3140 if (def2_info
3141 && def2_info->stmt == phi
3142 && (code == COND_EXPR
3143 || !def1_info
3144 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3145 || vect_valid_reduction_input_p (def1_info)))
3146 {
3147 if (dump_enabled_p ())
3148 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3149 return def_stmt_info;
3150 }
3151
3152 if (def1_info
3153 && def1_info->stmt == phi
3154 && (code == COND_EXPR
3155 || !def2_info
3156 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3157 || vect_valid_reduction_input_p (def2_info)))
3158 {
3159 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3160 {
3161 /* Check if we can swap operands (just for simplicity - so that
3162 the rest of the code can assume that the reduction variable
3163 is always the last (second) argument). */
3164 if (code == COND_EXPR)
3165 {
3166 /* Swap cond_expr by inverting the condition. */
3167 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3168 enum tree_code invert_code = ERROR_MARK;
3169 enum tree_code cond_code = TREE_CODE (cond_expr);
3170
3171 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3172 {
3173 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3174 invert_code = invert_tree_comparison (cond_code, honor_nans);
3175 }
3176 if (invert_code != ERROR_MARK)
3177 {
3178 TREE_SET_CODE (cond_expr, invert_code);
3179 swap_ssa_operands (def_stmt,
3180 gimple_assign_rhs2_ptr (def_stmt),
3181 gimple_assign_rhs3_ptr (def_stmt));
3182 }
3183 else
3184 {
3185 if (dump_enabled_p ())
3186 report_vect_op (MSG_NOTE, def_stmt,
3187 "detected reduction: cannot swap operands "
3188 "for cond_expr");
3189 return NULL;
3190 }
3191 }
3192 else
3193 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3194 gimple_assign_rhs2_ptr (def_stmt));
3195
3196 if (dump_enabled_p ())
3197 report_vect_op (MSG_NOTE, def_stmt,
3198 "detected reduction: need to swap operands: ");
3199
3200 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3201 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3202 }
3203 else
3204 {
3205 if (dump_enabled_p ())
3206 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3207 }
3208
3209 return def_stmt_info;
3210 }
3211
3212 /* Try to find SLP reduction chain. */
3213 if (! nested_in_vect_loop
3214 && code != COND_EXPR
3215 && orig_code != MINUS_EXPR
3216 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3217 {
3218 if (dump_enabled_p ())
3219 report_vect_op (MSG_NOTE, def_stmt,
3220 "reduction: detected reduction chain: ");
3221
3222 return def_stmt_info;
3223 }
3224
3225 /* Dissolve any group that vect_is_slp_reduction may have half-built. */
3226 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3227 while (first)
3228 {
3229 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3230 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3231 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3232 first = next;
3233 }
3234
3235 /* Look for the expression computing loop_arg from loop PHI result. */
3236 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3237 return def_stmt_info;
3238
3239 if (dump_enabled_p ())
3240 {
3241 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3242 "reduction: unknown pattern: ");
3243 }
3244
3245 return NULL;
3246 }
3247
3248 /* Wrapper around vect_is_simple_reduction, which will modify code
3249 in-place if it enables detection of more reductions. Arguments
3250 as there. */
3251
3252 stmt_vec_info
3253 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3254 bool *double_reduc,
3255 bool need_wrapping_integral_overflow)
3256 {
3257 enum vect_reduction_type v_reduc_type;
3258 stmt_vec_info def_info
3259 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3260 need_wrapping_integral_overflow,
3261 &v_reduc_type);
3262 if (def_info)
3263 {
3264 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3265 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3266 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3267 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3268 }
3269 return def_info;
3270 }
3271
3272 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3273 int
3274 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3275 int *peel_iters_epilogue,
3276 stmt_vector_for_cost *scalar_cost_vec,
3277 stmt_vector_for_cost *prologue_cost_vec,
3278 stmt_vector_for_cost *epilogue_cost_vec)
3279 {
3280 int retval = 0;
3281 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3282
3283 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3284 {
3285 *peel_iters_epilogue = assumed_vf / 2;
3286 if (dump_enabled_p ())
3287 dump_printf_loc (MSG_NOTE, vect_location,
3288 "cost model: epilogue peel iters set to vf/2 "
3289 "because loop iterations are unknown .\n");
3290
3291 /* If peeled iterations are known but the number of scalar loop
3292 iterations is unknown, count a taken branch per peeled loop. */
3293 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3294 NULL, 0, vect_prologue);
3295 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3296 NULL, 0, vect_epilogue);
3297 }
3298 else
3299 {
3300 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3301 peel_iters_prologue = niters < peel_iters_prologue ?
3302 niters : peel_iters_prologue;
3303 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3304 /* If we need to peel for gaps, but no peeling is required, we have to
3305 peel VF iterations. */
3306 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3307 *peel_iters_epilogue = assumed_vf;
3308 }
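 /* Worked example (made-up numbers) for the known-niters branch above:
    with niters = 23, peel_iters_prologue = 3 and assumed_vf = 4 the
    epilogue gets (23 - 3) % 4 = 0 iterations; if the loop also needs
    peeling for gaps, the epilogue is then forced to a full VF of 4
    iterations.  */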
3309
3310 stmt_info_for_cost *si;
3311 int j;
3312 if (peel_iters_prologue)
3313 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3314 retval += record_stmt_cost (prologue_cost_vec,
3315 si->count * peel_iters_prologue,
3316 si->kind, si->stmt_info, si->misalign,
3317 vect_prologue);
3318 if (*peel_iters_epilogue)
3319 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3320 retval += record_stmt_cost (epilogue_cost_vec,
3321 si->count * *peel_iters_epilogue,
3322 si->kind, si->stmt_info, si->misalign,
3323 vect_epilogue);
3324
3325 return retval;
3326 }
3327
3328 /* Function vect_estimate_min_profitable_iters
3329
3330 Return the number of iterations required for the vector version of the
3331 loop to be profitable relative to the cost of the scalar version of the
3332 loop.
3333
3334 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3335 of iterations for vectorization. -1 value means loop vectorization
3336 is not profitable. This returned value may be used for dynamic
3337 profitability check.
3338
3339 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3340 for static check against estimated number of iterations. */
3341
3342 static void
3343 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3344 int *ret_min_profitable_niters,
3345 int *ret_min_profitable_estimate)
3346 {
3347 int min_profitable_iters;
3348 int min_profitable_estimate;
3349 int peel_iters_prologue;
3350 int peel_iters_epilogue;
3351 unsigned vec_inside_cost = 0;
3352 int vec_outside_cost = 0;
3353 unsigned vec_prologue_cost = 0;
3354 unsigned vec_epilogue_cost = 0;
3355 int scalar_single_iter_cost = 0;
3356 int scalar_outside_cost = 0;
3357 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3358 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3359 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3360
3361 /* Cost model disabled. */
3362 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3363 {
3364 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3365 *ret_min_profitable_niters = 0;
3366 *ret_min_profitable_estimate = 0;
3367 return;
3368 }
3369
3370 /* Requires loop versioning tests to handle misalignment. */
3371 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3372 {
3373 /* FIXME: Make cost depend on complexity of individual check. */
3374 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3375 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3376 vect_prologue);
3377 dump_printf (MSG_NOTE,
3378 "cost model: Adding cost of checks for loop "
3379 "versioning to treat misalignment.\n");
3380 }
3381
3382 /* Requires loop versioning with alias checks. */
3383 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3384 {
3385 /* FIXME: Make cost depend on complexity of individual check. */
3386 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3387 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3388 vect_prologue);
3389 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3390 if (len)
3391 /* Count LEN - 1 ANDs and LEN comparisons. */
3392 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3393 NULL, 0, vect_prologue);
3394 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3395 if (len)
3396 {
3397 /* Count LEN - 1 ANDs and LEN comparisons. */
3398 unsigned int nstmts = len * 2 - 1;
3399 /* +1 for each bias that needs adding. */
3400 for (unsigned int i = 0; i < len; ++i)
3401 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3402 nstmts += 1;
3403 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3404 NULL, 0, vect_prologue);
3405 }
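 /* Worked example (made-up numbers): three lower-bound checks
    (len = 3), one of them signed, cost len * 2 - 1 = 5 scalar
    statements for the comparisons and ANDs plus one more for the bias
    that needs adding, i.e. nstmts = 6.  */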
3406 dump_printf (MSG_NOTE,
3407 "cost model: Adding cost of checks for loop "
3408 "versioning aliasing.\n");
3409 }
3410
3411 /* Requires loop versioning with niter checks. */
3412 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3413 {
3414 /* FIXME: Make cost depend on complexity of individual check. */
3415 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3416 vect_prologue);
3417 dump_printf (MSG_NOTE,
3418 "cost model: Adding cost of checks for loop "
3419 "versioning niters.\n");
3420 }
3421
3422 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3423 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3424 vect_prologue);
3425
3426 /* Count statements in scalar loop. Using this as scalar cost for a single
3427 iteration for now.
3428
3429 TODO: Add outer loop support.
3430
3431 TODO: Consider assigning different costs to different scalar
3432 statements. */
3433
3434 scalar_single_iter_cost
3435 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3436
3437 /* Add additional cost for the peeled instructions in prologue and epilogue
3438 loop. (For fully-masked loops there will be no peeling.)
3439
3440 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3441 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3442
3443 TODO: Build an expression that represents peel_iters for prologue and
3444 epilogue to be used in a run-time test. */
3445
3446 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3447 {
3448 peel_iters_prologue = 0;
3449 peel_iters_epilogue = 0;
3450
3451 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3452 {
3453 /* We need to peel exactly one iteration. */
3454 peel_iters_epilogue += 1;
3455 stmt_info_for_cost *si;
3456 int j;
3457 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3458 j, si)
3459 (void) add_stmt_cost (target_cost_data, si->count,
3460 si->kind, si->stmt_info, si->misalign,
3461 vect_epilogue);
3462 }
3463 }
3464 else if (npeel < 0)
3465 {
3466 peel_iters_prologue = assumed_vf / 2;
3467 dump_printf (MSG_NOTE, "cost model: "
3468 "prologue peel iters set to vf/2.\n");
3469
3470 /* If peeling for alignment is unknown, the loop bound of the main
3471 loop becomes unknown. */
3472 peel_iters_epilogue = assumed_vf / 2;
3473 dump_printf (MSG_NOTE, "cost model: "
3474 "epilogue peel iters set to vf/2 because "
3475 "peeling for alignment is unknown.\n");
3476
3477 /* If peeled iterations are unknown, count a taken branch and a not taken
3478 branch per peeled loop. Even if scalar loop iterations are known,
3479 vector iterations are not known since peeled prologue iterations are
3480 not known. Hence guards remain the same. */
3481 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3482 NULL, 0, vect_prologue);
3483 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3484 NULL, 0, vect_prologue);
3485 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3486 NULL, 0, vect_epilogue);
3487 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3488 NULL, 0, vect_epilogue);
3489 stmt_info_for_cost *si;
3490 int j;
3491 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3492 {
3493 (void) add_stmt_cost (target_cost_data,
3494 si->count * peel_iters_prologue,
3495 si->kind, si->stmt_info, si->misalign,
3496 vect_prologue);
3497 (void) add_stmt_cost (target_cost_data,
3498 si->count * peel_iters_epilogue,
3499 si->kind, si->stmt_info, si->misalign,
3500 vect_epilogue);
3501 }
3502 }
3503 else
3504 {
3505 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3506 stmt_info_for_cost *si;
3507 int j;
3508 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3509
3510 prologue_cost_vec.create (2);
3511 epilogue_cost_vec.create (2);
3512 peel_iters_prologue = npeel;
3513
3514 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3515 &peel_iters_epilogue,
3516 &LOOP_VINFO_SCALAR_ITERATION_COST
3517 (loop_vinfo),
3518 &prologue_cost_vec,
3519 &epilogue_cost_vec);
3520
3521 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3522 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3523 si->misalign, vect_prologue);
3524
3525 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3526 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3527 si->misalign, vect_epilogue);
3528
3529 prologue_cost_vec.release ();
3530 epilogue_cost_vec.release ();
3531 }
3532
3533 /* FORNOW: The scalar outside cost is incremented in one of the
3534 following ways:
3535
3536 1. The vectorizer checks for alignment and aliasing and generates
3537 a condition that allows dynamic vectorization. A cost model
3538 check is ANDed with the versioning condition. Hence the scalar code
3539 path now has the added cost of the versioning check.
3540
3541 if (cost > th & versioning_check)
3542 jmp to vector code
3543
3544 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3545
3546 2. The vectorizer then checks if a prologue is required. If the
3547 cost model check was not done before during versioning, it has to
3548 be done before the prologue check.
3549
3550 if (cost <= th)
3551 prologue = scalar_iters
3552 if (prologue == 0)
3553 jmp to vector code
3554 else
3555 execute prologue
3556 if (prologue == num_iters)
3557 go to exit
3558
3559 Hence the run-time scalar cost is incremented by a taken branch,
3560 plus a not-taken branch, plus a taken branch cost.
3561
3562 3. The vectorizer then checks if an epilogue is required. If the
3563 cost model check was not done before during prologue check, it
3564 has to be done with the epilogue check.
3565
3566 if (prologue == 0)
3567 jmp to vector code
3568 else
3569 execute prologue
3570 if (prologue == num_iters)
3571 go to exit
3572 vector code:
3573 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3574 jmp to epilogue
3575
3576 Hence the run-time scalar cost should be incremented by 2 taken
3577 branches.
3578
3579 TODO: The back end may reorder the BBs differently and reverse
3580 conditions/branch directions. Change the estimates below to
3581 something more reasonable. */
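/* Illustrative numbers only: assuming the generic default branch costs of
   cond_branch_taken == 3 and cond_branch_not_taken == 1, the increments below
   are +1 when versioning is required, 2*3 + 1 == 7 when peeling for alignment
   is unknown, and 2*3 == 6 otherwise.  The real values come from
   vect_get_stmt_cost and are target-specific. */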
3582
3583 /* If the number of iterations is known and we do not do versioning, we can
3584 decide whether to vectorize at compile time. Hence the scalar version
3585 does not carry cost model guard costs. */
3586 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3587 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3588 {
3589 /* Cost model check occurs at versioning. */
3590 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3591 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3592 else
3593 {
3594 /* Cost model check occurs at prologue generation. */
3595 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3596 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3597 + vect_get_stmt_cost (cond_branch_not_taken);
3598 /* Cost model check occurs at epilogue generation. */
3599 else
3600 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3601 }
3602 }
3603
3604 /* Complete the target-specific cost calculations. */
3605 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3606 &vec_inside_cost, &vec_epilogue_cost);
3607
3608 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3609
3610 if (dump_enabled_p ())
3611 {
3612 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3613 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3614 vec_inside_cost);
3615 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3616 vec_prologue_cost);
3617 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3618 vec_epilogue_cost);
3619 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3620 scalar_single_iter_cost);
3621 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3622 scalar_outside_cost);
3623 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3624 vec_outside_cost);
3625 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3626 peel_iters_prologue);
3627 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3628 peel_iters_epilogue);
3629 }
3630
3631 /* Calculate number of iterations required to make the vector version
3632 profitable, relative to the loop bodies only. The following condition
3633 must hold true:
3634 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3635 where
3636 SIC = scalar iteration cost, VIC = vector iteration cost,
3637 VOC = vector outside cost, VF = vectorization factor,
3638 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3639 SOC = scalar outside cost for run time cost model check. */
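/* Worked example with illustrative numbers (no particular target):
   SIC = 4, VIC = 6, VOC = 20, SOC = 2, VF = 4, PL_ITERS = EP_ITERS = 2.
   The division below gives ((20 - 2) * 4 - 6*2 - 6*2) / (4*4 - 6) = 48/10 = 4,
   which is then bumped to 5 because at 4 iterations 4*4*4 = 64 is still
   <= 6*4 + (20 - 2)*4 = 96, i.e. the scalar version is not yet strictly
   more expensive. */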
3640
3641 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3642 {
3643 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3644 * assumed_vf
3645 - vec_inside_cost * peel_iters_prologue
3646 - vec_inside_cost * peel_iters_epilogue);
3647 if (min_profitable_iters <= 0)
3648 min_profitable_iters = 0;
3649 else
3650 {
3651 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3652 - vec_inside_cost);
3653
3654 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3655 <= (((int) vec_inside_cost * min_profitable_iters)
3656 + (((int) vec_outside_cost - scalar_outside_cost)
3657 * assumed_vf)))
3658 min_profitable_iters++;
3659 }
3660 }
3661 /* The vector version will never be profitable. */
3662 else
3663 {
3664 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3665 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3666 "vectorization did not happen for a simd loop");
3667
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3670 "cost model: the vector iteration cost = %d "
3671 "divided by the scalar iteration cost = %d "
3672 "is greater or equal to the vectorization factor = %d"
3673 ".\n",
3674 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3675 *ret_min_profitable_niters = -1;
3676 *ret_min_profitable_estimate = -1;
3677 return;
3678 }
3679
3680 dump_printf (MSG_NOTE,
3681 " Calculated minimum iters for profitability: %d\n",
3682 min_profitable_iters);
3683
3684 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3685 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3686 /* We want the vectorized loop to execute at least once. */
3687 min_profitable_iters = assumed_vf + peel_iters_prologue;
3688
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 " Runtime profitability threshold = %d\n",
3692 min_profitable_iters);
3693
3694 *ret_min_profitable_niters = min_profitable_iters;
3695
3696 /* Calculate number of iterations required to make the vector version
3697 profitable, relative to the loop bodies only.
3698
3699 The non-vectorized variant costs SIC * niters and it must win over the
3700 vector variant on the expected loop trip count, i.e. the following holds:
3701 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
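/* Rearranging the inequality gives
   niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS)) / (SIC * VF - VIC),
   which is what the division below computes.  With the illustrative numbers
   used above this is (22*4 - 6*4) / (4*4 - 6) = 64/10 = 6; the MAX below then
   keeps the estimate at or above min_profitable_iters. */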
3702
3703 if (vec_outside_cost <= 0)
3704 min_profitable_estimate = 0;
3705 else
3706 {
3707 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3708 * assumed_vf
3709 - vec_inside_cost * peel_iters_prologue
3710 - vec_inside_cost * peel_iters_epilogue)
3711 / ((scalar_single_iter_cost * assumed_vf)
3712 - vec_inside_cost);
3713 }
3714 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3715 if (dump_enabled_p ())
3716 dump_printf_loc (MSG_NOTE, vect_location,
3717 " Static estimate profitability threshold = %d\n",
3718 min_profitable_estimate);
3719
3720 *ret_min_profitable_estimate = min_profitable_estimate;
3721 }
3722
3723 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3724 vector elements (not bits) for a vector with NELT elements. */
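/* For example (purely illustrative), OFFSET == 2 and NELT == 8 push the
   series 2, 3, 4, which vec_perm_indices extends to 2, 3, ..., 9: output
   element J takes input element J + 2, and the last two output elements come
   from the start of the second vec_perm input. */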
3725 static void
3726 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3727 vec_perm_builder *sel)
3728 {
3729 /* The encoding is a single stepped pattern. Any wrap-around is handled
3730 by vec_perm_indices. */
3731 sel->new_vector (nelt, 1, 3);
3732 for (unsigned int i = 0; i < 3; i++)
3733 sel->quick_push (i + offset);
3734 }
3735
3736 /* Checks whether the target supports whole-vector shifts for vectors of mode
3737 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3738 it supports vec_perm_const with masks for all necessary shift amounts. */
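/* Note: the shift amounts that matter are NELT/2, NELT/4, ..., 1, i.e. the
   halving sequence used by the log2-style reduction epilogue; the loop below
   checks exactly those. */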
3739 static bool
3740 have_whole_vector_shift (machine_mode mode)
3741 {
3742 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3743 return true;
3744
3745 /* Variable-length vectors should be handled via the optab. */
3746 unsigned int nelt;
3747 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3748 return false;
3749
3750 vec_perm_builder sel;
3751 vec_perm_indices indices;
3752 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3753 {
3754 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3755 indices.new_vector (sel, 2, nelt);
3756 if (!can_vec_perm_const_p (mode, indices, false))
3757 return false;
3758 }
3759 return true;
3760 }
3761
3762 /* TODO: There is a close dependency between the vect_model_*_cost and
3763 vectorizable_* functions.  Design them better to avoid maintenance issues. */
3764
3765 /* Function vect_model_reduction_cost.
3766
3767 Models cost for a reduction operation, including the vector ops
3768 generated within the strip-mine loop, the initial definition before
3769 the loop, and the epilogue code that must be generated. */
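/* As an illustration of the buckets used below: a plain PLUS reduction with
   ncopies == 1, a target-supported REDUC_FN and no outer-loop nesting records
   one scalar_to_vec in the prologue, one vector_stmt in the loop body, and
   one vector_stmt plus one vec_to_scalar in the epilogue. */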
3770
3771 static void
3772 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3773 int ncopies, stmt_vector_for_cost *cost_vec)
3774 {
3775 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3776 enum tree_code code;
3777 optab optab;
3778 tree vectype;
3779 machine_mode mode;
3780 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3781 struct loop *loop = NULL;
3782
3783 if (loop_vinfo)
3784 loop = LOOP_VINFO_LOOP (loop_vinfo);
3785
3786 /* Condition reductions generate two reductions in the loop. */
3787 vect_reduction_type reduction_type
3788 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3789 if (reduction_type == COND_REDUCTION)
3790 ncopies *= 2;
3791
3792 vectype = STMT_VINFO_VECTYPE (stmt_info);
3793 mode = TYPE_MODE (vectype);
3794 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3795
3796 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3797
3798 if (reduction_type == EXTRACT_LAST_REDUCTION
3799 || reduction_type == FOLD_LEFT_REDUCTION)
3800 {
3801 /* No extra instructions needed in the prologue. */
3802 prologue_cost = 0;
3803
3804 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3805 /* Count one reduction-like operation per vector. */
3806 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3807 stmt_info, 0, vect_body);
3808 else
3809 {
3810 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3811 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3812 inside_cost = record_stmt_cost (cost_vec, nelements,
3813 vec_to_scalar, stmt_info, 0,
3814 vect_body);
3815 inside_cost += record_stmt_cost (cost_vec, nelements,
3816 scalar_stmt, stmt_info, 0,
3817 vect_body);
3818 }
3819 }
3820 else
3821 {
3822 /* Add in cost for initial definition.
3823 For cond reduction we have four vectors: initial index, step,
3824 initial result of the data reduction, initial value of the index
3825 reduction. */
3826 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3827 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3828 scalar_to_vec, stmt_info, 0,
3829 vect_prologue);
3830
3831 /* Cost of reduction op inside loop. */
3832 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3833 stmt_info, 0, vect_body);
3834 }
3835
3836 /* Determine cost of epilogue code.
3837
3838 We have a reduction operator that will reduce the vector in one statement.
3839 Also requires scalar extract. */
3840
3841 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3842 {
3843 if (reduc_fn != IFN_LAST)
3844 {
3845 if (reduction_type == COND_REDUCTION)
3846 {
3847 /* An EQ stmt and a COND_EXPR stmt. */
3848 epilogue_cost += record_stmt_cost (cost_vec, 2,
3849 vector_stmt, stmt_info, 0,
3850 vect_epilogue);
3851 /* Reduction of the max index and a reduction of the found
3852 values. */
3853 epilogue_cost += record_stmt_cost (cost_vec, 2,
3854 vec_to_scalar, stmt_info, 0,
3855 vect_epilogue);
3856 /* A broadcast of the max value. */
3857 epilogue_cost += record_stmt_cost (cost_vec, 1,
3858 scalar_to_vec, stmt_info, 0,
3859 vect_epilogue);
3860 }
3861 else
3862 {
3863 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3864 stmt_info, 0, vect_epilogue);
3865 epilogue_cost += record_stmt_cost (cost_vec, 1,
3866 vec_to_scalar, stmt_info, 0,
3867 vect_epilogue);
3868 }
3869 }
3870 else if (reduction_type == COND_REDUCTION)
3871 {
3872 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3873 /* Extraction of scalar elements. */
3874 epilogue_cost += record_stmt_cost (cost_vec,
3875 2 * estimated_nunits,
3876 vec_to_scalar, stmt_info, 0,
3877 vect_epilogue);
3878 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3879 epilogue_cost += record_stmt_cost (cost_vec,
3880 2 * estimated_nunits - 3,
3881 scalar_stmt, stmt_info, 0,
3882 vect_epilogue);
3883 }
3884 else if (reduction_type == EXTRACT_LAST_REDUCTION
3885 || reduction_type == FOLD_LEFT_REDUCTION)
3886 /* No extra instructions needed in the epilogue. */
3887 ;
3888 else
3889 {
3890 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3891 tree bitsize
3892 = TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3893 int element_bitsize = tree_to_uhwi (bitsize);
3894 int nelements = vec_size_in_bits / element_bitsize;
3895
3896 if (code == COND_EXPR)
3897 code = MAX_EXPR;
3898
3899 optab = optab_for_tree_code (code, vectype, optab_default);
3900
3901 /* We have a whole vector shift available. */
3902 if (optab != unknown_optab
3903 && VECTOR_MODE_P (mode)
3904 && optab_handler (optab, mode) != CODE_FOR_nothing
3905 && have_whole_vector_shift (mode))
3906 {
3907 /* Final reduction via vector shifts and the reduction operator.
3908 Also requires scalar extract. */
3909 epilogue_cost += record_stmt_cost (cost_vec,
3910 exact_log2 (nelements) * 2,
3911 vector_stmt, stmt_info, 0,
3912 vect_epilogue);
3913 epilogue_cost += record_stmt_cost (cost_vec, 1,
3914 vec_to_scalar, stmt_info, 0,
3915 vect_epilogue);
3916 }
3917 else
3918 /* Use extracts and reduction op for final reduction. For N
3919 elements, we have N extracts and N-1 reduction ops. */
3920 epilogue_cost += record_stmt_cost (cost_vec,
3921 nelements + nelements - 1,
3922 vector_stmt, stmt_info, 0,
3923 vect_epilogue);
3924 }
3925 }
3926
3927 if (dump_enabled_p ())
3928 dump_printf (MSG_NOTE,
3929 "vect_model_reduction_cost: inside_cost = %d, "
3930 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3931 prologue_cost, epilogue_cost);
3932 }
3933
3934
3935 /* Function vect_model_induction_cost.
3936
3937 Models cost for induction operations. */
3938
3939 static void
3940 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3941 stmt_vector_for_cost *cost_vec)
3942 {
3943 unsigned inside_cost, prologue_cost;
3944
3945 if (PURE_SLP_STMT (stmt_info))
3946 return;
3947
3948 /* Loop cost for vec_loop. */
3949 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3950 stmt_info, 0, vect_body);
3951
3952 /* Prologue cost for vec_init and vec_step. */
3953 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3954 stmt_info, 0, vect_prologue);
3955
3956 if (dump_enabled_p ())
3957 dump_printf_loc (MSG_NOTE, vect_location,
3958 "vect_model_induction_cost: inside_cost = %d, "
3959 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3960 }
3961
3962
3963
3964 /* Function get_initial_def_for_reduction
3965
3966 Input:
3967 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3968 INIT_VAL - the initial value of the reduction variable
3969
3970 Output:
3971 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3972 of the reduction (used for adjusting the epilog - see below).
3973 Return a vector variable, initialized according to the operation that
3974 STMT_VINFO performs. This vector will be used as the initial value
3975 of the vector of partial results.
3976
3977 Option1 (adjust in epilog): Initialize the vector as follows:
3978 add/bit or/xor: [0,0,...,0,0]
3979 mult/bit and: [1,1,...,1,1]
3980 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3981 and when necessary (e.g. add/mult case) let the caller know
3982 that it needs to adjust the result by init_val.
3983
3984 Option2: Initialize the vector as follows:
3985 add/bit or/xor: [init_val,0,0,...,0]
3986 mult/bit and: [init_val,1,1,...,1]
3987 min/max/cond_expr: [init_val,init_val,...,init_val]
3988 and no adjustments are needed.
3989
3990 For example, for the following code:
3991
3992 s = init_val;
3993 for (i=0;i<n;i++)
3994 s = s + a[i];
3995
3996 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3997 For a vector of 4 units, we want to return either [0,0,0,init_val],
3998 or [0,0,0,0] and let the caller know that it needs to adjust
3999 the result at the end by 'init_val'.
4000
4001 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4002 is not NULL, because its initialization vector is simpler (the same
4003 element in all entries), and Option2 otherwise.
4004
4005 A cost model should help decide between these two schemes. */
4006
4007 tree
4008 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4009 tree *adjustment_def)
4010 {
4011 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4012 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4013 tree scalar_type = TREE_TYPE (init_val);
4014 tree vectype = get_vectype_for_scalar_type (scalar_type);
4015 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4016 tree def_for_init;
4017 tree init_def;
4018 REAL_VALUE_TYPE real_init_val = dconst0;
4019 int int_init_val = 0;
4020 gimple_seq stmts = NULL;
4021
4022 gcc_assert (vectype);
4023
4024 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4025 || SCALAR_FLOAT_TYPE_P (scalar_type));
4026
4027 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4028 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4029
4030 vect_reduction_type reduction_type
4031 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4032
4033 switch (code)
4034 {
4035 case WIDEN_SUM_EXPR:
4036 case DOT_PROD_EXPR:
4037 case SAD_EXPR:
4038 case PLUS_EXPR:
4039 case MINUS_EXPR:
4040 case BIT_IOR_EXPR:
4041 case BIT_XOR_EXPR:
4042 case MULT_EXPR:
4043 case BIT_AND_EXPR:
4044 {
4045 /* ADJUSTMENT_DEF is NULL when called from
4046 vect_create_epilog_for_reduction to vectorize double reduction. */
4047 if (adjustment_def)
4048 *adjustment_def = init_val;
4049
4050 if (code == MULT_EXPR)
4051 {
4052 real_init_val = dconst1;
4053 int_init_val = 1;
4054 }
4055
4056 if (code == BIT_AND_EXPR)
4057 int_init_val = -1;
4058
4059 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4060 def_for_init = build_real (scalar_type, real_init_val);
4061 else
4062 def_for_init = build_int_cst (scalar_type, int_init_val);
4063
4064 if (adjustment_def)
4065 /* Option1: the first element is '0' or '1' as well. */
4066 init_def = gimple_build_vector_from_val (&stmts, vectype,
4067 def_for_init);
4068 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4069 {
4070 /* Option2 (variable length): the first element is INIT_VAL. */
4071 init_def = gimple_build_vector_from_val (&stmts, vectype,
4072 def_for_init);
4073 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4074 vectype, init_def, init_val);
4075 }
4076 else
4077 {
4078 /* Option2: the first element is INIT_VAL. */
4079 tree_vector_builder elts (vectype, 1, 2);
4080 elts.quick_push (init_val);
4081 elts.quick_push (def_for_init);
4082 init_def = gimple_build_vector (&stmts, &elts);
4083 }
4084 }
4085 break;
4086
4087 case MIN_EXPR:
4088 case MAX_EXPR:
4089 case COND_EXPR:
4090 {
4091 if (adjustment_def)
4092 {
4093 *adjustment_def = NULL_TREE;
4094 if (reduction_type != COND_REDUCTION
4095 && reduction_type != EXTRACT_LAST_REDUCTION)
4096 {
4097 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4098 break;
4099 }
4100 }
4101 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4102 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4103 }
4104 break;
4105
4106 default:
4107 gcc_unreachable ();
4108 }
4109
4110 if (stmts)
4111 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4112 return init_def;
4113 }
4114
4115 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4116 NUMBER_OF_VECTORS is the number of vector defs to create.
4117 If NEUTRAL_OP is nonnull, introducing extra elements of that
4118 value will not change the result. */
4119
4120 static void
4121 get_initial_defs_for_reduction (slp_tree slp_node,
4122 vec<tree> *vec_oprnds,
4123 unsigned int number_of_vectors,
4124 bool reduc_chain, tree neutral_op)
4125 {
4126 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4127 stmt_vec_info stmt_vinfo = stmts[0];
4128 unsigned HOST_WIDE_INT nunits;
4129 unsigned j, number_of_places_left_in_vector;
4130 tree vector_type;
4131 tree vop;
4132 int group_size = stmts.length ();
4133 unsigned int vec_num, i;
4134 unsigned number_of_copies = 1;
4135 vec<tree> voprnds;
4136 voprnds.create (number_of_vectors);
4137 struct loop *loop;
4138 auto_vec<tree, 16> permute_results;
4139
4140 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4141
4142 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4143
4144 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4145 gcc_assert (loop);
4146 edge pe = loop_preheader_edge (loop);
4147
4148 gcc_assert (!reduc_chain || neutral_op);
4149
4150 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4151 created vectors. It is greater than 1 if unrolling is performed.
4152
4153 For example, we have two scalar operands, s1 and s2 (e.g., group of
4154 strided accesses of size two), while NUNITS is four (i.e., four scalars
4155 of this type can be packed in a vector). The output vector will contain
4156 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4157 will be 2).
4158
4159 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4160 vectors containing the operands.
4161
4162 For example, NUNITS is four as before, and the group size is 8
4163 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4164 {s5, s6, s7, s8}. */
4165
4166 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4167 nunits = group_size;
4168
4169 number_of_copies = nunits * number_of_vectors / group_size;
4170
4171 number_of_places_left_in_vector = nunits;
4172 bool constant_p = true;
4173 tree_vector_builder elts (vector_type, nunits, 1);
4174 elts.quick_grow (nunits);
4175 for (j = 0; j < number_of_copies; j++)
4176 {
4177 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4178 {
4179 tree op;
4180 /* Get the def before the loop.  In a reduction chain we have only
4181 one initial value. */
4182 if ((j != (number_of_copies - 1)
4183 || (reduc_chain && i != 0))
4184 && neutral_op)
4185 op = neutral_op;
4186 else
4187 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4188
4189 /* Create 'vect_ = {op0,op1,...,opn}'. */
4190 number_of_places_left_in_vector--;
4191 elts[number_of_places_left_in_vector] = op;
4192 if (!CONSTANT_CLASS_P (op))
4193 constant_p = false;
4194
4195 if (number_of_places_left_in_vector == 0)
4196 {
4197 gimple_seq ctor_seq = NULL;
4198 tree init;
4199 if (constant_p && !neutral_op
4200 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4201 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4202 /* Build the vector directly from ELTS. */
4203 init = gimple_build_vector (&ctor_seq, &elts);
4204 else if (neutral_op)
4205 {
4206 /* Build a vector of the neutral value and shift the
4207 other elements into place. */
4208 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4209 neutral_op);
4210 int k = nunits;
4211 while (k > 0 && elts[k - 1] == neutral_op)
4212 k -= 1;
4213 while (k > 0)
4214 {
4215 k -= 1;
4216 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4217 vector_type, init, elts[k]);
4218 }
4219 }
4220 else
4221 {
4222 /* First time round, duplicate ELTS to fill the
4223 required number of vectors, then cherry pick the
4224 appropriate result for each iteration. */
4225 if (vec_oprnds->is_empty ())
4226 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4227 number_of_vectors,
4228 permute_results);
4229 init = permute_results[number_of_vectors - j - 1];
4230 }
4231 if (ctor_seq != NULL)
4232 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4233 voprnds.quick_push (init);
4234
4235 number_of_places_left_in_vector = nunits;
4236 elts.new_vector (vector_type, nunits, 1);
4237 elts.quick_grow (nunits);
4238 constant_p = true;
4239 }
4240 }
4241 }
4242
4243 /* Since the vectors are created in the reverse order, we should reverse
4244 them. */
4245 vec_num = voprnds.length ();
4246 for (j = vec_num; j != 0; j--)
4247 {
4248 vop = voprnds[j - 1];
4249 vec_oprnds->quick_push (vop);
4250 }
4251
4252 voprnds.release ();
4253
4254 /* In case the VF is greater than the unrolling factor needed for the SLP
4255 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4256 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4257 to replicate the vectors. */
4258 tree neutral_vec = NULL;
4259 while (number_of_vectors > vec_oprnds->length ())
4260 {
4261 if (neutral_op)
4262 {
4263 if (!neutral_vec)
4264 {
4265 gimple_seq ctor_seq = NULL;
4266 neutral_vec = gimple_build_vector_from_val
4267 (&ctor_seq, vector_type, neutral_op);
4268 if (ctor_seq != NULL)
4269 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4270 }
4271 vec_oprnds->quick_push (neutral_vec);
4272 }
4273 else
4274 {
4275 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4276 vec_oprnds->quick_push (vop);
4277 }
4278 }
4279 }
4280
4281
4282 /* Function vect_create_epilog_for_reduction
4283
4284 Create code at the loop-epilog to finalize the result of a reduction
4285 computation.
4286
4287 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the
4288 vector reduction statements.
4289 STMT_INFO is the scalar reduction stmt that is being vectorized.
4290 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4291 number of elements that we can fit in a vectype (nunits). In this case
4292 we have to generate more than one vector stmt, i.e., we need to "unroll"
4293 the vector stmt by a factor VF/nunits. For more details see documentation
4294 in vectorizable_operation.
4295 REDUC_FN is the internal function for the epilog reduction.
4296 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4297 computation.
4298 REDUC_INDEX is the index of the operand in the right hand side of the
4299 statement that is defined by REDUCTION_PHI.
4300 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4301 SLP_NODE is an SLP node containing a group of reduction statements. The
4302 first one in this group is STMT_INFO.
4303 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4304 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4305 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4306 any value of the IV in the loop.
4307 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4308 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4309 null if this is not an SLP reduction.
4310
4311 This function:
4312 1. Creates the reduction def-use cycles: sets the arguments for
4313 REDUCTION_PHIS:
4314 The loop-entry argument is the vectorized initial-value of the reduction.
4315 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4316 sums.
4317 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4318 by calling the function specified by REDUC_FN if available, or by
4319 other means (whole-vector shifts or a scalar loop).
4320 The function also creates a new phi node at the loop exit to preserve
4321 loop-closed form, as illustrated below.
4322
4323 The flow at the entry to this function:
4324
4325 loop:
4326 vec_def = phi <null, null> # REDUCTION_PHI
4327 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4328 s_loop = scalar_stmt # (scalar) STMT_INFO
4329 loop_exit:
4330 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4331 use <s_out0>
4332 use <s_out0>
4333
4334 The above is transformed by this function into:
4335
4336 loop:
4337 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4338 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4339 s_loop = scalar_stmt # (scalar) STMT_INFO
4340 loop_exit:
4341 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4342 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4343 v_out2 = reduce <v_out1>
4344 s_out3 = extract_field <v_out2, 0>
4345 s_out4 = adjust_result <s_out3>
4346 use <s_out4>
4347 use <s_out4>
4348 */
4349
4350 static void
4351 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4352 stmt_vec_info stmt_info,
4353 gimple *reduc_def_stmt,
4354 int ncopies, internal_fn reduc_fn,
4355 vec<stmt_vec_info> reduction_phis,
4356 bool double_reduc,
4357 slp_tree slp_node,
4358 slp_instance slp_node_instance,
4359 tree induc_val, enum tree_code induc_code,
4360 tree neutral_op)
4361 {
4362 stmt_vec_info prev_phi_info;
4363 tree vectype;
4364 machine_mode mode;
4365 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4367 basic_block exit_bb;
4368 tree scalar_dest;
4369 tree scalar_type;
4370 gimple *new_phi = NULL, *phi;
4371 stmt_vec_info phi_info;
4372 gimple_stmt_iterator exit_gsi;
4373 tree vec_dest;
4374 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4375 gimple *epilog_stmt = NULL;
4376 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4377 gimple *exit_phi;
4378 tree bitsize;
4379 tree adjustment_def = NULL;
4380 tree vec_initial_def = NULL;
4381 tree expr, def, initial_def = NULL;
4382 tree orig_name, scalar_result;
4383 imm_use_iterator imm_iter, phi_imm_iter;
4384 use_operand_p use_p, phi_use_p;
4385 gimple *use_stmt;
4386 stmt_vec_info reduction_phi_info = NULL;
4387 bool nested_in_vect_loop = false;
4388 auto_vec<gimple *> new_phis;
4389 auto_vec<stmt_vec_info> inner_phis;
4390 int j, i;
4391 auto_vec<tree> scalar_results;
4392 unsigned int group_size = 1, k, ratio;
4393 auto_vec<tree> vec_initial_defs;
4394 auto_vec<gimple *> phis;
4395 bool slp_reduc = false;
4396 bool direct_slp_reduc;
4397 tree new_phi_result;
4398 stmt_vec_info inner_phi = NULL;
4399 tree induction_index = NULL_TREE;
4400
4401 if (slp_node)
4402 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4403
4404 if (nested_in_vect_loop_p (loop, stmt_info))
4405 {
4406 outer_loop = loop;
4407 loop = loop->inner;
4408 nested_in_vect_loop = true;
4409 gcc_assert (!slp_node);
4410 }
4411
4412 vectype = STMT_VINFO_VECTYPE (stmt_info);
4413 gcc_assert (vectype);
4414 mode = TYPE_MODE (vectype);
4415
4416 /* 1. Create the reduction def-use cycle:
4417 Set the arguments of REDUCTION_PHIS, i.e., transform
4418
4419 loop:
4420 vec_def = phi <null, null> # REDUCTION_PHI
4421 VECT_DEF = vector_stmt # vectorized form of STMT
4422 ...
4423
4424 into:
4425
4426 loop:
4427 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4428 VECT_DEF = vector_stmt # vectorized form of STMT
4429 ...
4430
4431 (in case of SLP, do it for all the phis). */
4432
4433 /* Get the loop-entry arguments. */
4434 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4435 if (slp_node)
4436 {
4437 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4438 vec_initial_defs.reserve (vec_num);
4439 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4440 &vec_initial_defs, vec_num,
4441 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4442 neutral_op);
4443 }
4444 else
4445 {
4446 /* Get at the scalar def before the loop, that defines the initial value
4447 of the reduction variable. */
4448 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4449 loop_preheader_edge (loop));
4450 /* Optimize: for REDUC_MAX, if initial_def is smaller than induc_val and
4451 we can't use zero for induc_val, use initial_def as induc_val instead.
4452 Similarly for REDUC_MIN when initial_def is larger than induc_val. */
4453 if (TREE_CODE (initial_def) == INTEGER_CST
4454 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4455 == INTEGER_INDUC_COND_REDUCTION)
4456 && !integer_zerop (induc_val)
4457 && ((induc_code == MAX_EXPR
4458 && tree_int_cst_lt (initial_def, induc_val))
4459 || (induc_code == MIN_EXPR
4460 && tree_int_cst_lt (induc_val, initial_def))))
4461 induc_val = initial_def;
4462
4463 if (double_reduc)
4464 /* In case of double reduction we only create a vector variable
4465 to be put in the reduction phi node. The actual statement
4466 creation is done later in this function. */
4467 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4468 else if (nested_in_vect_loop)
4469 {
4470 /* Do not use an adjustment def as that case is not supported
4471 correctly if ncopies is not one. */
4472 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4473 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4474 stmt_info);
4475 }
4476 else
4477 vec_initial_def
4478 = get_initial_def_for_reduction (stmt_info, initial_def,
4479 &adjustment_def);
4480 vec_initial_defs.create (1);
4481 vec_initial_defs.quick_push (vec_initial_def);
4482 }
4483
4484 /* Set phi nodes arguments. */
4485 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4486 {
4487 tree vec_init_def = vec_initial_defs[i];
4488 tree def = vect_defs[i];
4489 for (j = 0; j < ncopies; j++)
4490 {
4491 if (j != 0)
4492 {
4493 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4494 if (nested_in_vect_loop)
4495 vec_init_def
4496 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4497 }
4498
4499 /* Set the loop-entry arg of the reduction-phi. */
4500
4501 gphi *phi = as_a <gphi *> (phi_info->stmt);
4502 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4503 == INTEGER_INDUC_COND_REDUCTION)
4504 {
4505 /* Initialise the reduction phi to zero.  This prevents non-zero
4506 initial values interfering with the reduction op. */
4507 gcc_assert (ncopies == 1);
4508 gcc_assert (i == 0);
4509
4510 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4511 tree induc_val_vec
4512 = build_vector_from_val (vec_init_def_type, induc_val);
4513
4514 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4515 UNKNOWN_LOCATION);
4516 }
4517 else
4518 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4519 UNKNOWN_LOCATION);
4520
4521 /* Set the loop-latch arg for the reduction-phi. */
4522 if (j > 0)
4523 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4524
4525 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4526
4527 if (dump_enabled_p ())
4528 dump_printf_loc (MSG_NOTE, vect_location,
4529 "transform reduction: created def-use cycle: %G%G",
4530 phi, SSA_NAME_DEF_STMT (def));
4531 }
4532 }
4533
4534 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4535 which is updated with the current index of the loop for every match of
4536 the original loop's cond_expr (VEC_STMT).  This results in a vector
4537 containing, per vector lane, the index of the last time the condition passed.
4538 The first match will be a 1 to allow 0 to be used for non-matching
4539 indexes. If there are no matches at all then the vector will be all
4540 zeroes. */
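/* Illustrative example (made-up trip count): with a 4-element vector and two
   vector iterations the index IV takes the values {1,2,3,4} and then
   {5,6,7,8}.  If the condition only matched for scalar elements 1 and 6, the
   final vector is {0, 2, 7, 0}: lane 1 last matched at index 2, lane 2 at
   index 7, and lanes 0 and 3 never matched. */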
4541 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4542 {
4543 tree indx_before_incr, indx_after_incr;
4544 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4545
4546 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4547 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4548
4549 int scalar_precision
4550 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4551 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4552 tree cr_index_vector_type = build_vector_type
4553 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4554
4555 /* First we create a simple vector induction variable which starts
4556 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4557 vector size (STEP). */
4558
4559 /* Create a {1,2,3,...} vector. */
4560 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4561
4562 /* Create a vector of the step value. */
4563 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4564 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4565
4566 /* Create an induction variable. */
4567 gimple_stmt_iterator incr_gsi;
4568 bool insert_after;
4569 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4570 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4571 insert_after, &indx_before_incr, &indx_after_incr);
4572
4573 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4574 filled with zeros (VEC_ZERO). */
4575
4576 /* Create a vector of 0s. */
4577 tree zero = build_zero_cst (cr_index_scalar_type);
4578 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4579
4580 /* Create a vector phi node. */
4581 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4582 new_phi = create_phi_node (new_phi_tree, loop->header);
4583 loop_vinfo->add_stmt (new_phi);
4584 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4585 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4586
4587 /* Now take the condition from the loop's original cond_expr
4588 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4589 every match uses values from the induction variable
4590 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
4591 (NEW_PHI_TREE).
4592 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4593 the new cond_expr (INDEX_COND_EXPR). */
4594
4595 /* Duplicate the condition from vec_stmt. */
4596 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4597
4598 /* Create a conditional where the condition is taken from vec_stmt
4599 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4600 and the "else" value is the phi (NEW_PHI_TREE). */
4601 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4602 ccompare, indx_before_incr,
4603 new_phi_tree);
4604 induction_index = make_ssa_name (cr_index_vector_type);
4605 gimple *index_condition = gimple_build_assign (induction_index,
4606 index_cond_expr);
4607 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4608 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4609 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4610
4611 /* Update the phi with the vec cond. */
4612 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4613 loop_latch_edge (loop), UNKNOWN_LOCATION);
4614 }
4615
4616 /* 2. Create epilog code.
4617 The reduction epilog code operates across the elements of the vector
4618 of partial results computed by the vectorized loop.
4619 The reduction epilog code consists of:
4620
4621 step 1: compute the scalar result in a vector (v_out2)
4622 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4623 step 3: adjust the scalar result (s_out3) if needed.
4624
4625 Step 1 can be accomplished using one of the following three schemes:
4626 (scheme 1) using reduc_fn, if available.
4627 (scheme 2) using whole-vector shifts, if available.
4628 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4629 combined.
4630
4631 The overall epilog code looks like this:
4632
4633 s_out0 = phi <s_loop> # original EXIT_PHI
4634 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4635 v_out2 = reduce <v_out1> # step 1
4636 s_out3 = extract_field <v_out2, 0> # step 2
4637 s_out4 = adjust_result <s_out3> # step 3
4638
4639 (step 3 is optional, and steps 1 and 2 may be combined).
4640 Lastly, the uses of s_out0 are replaced by s_out4. */
4641
4642
4643 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4644 v_out1 = phi <VECT_DEF>
4645 Store them in NEW_PHIS. */
4646
4647 exit_bb = single_exit (loop)->dest;
4648 prev_phi_info = NULL;
4649 new_phis.create (vect_defs.length ());
4650 FOR_EACH_VEC_ELT (vect_defs, i, def)
4651 {
4652 for (j = 0; j < ncopies; j++)
4653 {
4654 tree new_def = copy_ssa_name (def);
4655 phi = create_phi_node (new_def, exit_bb);
4656 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4657 if (j == 0)
4658 new_phis.quick_push (phi);
4659 else
4660 {
4661 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4662 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4663 }
4664
4665 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4666 prev_phi_info = phi_info;
4667 }
4668 }
4669
4670 /* The epilogue is created for the outer-loop, i.e., for the loop being
4671 vectorized. Create exit phis for the outer loop. */
4672 if (double_reduc)
4673 {
4674 loop = outer_loop;
4675 exit_bb = single_exit (loop)->dest;
4676 inner_phis.create (vect_defs.length ());
4677 FOR_EACH_VEC_ELT (new_phis, i, phi)
4678 {
4679 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4680 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4681 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4682 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4683 PHI_RESULT (phi));
4684 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4685 inner_phis.quick_push (phi_info);
4686 new_phis[i] = outer_phi;
4687 while (STMT_VINFO_RELATED_STMT (phi_info))
4688 {
4689 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4690 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4691 outer_phi = create_phi_node (new_result, exit_bb);
4692 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4693 PHI_RESULT (phi_info->stmt));
4694 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4695 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4696 prev_phi_info = outer_phi_info;
4697 }
4698 }
4699 }
4700
4701 exit_gsi = gsi_after_labels (exit_bb);
4702
4703 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4704 (i.e. when reduc_fn is not available) and in the final adjustment
4705 code (if needed). Also get the original scalar reduction variable as
4706 defined in the loop.  In case STMT is a "pattern-stmt" (i.e., it
4707 represents a reduction pattern), the tree-code and scalar-def are
4708 taken from the original stmt that the pattern-stmt (STMT) replaces.
4709 Otherwise (it is a regular reduction) the tree-code and scalar-def
4710 are taken from STMT. */
4711
4712 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4713 if (orig_stmt_info != stmt_info)
4714 {
4715 /* Reduction pattern */
4716 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4717 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4718 }
4719
4720 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4721 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4722 partial results are added and not subtracted. */
4723 if (code == MINUS_EXPR)
4724 code = PLUS_EXPR;
4725
4726 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4727 scalar_type = TREE_TYPE (scalar_dest);
4728 scalar_results.create (group_size);
4729 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4730 bitsize = TYPE_SIZE (scalar_type);
4731
4732 /* In case this is a reduction in an inner-loop while vectorizing an outer
4733 loop - we don't need to extract a single scalar result at the end of the
4734 inner-loop (unless it is double reduction, i.e., the use of reduction is
4735 outside the outer-loop). The final vector of partial results will be used
4736 in the vectorized outer-loop, or reduced to a scalar result at the end of
4737 the outer-loop. */
4738 if (nested_in_vect_loop && !double_reduc)
4739 goto vect_finalize_reduction;
4740
4741 /* SLP reduction without reduction chain, e.g.,
4742 # a1 = phi <a2, a0>
4743 # b1 = phi <b2, b0>
4744 a2 = operation (a1)
4745 b2 = operation (b1) */
4746 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4747
4748 /* True if we should implement SLP_REDUC using native reduction operations
4749 instead of scalar operations. */
4750 direct_slp_reduc = (reduc_fn != IFN_LAST
4751 && slp_reduc
4752 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4753
4754 /* In case of reduction chain, e.g.,
4755 # a1 = phi <a3, a0>
4756 a2 = operation (a1)
4757 a3 = operation (a2),
4758
4759 we may end up with more than one vector result. Here we reduce them to
4760 one vector. */
4761 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4762 {
4763 tree first_vect = PHI_RESULT (new_phis[0]);
4764 gassign *new_vec_stmt = NULL;
4765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4766 for (k = 1; k < new_phis.length (); k++)
4767 {
4768 gimple *next_phi = new_phis[k];
4769 tree second_vect = PHI_RESULT (next_phi);
4770 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4771 new_vec_stmt = gimple_build_assign (tem, code,
4772 first_vect, second_vect);
4773 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4774 first_vect = tem;
4775 }
4776
4777 new_phi_result = first_vect;
4778 if (new_vec_stmt)
4779 {
4780 new_phis.truncate (0);
4781 new_phis.safe_push (new_vec_stmt);
4782 }
4783 }
4784 /* Likewise if we couldn't use a single def-use cycle. */
4785 else if (ncopies > 1)
4786 {
4787 gcc_assert (new_phis.length () == 1);
4788 tree first_vect = PHI_RESULT (new_phis[0]);
4789 gassign *new_vec_stmt = NULL;
4790 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4791 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4792 for (int k = 1; k < ncopies; ++k)
4793 {
4794 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4795 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4796 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4797 new_vec_stmt = gimple_build_assign (tem, code,
4798 first_vect, second_vect);
4799 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4800 first_vect = tem;
4801 }
4802 new_phi_result = first_vect;
4803 new_phis.truncate (0);
4804 new_phis.safe_push (new_vec_stmt);
4805 }
4806 else
4807 new_phi_result = PHI_RESULT (new_phis[0]);
4808
4809 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4810 && reduc_fn != IFN_LAST)
4811 {
4812 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4813 various data values where the condition matched and another vector
4814 (INDUCTION_INDEX) containing all the indexes of those matches. We
4815 need to extract the last matching index (which will be the index with
4816 highest value) and use this to index into the data vector.
4817 For the case where there were no matches, the data vector will contain
4818 all default values and the index vector will be all zeros. */
4819
4820 /* Get various versions of the type of the vector of indexes. */
4821 tree index_vec_type = TREE_TYPE (induction_index);
4822 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4823 tree index_scalar_type = TREE_TYPE (index_vec_type);
4824 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4825 (index_vec_type);
4826
4827 /* Get an unsigned integer version of the type of the data vector. */
4828 int scalar_precision
4829 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4830 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4831 tree vectype_unsigned = build_vector_type
4832 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4833
4834 /* First we need to create a vector (ZERO_VEC) of zeros and another
4835 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4836 can create using a MAX reduction and then expanding.
4837 In the case where the loop never made any matches, the max index will
4838 be zero. */
4839
4840 /* Vector of {0, 0, 0,...}. */
4841 tree zero_vec = make_ssa_name (vectype);
4842 tree zero_vec_rhs = build_zero_cst (vectype);
4843 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4844 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4845
4846 /* Find maximum value from the vector of found indexes. */
4847 tree max_index = make_ssa_name (index_scalar_type);
4848 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4849 1, induction_index);
4850 gimple_call_set_lhs (max_index_stmt, max_index);
4851 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4852
4853 /* Vector of {max_index, max_index, max_index,...}. */
4854 tree max_index_vec = make_ssa_name (index_vec_type);
4855 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4856 max_index);
4857 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4858 max_index_vec_rhs);
4859 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4860
4861 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4862 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4863 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4864 otherwise. Only one value should match, resulting in a vector
4865 (VEC_COND) with one data value and the rest zeros.
4866 In the case where the loop never made any matches, every index will
4867 match, resulting in a vector with all data values (which will all be
4868 the default value). */
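/* Continuing the illustrative example above: INDUCTION_INDEX is {0, 2, 7, 0},
   so MAX_INDEX is 7 and MAX_INDEX_VEC is {7, 7, 7, 7}; the comparison below
   therefore selects only lane 2 of NEW_PHI_RESULT, and the final unsigned MAX
   reduction extracts that lane's data value. */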
4869
4870 /* Compare the max index vector to the vector of found indexes to find
4871 the position of the max value. */
4872 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4873 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4874 induction_index,
4875 max_index_vec);
4876 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4877
4878 /* Use the compare to choose either values from the data vector or
4879 zero. */
4880 tree vec_cond = make_ssa_name (vectype);
4881 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4882 vec_compare, new_phi_result,
4883 zero_vec);
4884 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4885
4886 /* Finally we need to extract the data value from the vector (VEC_COND)
4887 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4888 reduction, but because this doesn't exist, we can use a MAX reduction
4889 instead. The data value might be signed or a float so we need to cast
4890 it first.
4891 In the case where the loop never made any matches, the data values are
4892 all identical, and so will reduce down correctly. */
4893
4894 /* Make the matched data values unsigned. */
4895 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4896 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4897 vec_cond);
4898 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4899 VIEW_CONVERT_EXPR,
4900 vec_cond_cast_rhs);
4901 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4902
4903 /* Reduce down to a scalar value. */
4904 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4905 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4906 1, vec_cond_cast);
4907 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4908 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4909
4910 /* Convert the reduced value back to the result type and set as the
4911 result. */
4912 gimple_seq stmts = NULL;
4913 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4914 data_reduc);
4915 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4916 scalar_results.safe_push (new_temp);
4917 }
4918 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4919 && reduc_fn == IFN_LAST)
4920 {
4921 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4922 idx_val = induction_index[0];
4923 val = data_reduc[0];
4924 for (i = 1; i < nelts; ++i)
4926 if (induction_index[i] > idx_val)
4927 val = data_reduc[i], idx_val = induction_index[i];
4928 return val; */
4929
4930 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4931 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4932 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4933 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4934 /* Enforced by vectorizable_reduction, which ensures we have target
4935 support before allowing a conditional reduction on variable-length
4936 vectors. */
4937 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4938 tree idx_val = NULL_TREE, val = NULL_TREE;
4939 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4940 {
4941 tree old_idx_val = idx_val;
4942 tree old_val = val;
4943 idx_val = make_ssa_name (idx_eltype);
4944 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4945 build3 (BIT_FIELD_REF, idx_eltype,
4946 induction_index,
4947 bitsize_int (el_size),
4948 bitsize_int (off)));
4949 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4950 val = make_ssa_name (data_eltype);
4951 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4952 build3 (BIT_FIELD_REF,
4953 data_eltype,
4954 new_phi_result,
4955 bitsize_int (el_size),
4956 bitsize_int (off)));
4957 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4958 if (off != 0)
4959 {
4960 tree new_idx_val = idx_val;
4961 tree new_val = val;
4962 if (off != v_size - el_size)
4963 {
4964 new_idx_val = make_ssa_name (idx_eltype);
4965 epilog_stmt = gimple_build_assign (new_idx_val,
4966 MAX_EXPR, idx_val,
4967 old_idx_val);
4968 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4969 }
4970 new_val = make_ssa_name (data_eltype);
4971 epilog_stmt = gimple_build_assign (new_val,
4972 COND_EXPR,
4973 build2 (GT_EXPR,
4974 boolean_type_node,
4975 idx_val,
4976 old_idx_val),
4977 val, old_val);
4978 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4979 idx_val = new_idx_val;
4980 val = new_val;
4981 }
4982 }
4983 /* Convert the reduced value back to the result type and set as the
4984 result. */
4985 gimple_seq stmts = NULL;
4986 val = gimple_convert (&stmts, scalar_type, val);
4987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4988 scalar_results.safe_push (val);
4989 }
4990
4991 /* 2.3 Create the reduction code, using one of the three schemes described
4992 above. In SLP we simply need to extract all the elements from the
4993 vector (without reducing them), so we use scalar shifts. */
4994 else if (reduc_fn != IFN_LAST && !slp_reduc)
4995 {
4996 tree tmp;
4997 tree vec_elem_type;
4998
4999 /* Case 1: Create:
5000 v_out2 = reduc_expr <v_out1> */
5001
5002 if (dump_enabled_p ())
5003 dump_printf_loc (MSG_NOTE, vect_location,
5004 "Reduce using direct vector reduction.\n");
5005
5006 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5007 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5008 {
5009 tree tmp_dest
5010 = vect_create_destination_var (scalar_dest, vec_elem_type);
5011 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5012 new_phi_result);
5013 gimple_set_lhs (epilog_stmt, tmp_dest);
5014 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5015 gimple_set_lhs (epilog_stmt, new_temp);
5016 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5017
5018 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5019 new_temp);
5020 }
5021 else
5022 {
5023 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5024 new_phi_result);
5025 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5026 }
5027
5028 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5029 gimple_set_lhs (epilog_stmt, new_temp);
5030 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5031
5032 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5033 == INTEGER_INDUC_COND_REDUCTION)
5034 && !operand_equal_p (initial_def, induc_val, 0))
5035 {
5036 /* Earlier we set the initial value to be a vector of induc_val
5037 values.  Check the result and if it is induc_val then replace it
5038 with the original initial value, unless induc_val is
5039 the same as initial_def already. */
5040 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5041 induc_val);
5042
5043 tmp = make_ssa_name (new_scalar_dest);
5044 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5045 initial_def, new_temp);
5046 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5047 new_temp = tmp;
5048 }
5049
5050 scalar_results.safe_push (new_temp);
5051 }
5052 else if (direct_slp_reduc)
5053 {
5054 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5055 with the elements for other SLP statements replaced with the
5056 neutral value. We can then do a normal reduction on each vector. */
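      /* An illustrative sketch, not literal output of this code: with
         REDUC_GROUP_SIZE == 2, a PLUS reduction and an interleaved
         accumulator vector { a0, b0, a1, b1 }, the masked copies built
         below would be
           { a0, 0, a1, 0 }  and  { 0, b0, 0, b1 }
         (0 being the neutral value for PLUS), and reducing each copy
         yields the scalar results a0 + a1 and b0 + b1.  */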
5057
5058 /* Enforced by vectorizable_reduction. */
5059 gcc_assert (new_phis.length () == 1);
5060 gcc_assert (pow2p_hwi (group_size));
5061
5062 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5063 vec<stmt_vec_info> orig_phis
5064 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5065 gimple_seq seq = NULL;
5066
5067 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5068 and the same element size as VECTYPE. */
5069 tree index = build_index_vector (vectype, 0, 1);
5070 tree index_type = TREE_TYPE (index);
5071 tree index_elt_type = TREE_TYPE (index_type);
5072 tree mask_type = build_same_sized_truth_vector_type (index_type);
5073
5074 /* Create a vector that, for each element, identifies which of
5075 the REDUC_GROUP_SIZE results should use it. */
5076 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5077 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5078 build_vector_from_val (index_type, index_mask));
5079
5080 /* Get a neutral vector value. This is simply a splat of the neutral
5081 scalar value if we have one, otherwise the initial scalar value
5082 is itself a neutral value. */
5083 tree vector_identity = NULL_TREE;
5084 if (neutral_op)
5085 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5086 neutral_op);
5087 for (unsigned int i = 0; i < group_size; ++i)
5088 {
5089 /* If there's no universal neutral value, we can use the
5090 initial scalar value from the original PHI.  This is used
5091 for MIN and MAX reductions, for example.  */
5092 if (!neutral_op)
5093 {
5094 tree scalar_value
5095 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5096 loop_preheader_edge (loop));
5097 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5098 scalar_value);
5099 }
5100
5101 /* Calculate the equivalent of:
5102
5103 sel[j] = (index[j] == i);
5104
5105 which selects the elements of NEW_PHI_RESULT that should
5106 be included in the result. */
5107 tree compare_val = build_int_cst (index_elt_type, i);
5108 compare_val = build_vector_from_val (index_type, compare_val);
5109 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5110 index, compare_val);
5111
5112 /* Calculate the equivalent of:
5113
5114 vec = sel ? new_phi_result : vector_identity;
5115
5116 VEC is now suitable for a full vector reduction. */
5117 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5118 sel, new_phi_result, vector_identity);
5119
5120 /* Do the reduction and convert it to the appropriate type. */
5121 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5122 TREE_TYPE (vectype), vec);
5123 scalar = gimple_convert (&seq, scalar_type, scalar);
5124 scalar_results.safe_push (scalar);
5125 }
5126 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5127 }
5128 else
5129 {
5130 bool reduce_with_shift;
5131 tree vec_temp;
5132
5133 /* COND reductions all do the final reduction with MAX_EXPR
5134 or MIN_EXPR. */
5135 if (code == COND_EXPR)
5136 {
5137 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5138 == INTEGER_INDUC_COND_REDUCTION)
5139 code = induc_code;
5140 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5141 == CONST_COND_REDUCTION)
5142 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5143 else
5144 code = MAX_EXPR;
5145 }
5146
5147 /* See if the target wants to do the final (shift) reduction
5148 in a vector mode of smaller size and first reduce upper/lower
5149 halves against each other. */
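      /* Purely for illustration: if, say, the accumulator mode is 64 bytes
         and the target's split_reduction hook returns a 32-byte mode, the
         loop below first extracts the low and high 32-byte halves and
         combines them with CODE, and only the resulting narrower vector is
         fed to the final shift or scalar reduction.  */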
5150 enum machine_mode mode1 = mode;
5151 tree vectype1 = vectype;
5152 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5153 unsigned sz1 = sz;
5154 if (!slp_reduc
5155 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5156 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5157
5158 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5159 reduce_with_shift = have_whole_vector_shift (mode1);
5160 if (!VECTOR_MODE_P (mode1))
5161 reduce_with_shift = false;
5162 else
5163 {
5164 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5165 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5166 reduce_with_shift = false;
5167 }
5168
5169 /* First reduce the vector to the desired vector size on which we
5170 should do the shift reduction, by combining upper and lower halves.  */
5171 new_temp = new_phi_result;
5172 while (sz > sz1)
5173 {
5174 gcc_assert (!slp_reduc);
5175 sz /= 2;
5176 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5177
5178 /* The target has to make sure we support lowpart/highpart
5179 extraction, either via direct vector extract or through
5180 integer mode punning.  */
5181 tree dst1, dst2;
5182 if (convert_optab_handler (vec_extract_optab,
5183 TYPE_MODE (TREE_TYPE (new_temp)),
5184 TYPE_MODE (vectype1))
5185 != CODE_FOR_nothing)
5186 {
5187 /* Extract sub-vectors directly once vec_extract becomes
5188 a conversion optab. */
5189 dst1 = make_ssa_name (vectype1);
5190 epilog_stmt
5191 = gimple_build_assign (dst1, BIT_FIELD_REF,
5192 build3 (BIT_FIELD_REF, vectype1,
5193 new_temp, TYPE_SIZE (vectype1),
5194 bitsize_int (0)));
5195 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5196 dst2 = make_ssa_name (vectype1);
5197 epilog_stmt
5198 = gimple_build_assign (dst2, BIT_FIELD_REF,
5199 build3 (BIT_FIELD_REF, vectype1,
5200 new_temp, TYPE_SIZE (vectype1),
5201 bitsize_int (sz * BITS_PER_UNIT)));
5202 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5203 }
5204 else
5205 {
5206 /* Extract via punning to an appropriately sized integer mode
5207 vector.  */
5208 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5209 1);
5210 tree etype = build_vector_type (eltype, 2);
5211 gcc_assert (convert_optab_handler (vec_extract_optab,
5212 TYPE_MODE (etype),
5213 TYPE_MODE (eltype))
5214 != CODE_FOR_nothing);
5215 tree tem = make_ssa_name (etype);
5216 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5217 build1 (VIEW_CONVERT_EXPR,
5218 etype, new_temp));
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 new_temp = tem;
5221 tem = make_ssa_name (eltype);
5222 epilog_stmt
5223 = gimple_build_assign (tem, BIT_FIELD_REF,
5224 build3 (BIT_FIELD_REF, eltype,
5225 new_temp, TYPE_SIZE (eltype),
5226 bitsize_int (0)));
5227 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5228 dst1 = make_ssa_name (vectype1);
5229 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5230 build1 (VIEW_CONVERT_EXPR,
5231 vectype1, tem));
5232 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5233 tem = make_ssa_name (eltype);
5234 epilog_stmt
5235 = gimple_build_assign (tem, BIT_FIELD_REF,
5236 build3 (BIT_FIELD_REF, eltype,
5237 new_temp, TYPE_SIZE (eltype),
5238 bitsize_int (sz * BITS_PER_UNIT)));
5239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5240 dst2 = make_ssa_name (vectype1);
5241 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5242 build1 (VIEW_CONVERT_EXPR,
5243 vectype1, tem));
5244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5245 }
5246
5247 new_temp = make_ssa_name (vectype1);
5248 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5249 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5250 }
5251
5252 if (reduce_with_shift && !slp_reduc)
5253 {
5254 int element_bitsize = tree_to_uhwi (bitsize);
5255 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5256 for variable-length vectors and also requires direct target support
5257 for loop reductions. */
5258 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5259 int nelements = vec_size_in_bits / element_bitsize;
5260 vec_perm_builder sel;
5261 vec_perm_indices indices;
5262
5263 int elt_offset;
5264
5265 tree zero_vec = build_zero_cst (vectype1);
5266 /* Case 2: Create:
5267 for (offset = nelements/2; offset >= 1; offset/=2)
5268 {
5269 Create: va' = vec_shift <va, offset>
5270 Create: va = vop <va, va'>
5271 } */
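	  /* A hypothetical walk-through, assuming a four-element accumulator
	     { a, b, c, d } and a PLUS reduction (not emitted literally):
	     the first iteration shifts in zeros to give { c, d, 0, 0 } and
	     adds, so element 0 holds a+c and element 1 holds b+d; the second
	     iteration shifts by one element and adds, leaving a+b+c+d in
	     element 0, which is extracted in step 2.4 below.  */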
5272
5273 tree rhs;
5274
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_NOTE, vect_location,
5277 "Reduce using vector shifts\n");
5278
5279 mode1 = TYPE_MODE (vectype1);
5280 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5281 for (elt_offset = nelements / 2;
5282 elt_offset >= 1;
5283 elt_offset /= 2)
5284 {
5285 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5286 indices.new_vector (sel, 2, nelements);
5287 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5288 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5289 new_temp, zero_vec, mask);
5290 new_name = make_ssa_name (vec_dest, epilog_stmt);
5291 gimple_assign_set_lhs (epilog_stmt, new_name);
5292 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293
5294 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5295 new_temp);
5296 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5297 gimple_assign_set_lhs (epilog_stmt, new_temp);
5298 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5299 }
5300
5301 /* 2.4 Extract the final scalar result. Create:
5302 s_out3 = extract_field <v_out2, bitpos> */
5303
5304 if (dump_enabled_p ())
5305 dump_printf_loc (MSG_NOTE, vect_location,
5306 "extract scalar result\n");
5307
5308 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5309 bitsize, bitsize_zero_node);
5310 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5311 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5312 gimple_assign_set_lhs (epilog_stmt, new_temp);
5313 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5314 scalar_results.safe_push (new_temp);
5315 }
5316 else
5317 {
5318 /* Case 3: Create:
5319 s = extract_field <v_out2, 0>
5320 for (offset = element_size;
5321 offset < vector_size;
5322 offset += element_size)
5323 {
5324 Create: s' = extract_field <v_out2, offset>
5325 Create: s = op <s, s'> // For non SLP cases
5326 } */
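	  /* Roughly, for a four-element vector v and a PLUS reduction in the
	     non-SLP case this amounts to the scalar sequence
	       s = v[0]; s = s + v[1]; s = s + v[2]; s = s + v[3];
	     whereas in the SLP case each extracted element is pushed to
	     SCALAR_RESULTS without being combined.  */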
5327
5328 if (dump_enabled_p ())
5329 dump_printf_loc (MSG_NOTE, vect_location,
5330 "Reduce using scalar code.\n");
5331
5332 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5333 int element_bitsize = tree_to_uhwi (bitsize);
5334 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5335 {
5336 int bit_offset;
5337 if (gimple_code (new_phi) == GIMPLE_PHI)
5338 vec_temp = PHI_RESULT (new_phi);
5339 else
5340 vec_temp = gimple_assign_lhs (new_phi);
5341 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5342 bitsize_zero_node);
5343 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5344 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5345 gimple_assign_set_lhs (epilog_stmt, new_temp);
5346 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347
5348 /* In SLP we don't need to apply the reduction operation, so we just
5349 collect the s' values in SCALAR_RESULTS.  */
5350 if (slp_reduc)
5351 scalar_results.safe_push (new_temp);
5352
5353 for (bit_offset = element_bitsize;
5354 bit_offset < vec_size_in_bits;
5355 bit_offset += element_bitsize)
5356 {
5357 tree bitpos = bitsize_int (bit_offset);
5358 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5359 bitsize, bitpos);
5360
5361 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5362 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5363 gimple_assign_set_lhs (epilog_stmt, new_name);
5364 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365
5366 if (slp_reduc)
5367 {
5368 /* In SLP we don't need to apply the reduction operation, so
5369 we just collect the s' values in SCALAR_RESULTS.  */
5370 new_temp = new_name;
5371 scalar_results.safe_push (new_name);
5372 }
5373 else
5374 {
5375 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5376 new_name, new_temp);
5377 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5378 gimple_assign_set_lhs (epilog_stmt, new_temp);
5379 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5380 }
5381 }
5382 }
5383
5384 /* The only case in which we need to reduce scalar results in SLP is
5385 unrolling.  If the size of SCALAR_RESULTS is greater than
5386 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5387 REDUC_GROUP_SIZE.  */
5388 if (slp_reduc)
5389 {
5390 tree res, first_res, new_res;
5391 gimple *new_stmt;
5392
5393 /* Reduce multiple scalar results in case of SLP unrolling. */
5394 for (j = group_size; scalar_results.iterate (j, &res);
5395 j++)
5396 {
5397 first_res = scalar_results[j % group_size];
5398 new_stmt = gimple_build_assign (new_scalar_dest, code,
5399 first_res, res);
5400 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5401 gimple_assign_set_lhs (new_stmt, new_res);
5402 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5403 scalar_results[j % group_size] = new_res;
5404 }
5405 }
5406 else
5407 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5408 scalar_results.safe_push (new_temp);
5409 }
5410
5411 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5412 == INTEGER_INDUC_COND_REDUCTION)
5413 && !operand_equal_p (initial_def, induc_val, 0))
5414 {
5415 /* Earlier we set the initial value to be a vector of induc_val
5416 values.  Check the result and if it is induc_val then replace
5417 it with the original initial value, unless induc_val is
5418 the same as initial_def already.  */
5419 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5420 induc_val);
5421
5422 tree tmp = make_ssa_name (new_scalar_dest);
5423 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5424 initial_def, new_temp);
5425 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5426 scalar_results[0] = tmp;
5427 }
5428 }
5429
5430 vect_finalize_reduction:
5431
5432 if (double_reduc)
5433 loop = loop->inner;
5434
5435 /* 2.5 Adjust the final result by the initial value of the reduction
5436 variable. (When such adjustment is not needed, then
5437 'adjustment_def' is zero). For example, if code is PLUS we create:
5438 new_temp = loop_exit_def + adjustment_def */
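  /* A schematic example, under the assumption that ADJUSTMENT_DEF was
     produced by get_initial_def_for_reduction: for
       s = 10;  for (...)  s += a[i];
     the vector accumulator is seeded with the neutral value { 0, ..., 0 }
     and the original initial value 10 is carried in ADJUSTMENT_DEF, so it
     is added back to the reduced result here.  */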
5439
5440 if (adjustment_def)
5441 {
5442 gcc_assert (!slp_reduc);
5443 if (nested_in_vect_loop)
5444 {
5445 new_phi = new_phis[0];
5446 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5447 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5448 new_dest = vect_create_destination_var (scalar_dest, vectype);
5449 }
5450 else
5451 {
5452 new_temp = scalar_results[0];
5453 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5454 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5455 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5456 }
5457
5458 epilog_stmt = gimple_build_assign (new_dest, expr);
5459 new_temp = make_ssa_name (new_dest, epilog_stmt);
5460 gimple_assign_set_lhs (epilog_stmt, new_temp);
5461 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5462 if (nested_in_vect_loop)
5463 {
5464 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5465 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5466 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5467
5468 if (!double_reduc)
5469 scalar_results.quick_push (new_temp);
5470 else
5471 scalar_results[0] = new_temp;
5472 }
5473 else
5474 scalar_results[0] = new_temp;
5475
5476 new_phis[0] = epilog_stmt;
5477 }
5478
5479 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5480 phis with new adjusted scalar results, i.e., replace use <s_out0>
5481 with use <s_out4>.
5482
5483 Transform:
5484 loop_exit:
5485 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5486 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5487 v_out2 = reduce <v_out1>
5488 s_out3 = extract_field <v_out2, 0>
5489 s_out4 = adjust_result <s_out3>
5490 use <s_out0>
5491 use <s_out0>
5492
5493 into:
5494
5495 loop_exit:
5496 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5497 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5498 v_out2 = reduce <v_out1>
5499 s_out3 = extract_field <v_out2, 0>
5500 s_out4 = adjust_result <s_out3>
5501 use <s_out4>
5502 use <s_out4> */
5503
5504
5505 /* In an SLP reduction chain we reduce the vector results into one vector
5506 if necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is
5507 the LHS of the last stmt in the reduction chain, since we are looking
5508 for the loop exit phi node.  */
5509 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5510 {
5511 stmt_vec_info dest_stmt_info
5512 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5513 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5514 group_size = 1;
5515 }
5516
5517 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5518 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5519 Therefore, we need to match SCALAR_RESULTS with the corresponding
5520 statements.  The first (REDUC_GROUP_SIZE / number of new vector stmts)
5521 scalar results correspond to the first vector stmt, etc.
5522 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
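  /* E.g. with REDUC_GROUP_SIZE == 4 and two new vector stmts, RATIO is 2:
     scalar_results[0] and [1] correspond to new_phis[0], while
     scalar_results[2] and [3] correspond to new_phis[1].  */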
5523 if (group_size > new_phis.length ())
5524 {
5525 ratio = group_size / new_phis.length ();
5526 gcc_assert (!(group_size % new_phis.length ()));
5527 }
5528 else
5529 ratio = 1;
5530
5531 stmt_vec_info epilog_stmt_info = NULL;
5532 for (k = 0; k < group_size; k++)
5533 {
5534 if (k % ratio == 0)
5535 {
5536 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5537 reduction_phi_info = reduction_phis[k / ratio];
5538 if (double_reduc)
5539 inner_phi = inner_phis[k / ratio];
5540 }
5541
5542 if (slp_reduc)
5543 {
5544 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5545
5546 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5547 /* SLP statements can't participate in patterns. */
5548 gcc_assert (!orig_stmt_info);
5549 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5550 }
5551
5552 phis.create (3);
5553 /* Find the loop-closed-use at the loop exit of the original scalar
5554 result. (The reduction result is expected to have two immediate uses -
5555 one at the latch block, and one at the loop exit). */
5556 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5557 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5558 && !is_gimple_debug (USE_STMT (use_p)))
5559 phis.safe_push (USE_STMT (use_p));
5560
5561 /* While we expect to have found an exit_phi because of loop-closed-ssa
5562 form, we can end up without one if the scalar cycle is dead.  */
5563
5564 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5565 {
5566 if (outer_loop)
5567 {
5568 stmt_vec_info exit_phi_vinfo
5569 = loop_vinfo->lookup_stmt (exit_phi);
5570 gphi *vect_phi;
5571
5572 /* FORNOW.  Currently we do not support the case in which an inner-loop
5573 reduction is not used in the outer-loop (but only outside the
5574 outer-loop), unless it is a double reduction.  */
5575 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5576 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5577 || double_reduc);
5578
5579 if (double_reduc)
5580 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5581 else
5582 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5583 if (!double_reduc
5584 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5585 != vect_double_reduction_def)
5586 continue;
5587
5588 /* Handle double reduction:
5589
5590 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5591 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5592 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5593 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5594
5595 At that point the regular reduction (stmt2 and stmt3) is
5596 already vectorized, as well as the exit phi node, stmt4.
5597 Here we vectorize the phi node of double reduction, stmt1, and
5598 update all relevant statements. */
5599
5600 /* Go through all the uses of s2 to find double reduction phi
5601 node, i.e., stmt1 above. */
5602 orig_name = PHI_RESULT (exit_phi);
5603 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5604 {
5605 stmt_vec_info use_stmt_vinfo;
5606 tree vect_phi_init, preheader_arg, vect_phi_res;
5607 basic_block bb = gimple_bb (use_stmt);
5608
5609 /* Check that USE_STMT is really a double reduction phi
5610 node.  */
5611 if (gimple_code (use_stmt) != GIMPLE_PHI
5612 || gimple_phi_num_args (use_stmt) != 2
5613 || bb->loop_father != outer_loop)
5614 continue;
5615 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5616 if (!use_stmt_vinfo
5617 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5618 != vect_double_reduction_def)
5619 continue;
5620
5621 /* Create vector phi node for double reduction:
5622 vs1 = phi <vs0, vs2>
5623 vs1 was created previously in this function by a call to
5624 vect_get_vec_def_for_operand and is stored in
5625 vec_initial_def;
5626 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5627 vs0 is created here. */
5628
5629 /* Create vector phi node. */
5630 vect_phi = create_phi_node (vec_initial_def, bb);
5631 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5632
5633 /* Create vs0 - initial def of the double reduction phi. */
5634 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5635 loop_preheader_edge (outer_loop));
5636 vect_phi_init = get_initial_def_for_reduction
5637 (stmt_info, preheader_arg, NULL);
5638
5639 /* Update phi node arguments with vs0 and vs2. */
5640 add_phi_arg (vect_phi, vect_phi_init,
5641 loop_preheader_edge (outer_loop),
5642 UNKNOWN_LOCATION);
5643 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5644 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5645 if (dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "created double reduction phi node: %G",
5648 vect_phi);
5649
5650 vect_phi_res = PHI_RESULT (vect_phi);
5651
5652 /* Replace the use, i.e., set the correct vs1 in the regular
5653 reduction phi node. FORNOW, NCOPIES is always 1, so the
5654 loop is redundant. */
5655 stmt_vec_info use_info = reduction_phi_info;
5656 for (j = 0; j < ncopies; j++)
5657 {
5658 edge pr_edge = loop_preheader_edge (loop);
5659 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5660 pr_edge->dest_idx, vect_phi_res);
5661 use_info = STMT_VINFO_RELATED_STMT (use_info);
5662 }
5663 }
5664 }
5665 }
5666
5667 phis.release ();
5668 if (nested_in_vect_loop)
5669 {
5670 if (double_reduc)
5671 loop = outer_loop;
5672 else
5673 continue;
5674 }
5675
5676 phis.create (3);
5677 /* Find the loop-closed-use at the loop exit of the original scalar
5678 result. (The reduction result is expected to have two immediate uses,
5679 one at the latch block, and one at the loop exit). For double
5680 reductions we are looking for exit phis of the outer loop. */
5681 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5682 {
5683 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5684 {
5685 if (!is_gimple_debug (USE_STMT (use_p)))
5686 phis.safe_push (USE_STMT (use_p));
5687 }
5688 else
5689 {
5690 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5691 {
5692 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5693
5694 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5695 {
5696 if (!flow_bb_inside_loop_p (loop,
5697 gimple_bb (USE_STMT (phi_use_p)))
5698 && !is_gimple_debug (USE_STMT (phi_use_p)))
5699 phis.safe_push (USE_STMT (phi_use_p));
5700 }
5701 }
5702 }
5703 }
5704
5705 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5706 {
5707 /* Replace the uses: */
5708 orig_name = PHI_RESULT (exit_phi);
5709 scalar_result = scalar_results[k];
5710 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5711 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5712 SET_USE (use_p, scalar_result);
5713 }
5714
5715 phis.release ();
5716 }
5717 }
5718
5719 /* Return a vector of type VECTYPE that is equal to the vector select
5720 operation "MASK ? VEC : IDENTITY". Insert the select statements
5721 before GSI. */
5722
5723 static tree
5724 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5725 tree vec, tree identity)
5726 {
5727 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5728 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5729 mask, vec, identity);
5730 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5731 return cond;
5732 }
5733
5734 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5735 order, starting with LHS. Insert the extraction statements before GSI and
5736 associate the new scalar SSA names with variable SCALAR_DEST.
5737 Return the SSA name for the result. */
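/* For illustration only: assuming CODE is PLUS_EXPR and VECTOR_RHS has four
   elements v[0..3], the expansion computes

     result = (((LHS + v[0]) + v[1]) + v[2]) + v[3]

   one scalar statement at a time, preserving strict left-to-right order.  */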
5738
5739 static tree
5740 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5741 tree_code code, tree lhs, tree vector_rhs)
5742 {
5743 tree vectype = TREE_TYPE (vector_rhs);
5744 tree scalar_type = TREE_TYPE (vectype);
5745 tree bitsize = TYPE_SIZE (scalar_type);
5746 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5747 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5748
5749 for (unsigned HOST_WIDE_INT bit_offset = 0;
5750 bit_offset < vec_size_in_bits;
5751 bit_offset += element_bitsize)
5752 {
5753 tree bitpos = bitsize_int (bit_offset);
5754 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5755 bitsize, bitpos);
5756
5757 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5758 rhs = make_ssa_name (scalar_dest, stmt);
5759 gimple_assign_set_lhs (stmt, rhs);
5760 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5761
5762 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5763 tree new_name = make_ssa_name (scalar_dest, stmt);
5764 gimple_assign_set_lhs (stmt, new_name);
5765 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5766 lhs = new_name;
5767 }
5768 return lhs;
5769 }
5770
5771 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5772 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5773 statement. CODE is the operation performed by STMT_INFO and OPS are
5774 its scalar operands. REDUC_INDEX is the index of the operand in
5775 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5776 implements in-order reduction, or IFN_LAST if we should open-code it.
5777 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5778 that should be used to control the operation in a fully-masked loop. */
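/* Illustrative sketch of the intent (not a description of the generated
   statements): a loop such as

     double s = init;
     for (int i = 0; i < n; ++i)
       s += a[i];

   compiled without -ffast-math must not reassociate the additions, so
   rather than accumulating independent partial sums per vector lane we
   either call an in-order reduction function such as IFN_FOLD_LEFT_PLUS
   (when REDUC_FN is available) or open-code the element-by-element
   folding via vect_expand_fold_left.  */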
5779
5780 static bool
5781 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5782 gimple_stmt_iterator *gsi,
5783 stmt_vec_info *vec_stmt, slp_tree slp_node,
5784 gimple *reduc_def_stmt,
5785 tree_code code, internal_fn reduc_fn,
5786 tree ops[3], tree vectype_in,
5787 int reduc_index, vec_loop_masks *masks)
5788 {
5789 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5790 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5791 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5792 stmt_vec_info new_stmt_info = NULL;
5793
5794 int ncopies;
5795 if (slp_node)
5796 ncopies = 1;
5797 else
5798 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5799
5800 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5801 gcc_assert (ncopies == 1);
5802 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5803 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5804 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5805 == FOLD_LEFT_REDUCTION);
5806
5807 if (slp_node)
5808 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5809 TYPE_VECTOR_SUBPARTS (vectype_in)));
5810
5811 tree op0 = ops[1 - reduc_index];
5812
5813 int group_size = 1;
5814 stmt_vec_info scalar_dest_def_info;
5815 auto_vec<tree> vec_oprnds0;
5816 if (slp_node)
5817 {
5818 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5819 slp_node);
5820 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5821 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5822 }
5823 else
5824 {
5825 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5826 vec_oprnds0.create (1);
5827 vec_oprnds0.quick_push (loop_vec_def0);
5828 scalar_dest_def_info = stmt_info;
5829 }
5830
5831 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5832 tree scalar_type = TREE_TYPE (scalar_dest);
5833 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5834
5835 int vec_num = vec_oprnds0.length ();
5836 gcc_assert (vec_num == 1 || slp_node);
5837 tree vec_elem_type = TREE_TYPE (vectype_out);
5838 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5839
5840 tree vector_identity = NULL_TREE;
5841 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5842 vector_identity = build_zero_cst (vectype_out);
5843
5844 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5845 int i;
5846 tree def0;
5847 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5848 {
5849 gimple *new_stmt;
5850 tree mask = NULL_TREE;
5851 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5852 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5853
5854 /* Handle MINUS by adding the negative. */
5855 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5856 {
5857 tree negated = make_ssa_name (vectype_out);
5858 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5859 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5860 def0 = negated;
5861 }
5862
5863 if (mask)
5864 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5865 vector_identity);
5866
5867 /* On the first iteration the input is simply the scalar phi
5868 result, and for subsequent iterations it is the output of
5869 the preceding operation. */
5870 if (reduc_fn != IFN_LAST)
5871 {
5872 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5873 /* For chained SLP reductions the output of the previous reduction
5874 operation serves as the input of the next. For the final statement
5875 the output cannot be a temporary - we reuse the original
5876 scalar destination of the last statement. */
5877 if (i != vec_num - 1)
5878 {
5879 gimple_set_lhs (new_stmt, scalar_dest_var);
5880 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5881 gimple_set_lhs (new_stmt, reduc_var);
5882 }
5883 }
5884 else
5885 {
5886 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5887 reduc_var, def0);
5888 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5889 /* Remove the statement, so that we can use the same code paths
5890 as for statements that we've just created. */
5891 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5892 gsi_remove (&tmp_gsi, false);
5893 }
5894
5895 if (i == vec_num - 1)
5896 {
5897 gimple_set_lhs (new_stmt, scalar_dest);
5898 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5899 new_stmt);
5900 }
5901 else
5902 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5903 new_stmt, gsi);
5904
5905 if (slp_node)
5906 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5907 }
5908
5909 if (!slp_node)
5910 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5911
5912 return true;
5913 }
5914
5915 /* Function is_nonwrapping_integer_induction.
5916
5917 Check if the induction defined by STMT_VINFO (which is part of loop
5918 LOOP) both increments and does not overflow.  */
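/* A worked example of the check below, for illustration only: for an
   8-bit unsigned IV with base 250, step 1 and at most 10 iterations,
   the largest value reached is 250 + 1 * 10 = 260, which needs 9 bits
   of precision and may therefore wrap, so we return false.  */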
5919
5920 static bool
5921 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5922 {
5923 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5924 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5925 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5926 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5927 widest_int ni, max_loop_value, lhs_max;
5928 wi::overflow_type overflow = wi::OVF_NONE;
5929
5930 /* Make sure the loop is integer based. */
5931 if (TREE_CODE (base) != INTEGER_CST
5932 || TREE_CODE (step) != INTEGER_CST)
5933 return false;
5934
5935 /* Check that the maximum value reached by the IV will not wrap.  */
5936
5937 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5938 return true;
5939
5940 if (! max_stmt_executions (loop, &ni))
5941 return false;
5942
5943 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5944 &overflow);
5945 if (overflow)
5946 return false;
5947
5948 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5949 TYPE_SIGN (lhs_type), &overflow);
5950 if (overflow)
5951 return false;
5952
5953 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5954 <= TYPE_PRECISION (lhs_type));
5955 }
5956
5957 /* Function vectorizable_reduction.
5958
5959 Check if STMT_INFO performs a reduction operation that can be vectorized.
5960 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5961 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5962 Return true if STMT_INFO is vectorizable in this way.
5963
5964 This function also handles reduction idioms (patterns) that have been
5965 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5966 may be of this form:
5967 X = pattern_expr (arg0, arg1, ..., X)
5968 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5969 sequence that had been detected and replaced by the pattern-stmt
5970 (STMT_INFO).
5971
5972 This function also handles reduction of condition expressions, for example:
5973 for (int i = 0; i < N; i++)
5974 if (a[i] < value)
5975 last = a[i];
5976 This is handled by vectorising the loop and creating an additional vector
5977 containing the loop indexes for which "a[i] < value" was true. In the
5978 function epilogue this is reduced to a single max value and then used to
5979 index into the vector of results.
5980
5981 In some cases of reduction patterns, the type of the reduction variable X is
5982 different than the type of the other arguments of STMT_INFO.
5983 In such cases, the vectype that is used when transforming STMT_INFO into
5984 a vector stmt is different than the vectype that is used to determine the
5985 vectorization factor, because it consists of a different number of elements
5986 than the actual number of elements that are being operated upon in parallel.
5987
5988 For example, consider an accumulation of shorts into an int accumulator.
5989 On some targets it's possible to vectorize this pattern operating on 8
5990 shorts at a time (hence, the vectype for purposes of determining the
5991 vectorization factor should be V8HI); on the other hand, the vectype that
5992 is used to create the vector form is actually V4SI (the type of the result).
5993
5994 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5995 indicates what is the actual level of parallelism (V8HI in the example), so
5996 that the right vectorization factor would be derived. This vectype
5997 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5998 be used to create the vectorized stmt. The right vectype for the vectorized
5999 stmt is obtained from the type of the result X:
6000 get_vectype_for_scalar_type (TREE_TYPE (X))
6001
6002 This means that, contrary to "regular" reductions (or "regular" stmts in
6003 general), the following equation:
6004 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6005 does *NOT* necessarily hold for reduction patterns. */
6006
6007 bool
6008 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6009 stmt_vec_info *vec_stmt, slp_tree slp_node,
6010 slp_instance slp_node_instance,
6011 stmt_vector_for_cost *cost_vec)
6012 {
6013 tree vec_dest;
6014 tree scalar_dest;
6015 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6016 tree vectype_in = NULL_TREE;
6017 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6018 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6019 enum tree_code code, orig_code;
6020 internal_fn reduc_fn;
6021 machine_mode vec_mode;
6022 int op_type;
6023 optab optab;
6024 tree new_temp = NULL_TREE;
6025 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6026 stmt_vec_info cond_stmt_vinfo = NULL;
6027 enum tree_code cond_reduc_op_code = ERROR_MARK;
6028 tree scalar_type;
6029 bool is_simple_use;
6030 int i;
6031 int ncopies;
6032 int epilog_copies;
6033 stmt_vec_info prev_stmt_info, prev_phi_info;
6034 bool single_defuse_cycle = false;
6035 stmt_vec_info new_stmt_info = NULL;
6036 int j;
6037 tree ops[3];
6038 enum vect_def_type dts[3];
6039 bool nested_cycle = false, found_nested_cycle_def = false;
6040 bool double_reduc = false;
6041 basic_block def_bb;
6042 struct loop * def_stmt_loop;
6043 tree def_arg;
6044 auto_vec<tree> vec_oprnds0;
6045 auto_vec<tree> vec_oprnds1;
6046 auto_vec<tree> vec_oprnds2;
6047 auto_vec<tree> vect_defs;
6048 auto_vec<stmt_vec_info> phis;
6049 int vec_num;
6050 tree def0, tem;
6051 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6052 tree cond_reduc_val = NULL_TREE;
6053
6054 /* Make sure it was already recognized as a reduction computation. */
6055 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6056 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6057 return false;
6058
6059 if (nested_in_vect_loop_p (loop, stmt_info))
6060 {
6061 loop = loop->inner;
6062 nested_cycle = true;
6063 }
6064
6065 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6066 gcc_assert (slp_node
6067 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6068
6069 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6070 {
6071 tree phi_result = gimple_phi_result (phi);
6072 /* Analysis is fully done on the reduction stmt invocation. */
6073 if (! vec_stmt)
6074 {
6075 if (slp_node)
6076 slp_node_instance->reduc_phis = slp_node;
6077
6078 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6079 return true;
6080 }
6081
6082 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6083 /* Leave the scalar phi in place. Note that checking
6084 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6085 for reductions involving a single statement. */
6086 return true;
6087
6088 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6089 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6090
6091 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6092 == EXTRACT_LAST_REDUCTION)
6093 /* Leave the scalar phi in place. */
6094 return true;
6095
6096 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6097 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6098 {
6099 tree op = gimple_op (reduc_stmt, k);
6100 if (op == phi_result)
6101 continue;
6102 if (k == 1
6103 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6104 continue;
6105 if (!vectype_in
6106 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6107 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6108 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6109 break;
6110 }
6111 gcc_assert (vectype_in);
6112
6113 if (slp_node)
6114 ncopies = 1;
6115 else
6116 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6117
6118 stmt_vec_info use_stmt_info;
6119 if (ncopies > 1
6120 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6121 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6122 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6123 single_defuse_cycle = true;
6124
6125 /* Create the destination vector */
6126 scalar_dest = gimple_assign_lhs (reduc_stmt);
6127 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6128
6129 if (slp_node)
6130 /* The size vect_schedule_slp_instance computes is off for us. */
6131 vec_num = vect_get_num_vectors
6132 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6133 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6134 vectype_in);
6135 else
6136 vec_num = 1;
6137
6138 /* Generate the reduction PHIs upfront. */
6139 prev_phi_info = NULL;
6140 for (j = 0; j < ncopies; j++)
6141 {
6142 if (j == 0 || !single_defuse_cycle)
6143 {
6144 for (i = 0; i < vec_num; i++)
6145 {
6146 /* Create the reduction-phi that defines the reduction
6147 operand. */
6148 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6149 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6150
6151 if (slp_node)
6152 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6153 else
6154 {
6155 if (j == 0)
6156 STMT_VINFO_VEC_STMT (stmt_info)
6157 = *vec_stmt = new_phi_info;
6158 else
6159 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6160 prev_phi_info = new_phi_info;
6161 }
6162 }
6163 }
6164 }
6165
6166 return true;
6167 }
6168
6169 /* 1. Is vectorizable reduction? */
6170 /* Not supportable if the reduction variable is used in the loop, unless
6171 it's a reduction chain. */
6172 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6173 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6174 return false;
6175
6176 /* Reductions that are not used even in an enclosing outer-loop
6177 are expected to be "live" (used out of the loop). */
6178 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6179 && !STMT_VINFO_LIVE_P (stmt_info))
6180 return false;
6181
6182 /* 2. Has this been recognized as a reduction pattern?
6183
6184 Check if STMT represents a pattern that has been recognized
6185 in earlier analysis stages. For stmts that represent a pattern,
6186 the STMT_VINFO_RELATED_STMT field records the last stmt in
6187 the original sequence that constitutes the pattern. */
6188
6189 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6190 if (orig_stmt_info)
6191 {
6192 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6193 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6194 }
6195
6196 /* 3. Check the operands of the operation. The first operands are defined
6197 inside the loop body. The last operand is the reduction variable,
6198 which is defined by the loop-header-phi. */
6199
6200 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6201
6202 /* Flatten RHS. */
6203 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6204 {
6205 case GIMPLE_BINARY_RHS:
6206 code = gimple_assign_rhs_code (stmt);
6207 op_type = TREE_CODE_LENGTH (code);
6208 gcc_assert (op_type == binary_op);
6209 ops[0] = gimple_assign_rhs1 (stmt);
6210 ops[1] = gimple_assign_rhs2 (stmt);
6211 break;
6212
6213 case GIMPLE_TERNARY_RHS:
6214 code = gimple_assign_rhs_code (stmt);
6215 op_type = TREE_CODE_LENGTH (code);
6216 gcc_assert (op_type == ternary_op);
6217 ops[0] = gimple_assign_rhs1 (stmt);
6218 ops[1] = gimple_assign_rhs2 (stmt);
6219 ops[2] = gimple_assign_rhs3 (stmt);
6220 break;
6221
6222 case GIMPLE_UNARY_RHS:
6223 return false;
6224
6225 default:
6226 gcc_unreachable ();
6227 }
6228
6229 if (code == COND_EXPR && slp_node)
6230 return false;
6231
6232 scalar_dest = gimple_assign_lhs (stmt);
6233 scalar_type = TREE_TYPE (scalar_dest);
6234 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6235 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6236 return false;
6237
6238 /* Do not try to vectorize bit-precision reductions. */
6239 if (!type_has_mode_precision_p (scalar_type))
6240 return false;
6241
6242 /* All uses but the last are expected to be defined in the loop.
6243 The last use is the reduction variable.  In case of a nested cycle this
6244 assumption is not true: we use reduc_index to record the index of the
6245 reduction variable. */
6246 stmt_vec_info reduc_def_info = NULL;
6247 int reduc_index = -1;
6248 for (i = 0; i < op_type; i++)
6249 {
6250 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6251 if (i == 0 && code == COND_EXPR)
6252 continue;
6253
6254 stmt_vec_info def_stmt_info;
6255 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6256 &def_stmt_info);
6257 dt = dts[i];
6258 gcc_assert (is_simple_use);
6259 if (dt == vect_reduction_def)
6260 {
6261 reduc_def_info = def_stmt_info;
6262 reduc_index = i;
6263 continue;
6264 }
6265 else if (tem)
6266 {
6267 /* To properly compute ncopies we are interested in the widest
6268 input type in case we're looking at a widening accumulation. */
6269 if (!vectype_in
6270 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6271 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6272 vectype_in = tem;
6273 }
6274
6275 if (dt != vect_internal_def
6276 && dt != vect_external_def
6277 && dt != vect_constant_def
6278 && dt != vect_induction_def
6279 && !(dt == vect_nested_cycle && nested_cycle))
6280 return false;
6281
6282 if (dt == vect_nested_cycle)
6283 {
6284 found_nested_cycle_def = true;
6285 reduc_def_info = def_stmt_info;
6286 reduc_index = i;
6287 }
6288
6289 if (i == 1 && code == COND_EXPR)
6290 {
6291 /* Record how the value of the COND_EXPR is defined. */
6292 if (dt == vect_constant_def)
6293 {
6294 cond_reduc_dt = dt;
6295 cond_reduc_val = ops[i];
6296 }
6297 if (dt == vect_induction_def
6298 && def_stmt_info
6299 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6300 {
6301 cond_reduc_dt = dt;
6302 cond_stmt_vinfo = def_stmt_info;
6303 }
6304 }
6305 }
6306
6307 if (!vectype_in)
6308 vectype_in = vectype_out;
6309
6310 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6311 directly used in the stmt.  */
6312 if (reduc_index == -1)
6313 {
6314 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6315 {
6316 if (dump_enabled_p ())
6317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6318 "in-order reduction chain without SLP.\n");
6319 return false;
6320 }
6321
6322 if (orig_stmt_info)
6323 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6324 else
6325 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6326 }
6327
6328 if (! reduc_def_info)
6329 return false;
6330
6331 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6332 if (!reduc_def_phi)
6333 return false;
6334
6335 if (!(reduc_index == -1
6336 || dts[reduc_index] == vect_reduction_def
6337 || dts[reduc_index] == vect_nested_cycle
6338 || ((dts[reduc_index] == vect_internal_def
6339 || dts[reduc_index] == vect_external_def
6340 || dts[reduc_index] == vect_constant_def
6341 || dts[reduc_index] == vect_induction_def)
6342 && nested_cycle && found_nested_cycle_def)))
6343 {
6344 /* For pattern-recognized stmts, orig_stmt might be a reduction,
6345 but some helper statements for the pattern might not be, or
6346 might be COND_EXPRs with reduction uses in the condition. */
6347 gcc_assert (orig_stmt_info);
6348 return false;
6349 }
6350
6351 /* PHIs should not participate in patterns. */
6352 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6353 enum vect_reduction_type v_reduc_type
6354 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6355 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6356
6357 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6358 /* If we have a condition reduction, see if we can simplify it further. */
6359 if (v_reduc_type == COND_REDUCTION)
6360 {
6361 /* TODO: We can't yet handle reduction chains, since we need to treat
6362 each COND_EXPR in the chain specially, not just the last one.
6363 E.g. for:
6364
6365 x_1 = PHI <x_3, ...>
6366 x_2 = a_2 ? ... : x_1;
6367 x_3 = a_3 ? ... : x_2;
6368
6369 we're interested in the last element in x_3 for which a_2 || a_3
6370 is true, whereas the current reduction chain handling would
6371 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6372 as a reduction operation. */
6373 if (reduc_index == -1)
6374 {
6375 if (dump_enabled_p ())
6376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6377 "conditional reduction chains not supported\n");
6378 return false;
6379 }
6380
6381 /* vect_is_simple_reduction ensured that operand 2 is the
6382 loop-carried operand. */
6383 gcc_assert (reduc_index == 2);
6384
6385 /* Loop peeling modifies the initial value of the reduction PHI, which
6386 makes the reduction stmt to be transformed different from the
6387 original stmt analyzed.  We need to record the reduction code for a
6388 CONST_COND_REDUCTION-type reduction at the analysis stage so that
6389 it can be used directly at the transform stage. */
6390 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6391 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6392 {
6393 /* Also set the reduction type to CONST_COND_REDUCTION. */
6394 gcc_assert (cond_reduc_dt == vect_constant_def);
6395 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6396 }
6397 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6398 vectype_in, OPTIMIZE_FOR_SPEED))
6399 {
6400 if (dump_enabled_p ())
6401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6402 "optimizing condition reduction with"
6403 " FOLD_EXTRACT_LAST.\n");
6404 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6405 }
6406 else if (cond_reduc_dt == vect_induction_def)
6407 {
6408 tree base
6409 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6410 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6411
6412 gcc_assert (TREE_CODE (base) == INTEGER_CST
6413 && TREE_CODE (step) == INTEGER_CST);
6414 cond_reduc_val = NULL_TREE;
6415 /* Find a suitable value: below base for MAX_EXPR, above base for
6416 MIN_EXPR; for now, punt if base is the minimum value of the type
6417 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6418 if (tree_int_cst_sgn (step) == -1)
6419 {
6420 cond_reduc_op_code = MIN_EXPR;
6421 if (tree_int_cst_sgn (base) == -1)
6422 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6423 else if (tree_int_cst_lt (base,
6424 TYPE_MAX_VALUE (TREE_TYPE (base))))
6425 cond_reduc_val
6426 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6427 }
6428 else
6429 {
6430 cond_reduc_op_code = MAX_EXPR;
6431 if (tree_int_cst_sgn (base) == 1)
6432 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6433 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6434 base))
6435 cond_reduc_val
6436 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6437 }
6438 if (cond_reduc_val)
6439 {
6440 if (dump_enabled_p ())
6441 dump_printf_loc (MSG_NOTE, vect_location,
6442 "condition expression based on "
6443 "integer induction.\n");
6444 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6445 = INTEGER_INDUC_COND_REDUCTION;
6446 }
6447 }
6448 else if (cond_reduc_dt == vect_constant_def)
6449 {
6450 enum vect_def_type cond_initial_dt;
6451 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6452 tree cond_initial_val
6453 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6454
6455 gcc_assert (cond_reduc_val != NULL_TREE);
6456 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6457 if (cond_initial_dt == vect_constant_def
6458 && types_compatible_p (TREE_TYPE (cond_initial_val),
6459 TREE_TYPE (cond_reduc_val)))
6460 {
6461 tree e = fold_binary (LE_EXPR, boolean_type_node,
6462 cond_initial_val, cond_reduc_val);
6463 if (e && (integer_onep (e) || integer_zerop (e)))
6464 {
6465 if (dump_enabled_p ())
6466 dump_printf_loc (MSG_NOTE, vect_location,
6467 "condition expression based on "
6468 "compile time constant.\n");
6469 /* Record reduction code at analysis stage. */
6470 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6471 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6472 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6473 = CONST_COND_REDUCTION;
6474 }
6475 }
6476 }
6477 }
6478
6479 if (orig_stmt_info)
6480 gcc_assert (tmp == orig_stmt_info
6481 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6482 else
6483 /* We changed STMT to be the first stmt in the reduction chain, hence
6484 we check that in this case the first element in the chain is STMT. */
6485 gcc_assert (tmp == stmt_info
6486 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6487
6488 if (STMT_VINFO_LIVE_P (reduc_def_info))
6489 return false;
6490
6491 if (slp_node)
6492 ncopies = 1;
6493 else
6494 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6495
6496 gcc_assert (ncopies >= 1);
6497
6498 vec_mode = TYPE_MODE (vectype_in);
6499 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6500
6501 if (code == COND_EXPR)
6502 {
6503 /* Only call during the analysis stage, otherwise we'll lose
6504 STMT_VINFO_TYPE. */
6505 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6506 ops[reduc_index], 0, NULL,
6507 cost_vec))
6508 {
6509 if (dump_enabled_p ())
6510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6511 "unsupported condition in reduction\n");
6512 return false;
6513 }
6514 }
6515 else
6516 {
6517 /* 4. Supportable by target? */
6518
6519 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6520 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6521 {
6522 /* Shifts and rotates are only supported by vectorizable_shift,
6523 not vectorizable_reduction. */
6524 if (dump_enabled_p ())
6525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6526 "unsupported shift or rotation.\n");
6527 return false;
6528 }
6529
6530 /* 4.1. check support for the operation in the loop */
6531 optab = optab_for_tree_code (code, vectype_in, optab_default);
6532 if (!optab)
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "no optab.\n");
6537
6538 return false;
6539 }
6540
6541 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6542 {
6543 if (dump_enabled_p ())
6544 dump_printf (MSG_NOTE, "op not supported by target.\n");
6545
6546 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6547 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6548 return false;
6549
6550 if (dump_enabled_p ())
6551 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6552 }
6553
6554 /* Worthwhile without SIMD support? */
6555 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6556 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6557 {
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6560 "not worthwhile without SIMD support.\n");
6561
6562 return false;
6563 }
6564 }
6565
6566 /* 4.2. Check support for the epilog operation.
6567
6568 If STMT represents a reduction pattern, then the type of the
6569 reduction variable may be different than the type of the rest
6570 of the arguments. For example, consider the case of accumulation
6571 of shorts into an int accumulator.  The original code:
6572 S1: int_a = (int) short_a;
6573 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6574
6575 was replaced with:
6576 STMT: int_acc = widen_sum <short_a, int_acc>
6577
6578 This means that:
6579 1. The tree-code that is used to create the vector operation in the
6580 epilog code (that reduces the partial results) is not the
6581 tree-code of STMT, but is rather the tree-code of the original
6582 stmt from the pattern that STMT is replacing.  I.e., in the example
6583 above we want to use 'widen_sum' in the loop, but 'plus' in the
6584 epilog.
6585 2. The type (mode) we use to check available target support
6586 for the vector operation to be created in the *epilog*, is
6587 determined by the type of the reduction variable (in the example
6588 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6589 However the type (mode) we use to check available target support
6590 for the vector operation to be created *inside the loop*, is
6591 determined by the type of the other arguments to STMT (in the
6592 example we'd check this: optab_handler (widen_sum_optab,
6593 vect_short_mode)).
6594
6595 This is contrary to "regular" reductions, in which the types of all
6596 the arguments are the same as the type of the reduction variable.
6597 For "regular" reductions we can therefore use the same vector type
6598 (and also the same tree-code) when generating the epilog code and
6599 when generating the code inside the loop. */
6600
6601 vect_reduction_type reduction_type
6602 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6603 if (orig_stmt_info
6604 && (reduction_type == TREE_CODE_REDUCTION
6605 || reduction_type == FOLD_LEFT_REDUCTION))
6606 {
6607 /* This is a reduction pattern: get the vectype from the type of the
6608 reduction variable, and get the tree-code from orig_stmt. */
6609 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6610 gcc_assert (vectype_out);
6611 vec_mode = TYPE_MODE (vectype_out);
6612 }
6613 else
6614 {
6615 /* Regular reduction: the same vectype and tree-code that are used for
6616 the vector code inside the loop can also be used for the epilog code. */
6617 orig_code = code;
6618
6619 if (code == MINUS_EXPR)
6620 orig_code = PLUS_EXPR;
6621
6622 /* For simple condition reductions, replace with the actual expression
6623 we want to base our reduction around. */
6624 if (reduction_type == CONST_COND_REDUCTION)
6625 {
6626 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6627 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6628 }
6629 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6630 orig_code = cond_reduc_op_code;
6631 }
6632
6633 if (nested_cycle)
6634 {
6635 def_bb = gimple_bb (reduc_def_phi);
6636 def_stmt_loop = def_bb->loop_father;
6637 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6638 loop_preheader_edge (def_stmt_loop));
6639 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6640 if (def_arg_stmt_info
6641 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6642 == vect_double_reduction_def))
6643 double_reduc = true;
6644 }
6645
6646 reduc_fn = IFN_LAST;
6647
6648 if (reduction_type == TREE_CODE_REDUCTION
6649 || reduction_type == FOLD_LEFT_REDUCTION
6650 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6651 || reduction_type == CONST_COND_REDUCTION)
6652 {
6653 if (reduction_type == FOLD_LEFT_REDUCTION
6654 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6655 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6656 {
6657 if (reduc_fn != IFN_LAST
6658 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6659 OPTIMIZE_FOR_SPEED))
6660 {
6661 if (dump_enabled_p ())
6662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6663 "reduc op not supported by target.\n");
6664
6665 reduc_fn = IFN_LAST;
6666 }
6667 }
6668 else
6669 {
6670 if (!nested_cycle || double_reduc)
6671 {
6672 if (dump_enabled_p ())
6673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6674 "no reduc code for scalar code.\n");
6675
6676 return false;
6677 }
6678 }
6679 }
6680 else if (reduction_type == COND_REDUCTION)
6681 {
6682 int scalar_precision
6683 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6684 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6685 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6686 nunits_out);
6687
6688 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6689 OPTIMIZE_FOR_SPEED))
6690 reduc_fn = IFN_REDUC_MAX;
6691 }
6692
6693 if (reduction_type != EXTRACT_LAST_REDUCTION
6694 && (!nested_cycle || double_reduc)
6695 && reduc_fn == IFN_LAST
6696 && !nunits_out.is_constant ())
6697 {
6698 if (dump_enabled_p ())
6699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700 "missing target support for reduction on"
6701 " variable-length vectors.\n");
6702 return false;
6703 }
6704
6705 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6706 && ncopies > 1)
6707 {
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "multiple types in double reduction or condition "
6711 "reduction.\n");
6712 return false;
6713 }
6714
6715 /* For SLP reductions, see if there is a neutral value we can use. */
6716 tree neutral_op = NULL_TREE;
6717 if (slp_node)
6718 neutral_op = neutral_op_for_slp_reduction
6719 (slp_node_instance->reduc_phis, code,
6720 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
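 /* Illustrative note (added commentary, not part of the original sources):
    for a chained SLP summation such as

      s += a[2*i] + a[2*i + 1];

    the neutral value would be 0, and for a product it would be 1 - a value
    that can pad the initial vector without changing the result.  For
    operations with no such constant, neutral_op may remain NULL_TREE.  */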
6721
6722 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6723 {
6724 /* We can't support in-order reductions of code such as this:
6725
6726 for (int i = 0; i < n1; ++i)
6727 for (int j = 0; j < n2; ++j)
6728 l += a[j];
6729
6730 since GCC effectively transforms the loop when vectorizing:
6731
6732 for (int i = 0; i < n1 / VF; ++i)
6733 for (int j = 0; j < n2; ++j)
6734 for (int k = 0; k < VF; ++k)
6735 l += a[j];
6736
6737 which is a reassociation of the original operation. */
6738 if (dump_enabled_p ())
6739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 "in-order double reduction not supported.\n");
6741
6742 return false;
6743 }
6744
6745 if (reduction_type == FOLD_LEFT_REDUCTION
6746 && slp_node
6747 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6748 {
6749 /* We cannot use in-order reductions in this case because there is
6750 an implicit reassociation of the operations involved. */
6751 if (dump_enabled_p ())
6752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753 "in-order unchained SLP reductions not supported.\n");
6754 return false;
6755 }
6756
6757 /* For double reductions, and for SLP reductions with a neutral value,
6758 we construct a variable-length initial vector by loading a vector
6759 full of the neutral value and then shift-and-inserting the start
6760 values into the low-numbered elements. */
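 /* Illustrative sketch (added commentary): with start value S and neutral
    value 0, the initial vector for a variable-length sum would be built
    roughly as

      init = { 0, 0, ..., 0 };          // splat of the neutral value
      init = VEC_SHL_INSERT (init, S);  // -> { S, 0, ..., 0 }

    which is why IFN_VEC_SHL_INSERT support is required below.  */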
6761 if ((double_reduc || neutral_op)
6762 && !nunits_out.is_constant ()
6763 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6764 vectype_out, OPTIMIZE_FOR_SPEED))
6765 {
6766 if (dump_enabled_p ())
6767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6768 "reduction on variable-length vectors requires"
6769 " target support for a vector-shift-and-insert"
6770 " operation.\n");
6771 return false;
6772 }
6773
6774 /* Check extra constraints for variable-length unchained SLP reductions. */
6775 if (STMT_SLP_TYPE (stmt_info)
6776 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6777 && !nunits_out.is_constant ())
6778 {
6779 /* We checked above that we could build the initial vector when
6780 there's a neutral element value. Check here for the case in
6781 which each SLP statement has its own initial value and in which
6782 that value needs to be repeated for every instance of the
6783 statement within the initial vector. */
6784 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6785 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6786 if (!neutral_op
6787 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6788 {
6789 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 "unsupported form of SLP reduction for"
6792 " variable-length vectors: cannot build"
6793 " initial vector.\n");
6794 return false;
6795 }
6796 /* The epilogue code relies on the number of elements being a multiple
6797 of the group size. The duplicate-and-interleave approach to setting
6798 up the initial vector does too. */
6799 if (!multiple_p (nunits_out, group_size))
6800 {
6801 if (dump_enabled_p ())
6802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6803 "unsupported form of SLP reduction for"
6804 " variable-length vectors: the vector size"
6805 " is not a multiple of the number of results.\n");
6806 return false;
6807 }
6808 }
6809
6810 /* In case of widening multiplication by a constant, we update the type
6811 of the constant to be the type of the other operand. We check that the
6812 constant fits the type in the pattern recognition pass. */
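 /* Hypothetical example (added commentary): in a dot-product pattern along
    the lines of

      sum += (int) b[i] * 3;

    ops[] holds the two narrow multiplication operands, so the INTEGER_CST 3
    would be converted here to the type of the other (narrow) operand.  */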
6813 if (code == DOT_PROD_EXPR
6814 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6815 {
6816 if (TREE_CODE (ops[0]) == INTEGER_CST)
6817 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6818 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6819 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6820 else
6821 {
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6824 "invalid types in dot-prod\n");
6825
6826 return false;
6827 }
6828 }
6829
6830 if (reduction_type == COND_REDUCTION)
6831 {
6832 widest_int ni;
6833
6834 if (! max_loop_iterations (loop, &ni))
6835 {
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_NOTE, vect_location,
6838 "loop count not known, cannot create cond "
6839 "reduction.\n");
6840 return false;
6841 }
6842 /* Convert backedges to iterations. */
6843 ni += 1;
6844
6845 /* The additional index will be the same type as the condition. Check
6846 that the iteration count fits into this type less one (because we'll
6847 use up the zero slot for when there are no matches). */
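 /* Worked example (added commentary): if cr_index_scalar_type ends up as a
    16-bit unsigned type, max_index is 65535 and the check below rejects
    loops with 65535 or more iterations, keeping index 0 free for the
    "no match" case.  */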
6848 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6849 if (wi::geu_p (ni, wi::to_widest (max_index)))
6850 {
6851 if (dump_enabled_p ())
6852 dump_printf_loc (MSG_NOTE, vect_location,
6853 "loop size is greater than data size.\n");
6854 return false;
6855 }
6856 }
6857
6858 /* In case the vectorization factor (VF) is bigger than the number
6859 of elements that we can fit in a vectype (nunits), we have to generate
6860 more than one vector stmt - i.e., we need to "unroll" the
6861 vector stmt by a factor of VF/nunits. For more details see documentation
6862 in vectorizable_operation. */
6863
6864 /* If the reduction is used in an outer loop we need to generate
6865 VF intermediate results, like so (e.g. for ncopies=2):
6866 r0 = phi (init, r0)
6867 r1 = phi (init, r1)
6868 r0 = x0 + r0;
6869 r1 = x1 + r1;
6870 (i.e. we generate VF results in 2 registers).
6871 In this case we have a separate def-use cycle for each copy, and therefore
6872 for each copy we get the vector def for the reduction variable from the
6873 respective phi node created for this copy.
6874
6875 Otherwise (the reduction is unused in the loop nest), we can combine
6876 together intermediate results, like so (e.g. for ncopies=2):
6877 r = phi (init, r)
6878 r = x0 + r;
6879 r = x1 + r;
6880 (i.e. we generate VF/2 results in a single register).
6881 In this case for each copy we get the vector def for the reduction variable
6882 from the vectorized reduction operation generated in the previous iteration.
6883
6884 This only works when we see both the reduction PHI and its only consumer
6885 in vectorizable_reduction and there are no intermediate stmts
6886 participating. */
6887 stmt_vec_info use_stmt_info;
6888 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6889 if (ncopies > 1
6890 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6891 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6892 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6893 {
6894 single_defuse_cycle = true;
6895 epilog_copies = 1;
6896 }
6897 else
6898 epilog_copies = ncopies;
6899
6900 /* If the reduction stmt is one of the patterns that have a lane-reducing
6901 operation embedded, we cannot handle the case of !single_defuse_cycle. */
6902 if ((ncopies > 1
6903 && ! single_defuse_cycle)
6904 && (code == DOT_PROD_EXPR
6905 || code == WIDEN_SUM_EXPR
6906 || code == SAD_EXPR))
6907 {
6908 if (dump_enabled_p ())
6909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6910 "multi def-use cycle not possible for lane-reducing "
6911 "reduction operation\n");
6912 return false;
6913 }
6914
6915 if (slp_node)
6916 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6917 else
6918 vec_num = 1;
6919
6920 internal_fn cond_fn = get_conditional_internal_fn (code);
6921 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6922
6923 if (!vec_stmt) /* transformation not required. */
6924 {
6925 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6926 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6927 {
6928 if (reduction_type != FOLD_LEFT_REDUCTION
6929 && (cond_fn == IFN_LAST
6930 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6931 OPTIMIZE_FOR_SPEED)))
6932 {
6933 if (dump_enabled_p ())
6934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6935 "can't use a fully-masked loop because no"
6936 " conditional operation is available.\n");
6937 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6938 }
6939 else if (reduc_index == -1)
6940 {
6941 if (dump_enabled_p ())
6942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943 "can't use a fully-masked loop for chained"
6944 " reductions.\n");
6945 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6946 }
6947 else
6948 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6949 vectype_in);
6950 }
6951 if (dump_enabled_p ()
6952 && reduction_type == FOLD_LEFT_REDUCTION)
6953 dump_printf_loc (MSG_NOTE, vect_location,
6954 "using an in-order (fold-left) reduction.\n");
6955 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6956 return true;
6957 }
6958
6959 /* Transform. */
6960
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6963
6964 /* FORNOW: Multiple types are not supported for condition. */
6965 if (code == COND_EXPR)
6966 gcc_assert (ncopies == 1);
6967
6968 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6969
6970 if (reduction_type == FOLD_LEFT_REDUCTION)
6971 return vectorize_fold_left_reduction
6972 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6973 reduc_fn, ops, vectype_in, reduc_index, masks);
6974
6975 if (reduction_type == EXTRACT_LAST_REDUCTION)
6976 {
6977 gcc_assert (!slp_node);
6978 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6979 NULL, reduc_index, NULL, NULL);
6980 }
6981
6982 /* Create the destination vector */
6983 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6984
6985 prev_stmt_info = NULL;
6986 prev_phi_info = NULL;
6987 if (!slp_node)
6988 {
6989 vec_oprnds0.create (1);
6990 vec_oprnds1.create (1);
6991 if (op_type == ternary_op)
6992 vec_oprnds2.create (1);
6993 }
6994
6995 phis.create (vec_num);
6996 vect_defs.create (vec_num);
6997 if (!slp_node)
6998 vect_defs.quick_push (NULL_TREE);
6999
7000 if (slp_node)
7001 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7002 else
7003 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7004
7005 for (j = 0; j < ncopies; j++)
7006 {
7007 if (code == COND_EXPR)
7008 {
7009 gcc_assert (!slp_node);
7010 vectorizable_condition (stmt_info, gsi, vec_stmt,
7011 PHI_RESULT (phis[0]->stmt),
7012 reduc_index, NULL, NULL);
7013 /* Multiple types are not supported for condition. */
7014 break;
7015 }
7016
7017 /* Handle uses. */
7018 if (j == 0)
7019 {
7020 if (slp_node)
7021 {
7022 /* Get vec defs for all the operands except the reduction index,
7023 ensuring the ordering of the ops in the vector is kept. */
7024 auto_vec<tree, 3> slp_ops;
7025 auto_vec<vec<tree>, 3> vec_defs;
7026
7027 slp_ops.quick_push (ops[0]);
7028 slp_ops.quick_push (ops[1]);
7029 if (op_type == ternary_op)
7030 slp_ops.quick_push (ops[2]);
7031
7032 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7033
7034 vec_oprnds0.safe_splice (vec_defs[0]);
7035 vec_defs[0].release ();
7036 vec_oprnds1.safe_splice (vec_defs[1]);
7037 vec_defs[1].release ();
7038 if (op_type == ternary_op)
7039 {
7040 vec_oprnds2.safe_splice (vec_defs[2]);
7041 vec_defs[2].release ();
7042 }
7043 }
7044 else
7045 {
7046 vec_oprnds0.quick_push
7047 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7048 vec_oprnds1.quick_push
7049 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7050 if (op_type == ternary_op)
7051 vec_oprnds2.quick_push
7052 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7053 }
7054 }
7055 else
7056 {
7057 if (!slp_node)
7058 {
7059 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7060
7061 if (single_defuse_cycle && reduc_index == 0)
7062 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7063 else
7064 vec_oprnds0[0]
7065 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7066 vec_oprnds0[0]);
7067 if (single_defuse_cycle && reduc_index == 1)
7068 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7069 else
7070 vec_oprnds1[0]
7071 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7072 vec_oprnds1[0]);
7073 if (op_type == ternary_op)
7074 {
7075 if (single_defuse_cycle && reduc_index == 2)
7076 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7077 else
7078 vec_oprnds2[0]
7079 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7080 vec_oprnds2[0]);
7081 }
7082 }
7083 }
7084
7085 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7086 {
7087 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7088 if (masked_loop_p)
7089 {
7090 /* Make sure that the reduction accumulator is vop[0]. */
7091 if (reduc_index == 1)
7092 {
7093 gcc_assert (commutative_tree_code (code));
7094 std::swap (vop[0], vop[1]);
7095 }
7096 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7097 vectype_in, i * ncopies + j);
7098 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7099 vop[0], vop[1],
7100 vop[0]);
7101 new_temp = make_ssa_name (vec_dest, call);
7102 gimple_call_set_lhs (call, new_temp);
7103 gimple_call_set_nothrow (call, true);
7104 new_stmt_info
7105 = vect_finish_stmt_generation (stmt_info, call, gsi);
7106 }
7107 else
7108 {
7109 if (op_type == ternary_op)
7110 vop[2] = vec_oprnds2[i];
7111
7112 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7113 vop[0], vop[1], vop[2]);
7114 new_temp = make_ssa_name (vec_dest, new_stmt);
7115 gimple_assign_set_lhs (new_stmt, new_temp);
7116 new_stmt_info
7117 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7118 }
7119
7120 if (slp_node)
7121 {
7122 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7123 vect_defs.quick_push (new_temp);
7124 }
7125 else
7126 vect_defs[0] = new_temp;
7127 }
7128
7129 if (slp_node)
7130 continue;
7131
7132 if (j == 0)
7133 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7134 else
7135 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7136
7137 prev_stmt_info = new_stmt_info;
7138 }
7139
7140 /* Finalize the reduction-phi (set its arguments) and create the
7141 epilog reduction code. */
7142 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7143 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7144
7145 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7146 epilog_copies, reduc_fn, phis,
7147 double_reduc, slp_node, slp_node_instance,
7148 cond_reduc_val, cond_reduc_op_code,
7149 neutral_op);
7150
7151 return true;
7152 }
7153
7154 /* Function vect_min_worthwhile_factor.
7155
7156 For a loop where we could vectorize the operation indicated by CODE,
7157 return the minimum vectorization factor that makes it worthwhile
7158 to use generic vectors. */
7159 static unsigned int
7160 vect_min_worthwhile_factor (enum tree_code code)
7161 {
7162 switch (code)
7163 {
7164 case PLUS_EXPR:
7165 case MINUS_EXPR:
7166 case NEGATE_EXPR:
7167 return 4;
7168
7169 case BIT_AND_EXPR:
7170 case BIT_IOR_EXPR:
7171 case BIT_XOR_EXPR:
7172 case BIT_NOT_EXPR:
7173 return 2;
7174
7175 default:
7176 return INT_MAX;
7177 }
7178 }
7179
7180 /* Return true if VINFO indicates we are doing loop vectorization and if
7181 it is worth decomposing CODE operations into scalar operations for
7182 that loop's vectorization factor. */
7183
7184 bool
7185 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7186 {
7187 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7188 unsigned HOST_WIDE_INT value;
7189 return (loop_vinfo
7190 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7191 && value >= vect_min_worthwhile_factor (code));
7192 }
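 /* Usage note (added commentary): e.g. for code == PLUS_EXPR,
    vect_min_worthwhile_factor returns 4, so a loop with a known
    vectorization factor of 4 or more is considered worthwhile even
    without SIMD support, while a factor of 2 is not.  */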
7193
7194 /* Function vectorizable_induction
7195
7196 Check if STMT_INFO performs an induction computation that can be vectorized.
7197 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7198 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7199 Return true if STMT_INFO is vectorizable in this way. */
7200
7201 bool
7202 vectorizable_induction (stmt_vec_info stmt_info,
7203 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7204 stmt_vec_info *vec_stmt, slp_tree slp_node,
7205 stmt_vector_for_cost *cost_vec)
7206 {
7207 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7208 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7209 unsigned ncopies;
7210 bool nested_in_vect_loop = false;
7211 struct loop *iv_loop;
7212 tree vec_def;
7213 edge pe = loop_preheader_edge (loop);
7214 basic_block new_bb;
7215 tree new_vec, vec_init, vec_step, t;
7216 tree new_name;
7217 gimple *new_stmt;
7218 gphi *induction_phi;
7219 tree induc_def, vec_dest;
7220 tree init_expr, step_expr;
7221 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7222 unsigned i;
7223 tree expr;
7224 gimple_seq stmts;
7225 imm_use_iterator imm_iter;
7226 use_operand_p use_p;
7227 gimple *exit_phi;
7228 edge latch_e;
7229 tree loop_arg;
7230 gimple_stmt_iterator si;
7231
7232 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7233 if (!phi)
7234 return false;
7235
7236 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7237 return false;
7238
7239 /* Make sure it was recognized as induction computation. */
7240 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7241 return false;
7242
7243 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7244 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7245
7246 if (slp_node)
7247 ncopies = 1;
7248 else
7249 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7250 gcc_assert (ncopies >= 1);
7251
7252 /* FORNOW. These restrictions should be relaxed. */
7253 if (nested_in_vect_loop_p (loop, stmt_info))
7254 {
7255 imm_use_iterator imm_iter;
7256 use_operand_p use_p;
7257 gimple *exit_phi;
7258 edge latch_e;
7259 tree loop_arg;
7260
7261 if (ncopies > 1)
7262 {
7263 if (dump_enabled_p ())
7264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7265 "multiple types in nested loop.\n");
7266 return false;
7267 }
7268
7269 /* FORNOW: outer loop induction with SLP not supported. */
7270 if (STMT_SLP_TYPE (stmt_info))
7271 return false;
7272
7273 exit_phi = NULL;
7274 latch_e = loop_latch_edge (loop->inner);
7275 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7276 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7277 {
7278 gimple *use_stmt = USE_STMT (use_p);
7279 if (is_gimple_debug (use_stmt))
7280 continue;
7281
7282 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7283 {
7284 exit_phi = use_stmt;
7285 break;
7286 }
7287 }
7288 if (exit_phi)
7289 {
7290 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7291 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7292 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7293 {
7294 if (dump_enabled_p ())
7295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7296 "inner-loop induction only used outside "
7297 "of the outer vectorized loop.\n");
7298 return false;
7299 }
7300 }
7301
7302 nested_in_vect_loop = true;
7303 iv_loop = loop->inner;
7304 }
7305 else
7306 iv_loop = loop;
7307 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7308
7309 if (slp_node && !nunits.is_constant ())
7310 {
7311 /* The current SLP code creates the initial value element-by-element. */
7312 if (dump_enabled_p ())
7313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7314 "SLP induction not supported for variable-length"
7315 " vectors.\n");
7316 return false;
7317 }
7318
7319 if (!vec_stmt) /* transformation not required. */
7320 {
7321 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7322 DUMP_VECT_SCOPE ("vectorizable_induction");
7323 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7324 return true;
7325 }
7326
7327 /* Transform. */
7328
7329 /* Compute a vector variable, initialized with the first VF values of
7330 the induction variable. E.g., for an iv with IV_PHI='X' and
7331 evolution S, for a vector of 4 units, we want to compute:
7332 [X, X + S, X + 2*S, X + 3*S]. */
7333
7334 if (dump_enabled_p ())
7335 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7336
7337 latch_e = loop_latch_edge (iv_loop);
7338 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7339
7340 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7341 gcc_assert (step_expr != NULL_TREE);
7342
7343 pe = loop_preheader_edge (iv_loop);
7344 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7345 loop_preheader_edge (iv_loop));
7346
7347 stmts = NULL;
7348 if (!nested_in_vect_loop)
7349 {
7350 /* Convert the initial value to the desired type. */
7351 tree new_type = TREE_TYPE (vectype);
7352 init_expr = gimple_convert (&stmts, new_type, init_expr);
7353
7354 /* If we are using the loop mask to "peel" for alignment then we need
7355 to adjust the start value here. */
7356 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7357 if (skip_niters != NULL_TREE)
7358 {
7359 if (FLOAT_TYPE_P (vectype))
7360 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7361 skip_niters);
7362 else
7363 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7364 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7365 skip_niters, step_expr);
7366 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7367 init_expr, skip_step);
7368 }
7369 }
7370
7371 /* Convert the step to the desired type. */
7372 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7373
7374 if (stmts)
7375 {
7376 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7377 gcc_assert (!new_bb);
7378 }
7379
7380 /* Find the first insertion point in the BB. */
7381 basic_block bb = gimple_bb (phi);
7382 si = gsi_after_labels (bb);
7383
7384 /* For SLP induction we have to generate several IVs as for example
7385 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7386 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7387 [VF*S, VF*S, VF*S, VF*S] for all. */
7388 if (slp_node)
7389 {
7390 /* Enforced above. */
7391 unsigned int const_nunits = nunits.to_constant ();
7392
7393 /* Generate [VF*S, VF*S, ... ]. */
7394 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7395 {
7396 expr = build_int_cst (integer_type_node, vf);
7397 expr = fold_convert (TREE_TYPE (step_expr), expr);
7398 }
7399 else
7400 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7401 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7402 expr, step_expr);
7403 if (! CONSTANT_CLASS_P (new_name))
7404 new_name = vect_init_vector (stmt_info, new_name,
7405 TREE_TYPE (step_expr), NULL);
7406 new_vec = build_vector_from_val (vectype, new_name);
7407 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7408
7409 /* Now generate the IVs. */
7410 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7411 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7412 unsigned elts = const_nunits * nvects;
7413 unsigned nivs = least_common_multiple (group_size,
7414 const_nunits) / const_nunits;
7415 gcc_assert (elts % group_size == 0);
7416 tree elt = init_expr;
7417 unsigned ivn;
7418 for (ivn = 0; ivn < nivs; ++ivn)
7419 {
7420 tree_vector_builder elts (vectype, const_nunits, 1);
7421 stmts = NULL;
7422 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7423 {
7424 if (ivn*const_nunits + eltn >= group_size
7425 && (ivn * const_nunits + eltn) % group_size == 0)
7426 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7427 elt, step_expr);
7428 elts.quick_push (elt);
7429 }
7430 vec_init = gimple_build_vector (&stmts, &elts);
7431 if (stmts)
7432 {
7433 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7434 gcc_assert (!new_bb);
7435 }
7436
7437 /* Create the induction-phi that defines the induction-operand. */
7438 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7439 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7440 stmt_vec_info induction_phi_info
7441 = loop_vinfo->add_stmt (induction_phi);
7442 induc_def = PHI_RESULT (induction_phi);
7443
7444 /* Create the iv update inside the loop */
7445 vec_def = make_ssa_name (vec_dest);
7446 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7447 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7448 loop_vinfo->add_stmt (new_stmt);
7449
7450 /* Set the arguments of the phi node: */
7451 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7452 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7453 UNKNOWN_LOCATION);
7454
7455 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7456 }
7457
7458 /* Re-use IVs when we can. */
7459 if (ivn < nvects)
7460 {
7461 unsigned vfp
7462 = least_common_multiple (group_size, const_nunits) / group_size;
7463 /* Generate [VF'*S, VF'*S, ... ]. */
7464 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7465 {
7466 expr = build_int_cst (integer_type_node, vfp);
7467 expr = fold_convert (TREE_TYPE (step_expr), expr);
7468 }
7469 else
7470 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7471 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7472 expr, step_expr);
7473 if (! CONSTANT_CLASS_P (new_name))
7474 new_name = vect_init_vector (stmt_info, new_name,
7475 TREE_TYPE (step_expr), NULL);
7476 new_vec = build_vector_from_val (vectype, new_name);
7477 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7478 for (; ivn < nvects; ++ivn)
7479 {
7480 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7481 tree def;
7482 if (gimple_code (iv) == GIMPLE_PHI)
7483 def = gimple_phi_result (iv);
7484 else
7485 def = gimple_assign_lhs (iv);
7486 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7487 PLUS_EXPR,
7488 def, vec_step);
7489 if (gimple_code (iv) == GIMPLE_PHI)
7490 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7491 else
7492 {
7493 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7494 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7495 }
7496 SLP_TREE_VEC_STMTS (slp_node).quick_push
7497 (loop_vinfo->add_stmt (new_stmt));
7498 }
7499 }
7500
7501 return true;
7502 }
7503
7504 /* Create the vector that holds the initial_value of the induction. */
7505 if (nested_in_vect_loop)
7506 {
7507 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7508 been created during vectorization of previous stmts. We obtain it
7509 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7510 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7511 /* If the initial value is not of proper type, convert it. */
7512 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7513 {
7514 new_stmt
7515 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7516 vect_simple_var,
7517 "vec_iv_"),
7518 VIEW_CONVERT_EXPR,
7519 build1 (VIEW_CONVERT_EXPR, vectype,
7520 vec_init));
7521 vec_init = gimple_assign_lhs (new_stmt);
7522 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7523 new_stmt);
7524 gcc_assert (!new_bb);
7525 loop_vinfo->add_stmt (new_stmt);
7526 }
7527 }
7528 else
7529 {
7530 /* iv_loop is the loop to be vectorized. Create:
7531 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7532 stmts = NULL;
7533 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7534
7535 unsigned HOST_WIDE_INT const_nunits;
7536 if (nunits.is_constant (&const_nunits))
7537 {
7538 tree_vector_builder elts (vectype, const_nunits, 1);
7539 elts.quick_push (new_name);
7540 for (i = 1; i < const_nunits; i++)
7541 {
7542 /* Create: new_name_i = new_name + step_expr */
7543 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7544 new_name, step_expr);
7545 elts.quick_push (new_name);
7546 }
7547 /* Create a vector from [new_name_0, new_name_1, ...,
7548 new_name_nunits-1] */
7549 vec_init = gimple_build_vector (&stmts, &elts);
7550 }
7551 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7552 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7553 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7554 new_name, step_expr);
7555 else
7556 {
7557 /* Build:
7558 [base, base, base, ...]
7559 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7560 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7561 gcc_assert (flag_associative_math);
7562 tree index = build_index_vector (vectype, 0, 1);
7563 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7564 new_name);
7565 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7566 step_expr);
7567 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7568 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7569 vec_init, step_vec);
7570 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7571 vec_init, base_vec);
7572 }
7573
7574 if (stmts)
7575 {
7576 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7577 gcc_assert (!new_bb);
7578 }
7579 }
7580
7581
7582 /* Create the vector that holds the step of the induction. */
7583 if (nested_in_vect_loop)
7584 /* iv_loop is nested in the loop to be vectorized. Generate:
7585 vec_step = [S, S, S, S] */
7586 new_name = step_expr;
7587 else
7588 {
7589 /* iv_loop is the loop to be vectorized. Generate:
7590 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7591 gimple_seq seq = NULL;
7592 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7593 {
7594 expr = build_int_cst (integer_type_node, vf);
7595 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7596 }
7597 else
7598 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7599 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7600 expr, step_expr);
7601 if (seq)
7602 {
7603 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7604 gcc_assert (!new_bb);
7605 }
7606 }
7607
7608 t = unshare_expr (new_name);
7609 gcc_assert (CONSTANT_CLASS_P (new_name)
7610 || TREE_CODE (new_name) == SSA_NAME);
7611 new_vec = build_vector_from_val (vectype, t);
7612 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7613
7614
7615 /* Create the following def-use cycle:
7616 loop prolog:
7617 vec_init = ...
7618 vec_step = ...
7619 loop:
7620 vec_iv = PHI <vec_init, vec_loop>
7621 ...
7622 STMT
7623 ...
7624 vec_loop = vec_iv + vec_step; */
7625
7626 /* Create the induction-phi that defines the induction-operand. */
7627 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7628 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7629 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7630 induc_def = PHI_RESULT (induction_phi);
7631
7632 /* Create the iv update inside the loop */
7633 vec_def = make_ssa_name (vec_dest);
7634 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7635 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7636 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7637
7638 /* Set the arguments of the phi node: */
7639 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7640 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7641 UNKNOWN_LOCATION);
7642
7643 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7644
7645 /* In case the vectorization factor (VF) is bigger than the number
7646 of elements that we can fit in a vectype (nunits), we have to generate
7647 more than one vector stmt - i.e., we need to "unroll" the
7648 vector stmt by a factor of VF/nunits. For more details see documentation
7649 in vectorizable_operation. */
7650
7651 if (ncopies > 1)
7652 {
7653 gimple_seq seq = NULL;
7654 stmt_vec_info prev_stmt_vinfo;
7655 /* FORNOW. This restriction should be relaxed. */
7656 gcc_assert (!nested_in_vect_loop);
7657
7658 /* Create the vector that holds the step of the induction. */
7659 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7660 {
7661 expr = build_int_cst (integer_type_node, nunits);
7662 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7663 }
7664 else
7665 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7666 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7667 expr, step_expr);
7668 if (seq)
7669 {
7670 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7671 gcc_assert (!new_bb);
7672 }
7673
7674 t = unshare_expr (new_name);
7675 gcc_assert (CONSTANT_CLASS_P (new_name)
7676 || TREE_CODE (new_name) == SSA_NAME);
7677 new_vec = build_vector_from_val (vectype, t);
7678 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7679
7680 vec_def = induc_def;
7681 prev_stmt_vinfo = induction_phi_info;
7682 for (i = 1; i < ncopies; i++)
7683 {
7684 /* vec_i = vec_prev + vec_step */
7685 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7686 vec_def, vec_step);
7687 vec_def = make_ssa_name (vec_dest, new_stmt);
7688 gimple_assign_set_lhs (new_stmt, vec_def);
7689
7690 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7691 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7692 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7693 prev_stmt_vinfo = new_stmt_info;
7694 }
7695 }
7696
7697 if (nested_in_vect_loop)
7698 {
7699 /* Find the loop-closed exit-phi of the induction, and record
7700 the final vector of induction results: */
7701 exit_phi = NULL;
7702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7703 {
7704 gimple *use_stmt = USE_STMT (use_p);
7705 if (is_gimple_debug (use_stmt))
7706 continue;
7707
7708 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7709 {
7710 exit_phi = use_stmt;
7711 break;
7712 }
7713 }
7714 if (exit_phi)
7715 {
7716 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7717 /* FORNOW. Currently not supporting the case that an inner-loop induction
7718 is not used in the outer-loop (i.e. only outside the outer-loop). */
7719 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7720 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7721
7722 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7723 if (dump_enabled_p ())
7724 dump_printf_loc (MSG_NOTE, vect_location,
7725 "vector of inductions after inner-loop:%G",
7726 new_stmt);
7727 }
7728 }
7729
7730
7731 if (dump_enabled_p ())
7732 dump_printf_loc (MSG_NOTE, vect_location,
7733 "transform induction: created def-use cycle: %G%G",
7734 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7735
7736 return true;
7737 }
7738
7739 /* Function vectorizable_live_operation.
7740
7741 STMT_INFO computes a value that is used outside the loop. Check if
7742 it can be supported. */
7743
7744 bool
7745 vectorizable_live_operation (stmt_vec_info stmt_info,
7746 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7747 slp_tree slp_node, int slp_index,
7748 stmt_vec_info *vec_stmt,
7749 stmt_vector_for_cost *)
7750 {
7751 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7752 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7753 imm_use_iterator imm_iter;
7754 tree lhs, lhs_type, bitsize, vec_bitsize;
7755 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7756 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7757 int ncopies;
7758 gimple *use_stmt;
7759 auto_vec<tree> vec_oprnds;
7760 int vec_entry = 0;
7761 poly_uint64 vec_index = 0;
7762
7763 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7764
7765 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7766 return false;
7767
7768 /* FORNOW. CHECKME. */
7769 if (nested_in_vect_loop_p (loop, stmt_info))
7770 return false;
7771
7772 /* If STMT is not relevant and it is a simple assignment and its inputs are
7773 invariant then it can remain in place, unvectorized. The original last
7774 scalar value that it computes will be used. */
7775 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7776 {
7777 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7778 if (dump_enabled_p ())
7779 dump_printf_loc (MSG_NOTE, vect_location,
7780 "statement is simple and uses invariant. Leaving in "
7781 "place.\n");
7782 return true;
7783 }
7784
7785 if (slp_node)
7786 ncopies = 1;
7787 else
7788 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7789
7790 if (slp_node)
7791 {
7792 gcc_assert (slp_index >= 0);
7793
7794 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7795 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7796
7797 /* Get the last occurrence of the scalar index from the concatenation of
7798 all the slp vectors. Calculate which slp vector it is and the index
7799 within. */
7800 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
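 /* Worked example (added commentary): with num_vec = 2 vectors of
    nunits = 4 lanes holding num_scalar = 3 scalar results, slp_index 1
    gives pos = 2*4 - 3 + 1 = 6, i.e. vec_entry = 1 and vec_index = 2
    after the division below.  */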
7801
7802 /* Calculate which vector contains the result, and which lane of
7803 that vector we need. */
7804 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7805 {
7806 if (dump_enabled_p ())
7807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7808 "Cannot determine which vector holds the"
7809 " final result.\n");
7810 return false;
7811 }
7812 }
7813
7814 if (!vec_stmt)
7815 {
7816 /* No transformation required. */
7817 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7818 {
7819 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7820 OPTIMIZE_FOR_SPEED))
7821 {
7822 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7824 "can't use a fully-masked loop because "
7825 "the target doesn't support extract last "
7826 "reduction.\n");
7827 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7828 }
7829 else if (slp_node)
7830 {
7831 if (dump_enabled_p ())
7832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7833 "can't use a fully-masked loop because an "
7834 "SLP statement is live after the loop.\n");
7835 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7836 }
7837 else if (ncopies > 1)
7838 {
7839 if (dump_enabled_p ())
7840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7841 "can't use a fully-masked loop because"
7842 " ncopies is greater than 1.\n");
7843 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7844 }
7845 else
7846 {
7847 gcc_assert (ncopies == 1 && !slp_node);
7848 vect_record_loop_mask (loop_vinfo,
7849 &LOOP_VINFO_MASKS (loop_vinfo),
7850 1, vectype);
7851 }
7852 }
7853 return true;
7854 }
7855
7856 /* Use the lhs of the original scalar statement. */
7857 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7858
7859 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7860 : gimple_get_lhs (stmt);
7861 lhs_type = TREE_TYPE (lhs);
7862
7863 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7864 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7865 : TYPE_SIZE (TREE_TYPE (vectype)));
7866 vec_bitsize = TYPE_SIZE (vectype);
7867
7868 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7869 tree vec_lhs, bitstart;
7870 if (slp_node)
7871 {
7872 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7873
7874 /* Get the correct slp vectorized stmt. */
7875 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7876 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7877 vec_lhs = gimple_phi_result (phi);
7878 else
7879 vec_lhs = gimple_get_lhs (vec_stmt);
7880
7881 /* Get entry to use. */
7882 bitstart = bitsize_int (vec_index);
7883 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7884 }
7885 else
7886 {
7887 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7888 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7889 gcc_checking_assert (ncopies == 1
7890 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7891
7892 /* For multiple copies, get the last copy. */
7893 for (int i = 1; i < ncopies; ++i)
7894 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7895
7896 /* Get the last lane in the vector. */
7897 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7898 }
7899
7900 gimple_seq stmts = NULL;
7901 tree new_tree;
7902 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7903 {
7904 /* Emit:
7905
7906 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7907
7908 where VEC_LHS is the vectorized live-out result and MASK is
7909 the loop mask for the final iteration. */
7910 gcc_assert (ncopies == 1 && !slp_node);
7911 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7912 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7913 1, vectype, 0);
7914 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7915 scalar_type, mask, vec_lhs);
7916
7917 /* Convert the extracted vector element to the required scalar type. */
7918 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7919 }
7920 else
7921 {
7922 tree bftype = TREE_TYPE (vectype);
7923 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7924 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7925 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7926 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7927 &stmts, true, NULL_TREE);
7928 }
7929
7930 if (stmts)
7931 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7932
7933 /* Replace each use of lhs with the newly computed result. If the use stmt
7934 is a single-argument PHI, just replace all uses of the PHI result. This is
7935 necessary because the lcssa PHI defining lhs may precede the new stmt. */
7936 use_operand_p use_p;
7937 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7938 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7939 && !is_gimple_debug (use_stmt))
7940 {
7941 if (gimple_code (use_stmt) == GIMPLE_PHI
7942 && gimple_phi_num_args (use_stmt) == 1)
7943 {
7944 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7945 }
7946 else
7947 {
7948 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7949 SET_USE (use_p, new_tree);
7950 }
7951 update_stmt (use_stmt);
7952 }
7953
7954 return true;
7955 }
7956
7957 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7958
7959 static void
7960 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7961 {
7962 ssa_op_iter op_iter;
7963 imm_use_iterator imm_iter;
7964 def_operand_p def_p;
7965 gimple *ustmt;
7966
7967 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7968 {
7969 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7970 {
7971 basic_block bb;
7972
7973 if (!is_gimple_debug (ustmt))
7974 continue;
7975
7976 bb = gimple_bb (ustmt);
7977
7978 if (!flow_bb_inside_loop_p (loop, bb))
7979 {
7980 if (gimple_debug_bind_p (ustmt))
7981 {
7982 if (dump_enabled_p ())
7983 dump_printf_loc (MSG_NOTE, vect_location,
7984 "killing debug use\n");
7985
7986 gimple_debug_bind_reset_value (ustmt);
7987 update_stmt (ustmt);
7988 }
7989 else
7990 gcc_unreachable ();
7991 }
7992 }
7993 }
7994 }
7995
7996 /* Given loop represented by LOOP_VINFO, return true if computation of
7997 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7998 otherwise. */
7999
8000 static bool
8001 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8002 {
8003 /* Constant case. */
8004 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8005 {
8006 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8007 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8008
8009 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8010 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8011 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8012 return true;
8013 }
8014
8015 widest_int max;
8016 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8017 /* Check the upper bound of loop niters. */
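 /* Illustrative example (added commentary): if the niters type is a 32-bit
    unsigned type and the latch is known to execute at most 2^32 - 2 times,
    then max < type_max below and NITERSM1 + 1 cannot wrap around to zero.  */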
8018 if (get_max_loop_iterations (loop, &max))
8019 {
8020 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8021 signop sgn = TYPE_SIGN (type);
8022 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8023 if (max < type_max)
8024 return true;
8025 }
8026 return false;
8027 }
8028
8029 /* Return a mask type with half the number of elements as TYPE. */
8030
8031 tree
8032 vect_halve_mask_nunits (tree type)
8033 {
8034 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8035 return build_truth_vector_type (nunits, current_vector_size);
8036 }
8037
8038 /* Return a mask type with twice as many elements as TYPE. */
8039
8040 tree
8041 vect_double_mask_nunits (tree type)
8042 {
8043 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8044 return build_truth_vector_type (nunits, current_vector_size);
8045 }
8046
8047 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8048 contain a sequence of NVECTORS masks that each control a vector of type
8049 VECTYPE. */
8050
8051 void
8052 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8053 unsigned int nvectors, tree vectype)
8054 {
8055 gcc_assert (nvectors != 0);
8056 if (masks->length () < nvectors)
8057 masks->safe_grow_cleared (nvectors);
8058 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8059 /* The number of scalars per iteration and the number of vectors are
8060 both compile-time constants. */
8061 unsigned int nscalars_per_iter
8062 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8063 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
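 /* Worked example (added commentary): for an rgroup that controls
    NVECTORS = 2 vectors of 8 elements in a loop with VF = 8, each scalar
    iteration produces 2 * 8 / 8 = 2 scalars, so nscalars_per_iter is 2.  */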
8064 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8065 {
8066 rgm->max_nscalars_per_iter = nscalars_per_iter;
8067 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8068 }
8069 }
8070
8071 /* Given a complete set of masks MASKS, extract mask number INDEX
8072 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8073 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8074
8075 See the comment above vec_loop_masks for more details about the mask
8076 arrangement. */
8077
8078 tree
8079 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8080 unsigned int nvectors, tree vectype, unsigned int index)
8081 {
8082 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8083 tree mask_type = rgm->mask_type;
8084
8085 /* Populate the rgroup's mask array, if this is the first time we've
8086 used it. */
8087 if (rgm->masks.is_empty ())
8088 {
8089 rgm->masks.safe_grow_cleared (nvectors);
8090 for (unsigned int i = 0; i < nvectors; ++i)
8091 {
8092 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8093 /* Provide a dummy definition until the real one is available. */
8094 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8095 rgm->masks[i] = mask;
8096 }
8097 }
8098
8099 tree mask = rgm->masks[index];
8100 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8101 TYPE_VECTOR_SUBPARTS (vectype)))
8102 {
8103 /* A loop mask for data type X can be reused for data type Y
8104 if X has N times more elements than Y and if Y's elements
8105 are N times bigger than X's. In this case each sequence
8106 of N elements in the loop mask will be all-zero or all-one.
8107 We can then view-convert the mask so that each sequence of
8108 N elements is replaced by a single element. */
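 /* Hypothetical example (added commentary): a mask created for 16
    8-bit elements can serve a vector of 8 16-bit elements, because
    each pair of mask elements is known to be all-zero or all-one and
    can be view-converted into a single wider mask element.  */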
8109 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8110 TYPE_VECTOR_SUBPARTS (vectype)));
8111 gimple_seq seq = NULL;
8112 mask_type = build_same_sized_truth_vector_type (vectype);
8113 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8114 if (seq)
8115 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8116 }
8117 return mask;
8118 }
8119
8120 /* Scale profiling counters by estimation for LOOP which is vectorized
8121 by factor VF. */
8122
8123 static void
8124 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8125 {
8126 edge preheader = loop_preheader_edge (loop);
8127 /* Reduce loop iterations by the vectorization factor. */
8128 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8129 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8130
8131 if (freq_h.nonzero_p ())
8132 {
8133 profile_probability p;
8134
8135 /* Avoid dropping loop body profile counter to 0 because of zero count
8136 in loop's preheader. */
8137 if (!(freq_e == profile_count::zero ()))
8138 freq_e = freq_e.force_nonzero ();
8139 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8140 scale_loop_frequencies (loop, p);
8141 }
8142
8143 edge exit_e = single_exit (loop);
8144 exit_e->probability = profile_probability::always ()
8145 .apply_scale (1, new_est_niter + 1);
8146
8147 edge exit_l = single_pred_edge (loop->latch);
8148 profile_probability prob = exit_l->probability;
8149 exit_l->probability = exit_e->probability.invert ();
8150 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8151 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8152 }
8153
8154 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8155 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8156 stmt_vec_info. */
8157
8158 static void
8159 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8160 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8161 {
8162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8163 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8164
8165 if (dump_enabled_p ())
8166 dump_printf_loc (MSG_NOTE, vect_location,
8167 "------>vectorizing statement: %G", stmt_info->stmt);
8168
8169 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8170 vect_loop_kill_debug_uses (loop, stmt_info);
8171
8172 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8173 && !STMT_VINFO_LIVE_P (stmt_info))
8174 return;
8175
8176 if (STMT_VINFO_VECTYPE (stmt_info))
8177 {
8178 poly_uint64 nunits
8179 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8180 if (!STMT_SLP_TYPE (stmt_info)
8181 && maybe_ne (nunits, vf)
8182 && dump_enabled_p ())
8183 /* For SLP, VF is set according to the unrolling factor, and not
8184 to the vector size, hence for SLP this print is not valid. */
8185 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8186 }
8187
8188 /* Pure SLP statements have already been vectorized. We still need
8189 to apply loop vectorization to hybrid SLP statements. */
8190 if (PURE_SLP_STMT (stmt_info))
8191 return;
8192
8193 if (dump_enabled_p ())
8194 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8195
8196 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8197 *seen_store = stmt_info;
8198 }
8199
8200 /* Function vect_transform_loop.
8201
8202 The analysis phase has determined that the loop is vectorizable.
8203 Vectorize the loop - create vectorized stmts to replace the scalar
8204 stmts in the loop, and update the loop exit condition.
8205 Returns the scalar epilogue loop, if any. */
8206
8207 struct loop *
8208 vect_transform_loop (loop_vec_info loop_vinfo)
8209 {
8210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8211 struct loop *epilogue = NULL;
8212 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8213 int nbbs = loop->num_nodes;
8214 int i;
8215 tree niters_vector = NULL_TREE;
8216 tree step_vector = NULL_TREE;
8217 tree niters_vector_mult_vf = NULL_TREE;
8218 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8219 unsigned int lowest_vf = constant_lower_bound (vf);
8220 gimple *stmt;
8221 bool check_profitability = false;
8222 unsigned int th;
8223
8224 DUMP_VECT_SCOPE ("vec_transform_loop");
8225
8226 loop_vinfo->shared->check_datarefs ();
8227
8228 /* Use the more conservative vectorization threshold. If the number
8229 of iterations is constant, assume the cost check has been performed
8230 by our caller. If the threshold makes all loops profitable that
8231 run at least the (estimated) vectorization factor number of times,
8232 checking is pointless, too. */
8233 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8234 if (th >= vect_vf_for_cost (loop_vinfo)
8235 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8236 {
8237 if (dump_enabled_p ())
8238 dump_printf_loc (MSG_NOTE, vect_location,
8239 "Profitability threshold is %d loop iterations.\n",
8240 th);
8241 check_profitability = true;
8242 }
8243
8244 /* Make sure there exists a single-predecessor exit bb. Do this before
8245 versioning. */
8246 edge e = single_exit (loop);
8247 if (! single_pred_p (e->dest))
8248 {
8249 split_loop_exit_edge (e);
8250 if (dump_enabled_p ())
8251 dump_printf (MSG_NOTE, "split exit edge\n");
8252 }
8253
8254 /* Version the loop first, if required, so the profitability check
8255 comes first. */
8256
8257 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8258 {
8259 poly_uint64 versioning_threshold
8260 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8261 if (check_profitability
8262 && ordered_p (poly_uint64 (th), versioning_threshold))
8263 {
8264 versioning_threshold = ordered_max (poly_uint64 (th),
8265 versioning_threshold);
8266 check_profitability = false;
8267 }
8268 vect_loop_versioning (loop_vinfo, th, check_profitability,
8269 versioning_threshold);
8270 check_profitability = false;
8271 }
8272
8273 /* Make sure there exists a single-predecessor exit bb also on the
8274 scalar loop copy. Do this after versioning but before peeling,
8275 so the CFG structure is fine for both the scalar and the if-converted
8276 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8277 loop-closed PHI nodes on the exit. */
8278 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8279 {
8280 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8281 if (! single_pred_p (e->dest))
8282 {
8283 split_loop_exit_edge (e);
8284 if (dump_enabled_p ())
8285 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8286 }
8287 }
8288
8289 tree niters = vect_build_loop_niters (loop_vinfo);
8290 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8291 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8292 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8293 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8294 &step_vector, &niters_vector_mult_vf, th,
8295 check_profitability, niters_no_overflow);
8296
8297 if (niters_vector == NULL_TREE)
8298 {
8299 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8300 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8301 && known_eq (lowest_vf, vf))
8302 {
8303 niters_vector
8304 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8305 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8306 step_vector = build_one_cst (TREE_TYPE (niters));
8307 }
8308 else
8309 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8310 &step_vector, niters_no_overflow);
8311 }
8312
8313 /* 1) Make sure the loop header has exactly two entries
8314 2) Make sure we have a preheader basic block. */
8315
8316 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8317
8318 split_edge (loop_preheader_edge (loop));
8319
8320 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8321 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8322 /* This will deal with any possible peeling. */
8323 vect_prepare_for_masked_peels (loop_vinfo);
8324
8325 /* Schedule the SLP instances first, then handle loop vectorization
8326 below. */
8327 if (!loop_vinfo->slp_instances.is_empty ())
8328 {
8329 DUMP_VECT_SCOPE ("scheduling SLP instances");
8330 vect_schedule_slp (loop_vinfo);
8331 }
8332
8333 /* FORNOW: the vectorizer supports only loops whose body consists
8334 of one basic block (header + empty latch). When the vectorizer
8335 supports more involved loop forms, the order in which the BBs are
8336 traversed needs to be reconsidered. */
8337
8338 for (i = 0; i < nbbs; i++)
8339 {
8340 basic_block bb = bbs[i];
8341 stmt_vec_info stmt_info;
8342
8343 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8344 gsi_next (&si))
8345 {
8346 gphi *phi = si.phi ();
8347 if (dump_enabled_p ())
8348 dump_printf_loc (MSG_NOTE, vect_location,
8349 "------>vectorizing phi: %G", phi);
8350 stmt_info = loop_vinfo->lookup_stmt (phi);
8351 if (!stmt_info)
8352 continue;
8353
8354 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8355 vect_loop_kill_debug_uses (loop, stmt_info);
8356
8357 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8358 && !STMT_VINFO_LIVE_P (stmt_info))
8359 continue;
8360
8361 if (STMT_VINFO_VECTYPE (stmt_info)
8362 && (maybe_ne
8363 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8364 && dump_enabled_p ())
8365 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8366
8367 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8368 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8369 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8370 && ! PURE_SLP_STMT (stmt_info))
8371 {
8372 if (dump_enabled_p ())
8373 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8374 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8375 }
8376 }
8377
8378 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8379 !gsi_end_p (si);)
8380 {
8381 stmt = gsi_stmt (si);
8382 /* During vectorization remove existing clobber stmts. */
8383 if (gimple_clobber_p (stmt))
8384 {
8385 unlink_stmt_vdef (stmt);
8386 gsi_remove (&si, true);
8387 release_defs (stmt);
8388 }
8389 else
8390 {
8391 stmt_info = loop_vinfo->lookup_stmt (stmt);
8392
8393 /* vector stmts created in the outer-loop during vectorization of
8394 stmts in an inner-loop may not have a stmt_info, and do not
8395 need to be vectorized. */
8396 stmt_vec_info seen_store = NULL;
8397 if (stmt_info)
8398 {
8399 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8400 {
8401 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8402 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8403 !gsi_end_p (subsi); gsi_next (&subsi))
8404 {
8405 stmt_vec_info pat_stmt_info
8406 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8407 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8408 &si, &seen_store);
8409 }
8410 stmt_vec_info pat_stmt_info
8411 = STMT_VINFO_RELATED_STMT (stmt_info);
8412 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8413 &seen_store);
8414 }
8415 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8416 &seen_store);
8417 }
8418 gsi_next (&si);
8419 if (seen_store)
8420 {
8421 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8422 /* Interleaving. The vectorization of the whole
8423 interleaving chain was completed - free all
8424 the stores in the chain. */
8425 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8426 else
8427 /* Free the attached stmt_vec_info and remove the stmt. */
8428 loop_vinfo->remove_stmt (stmt_info);
8429 }
8430 }
8431 }
8432
8433 /* Stub out scalar statements that must not survive vectorization.
8434 Doing this here helps with grouped statements, or statements that
8435 are involved in patterns. */
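/* As an illustrative example, a left-over scalar statement such as
     _1 = .MASK_LOAD (ptr_2, 0B, cond_3);
   whose result is not a vector is replaced below by the trivial
   assignment _1 = 0, since the scalar masked load must not survive
   vectorization. */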
8436 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8437 !gsi_end_p (gsi); gsi_next (&gsi))
8438 {
8439 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8440 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8441 {
8442 tree lhs = gimple_get_lhs (call);
8443 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8444 {
8445 tree zero = build_zero_cst (TREE_TYPE (lhs));
8446 gimple *new_stmt = gimple_build_assign (lhs, zero);
8447 gsi_replace (&gsi, new_stmt, true);
8448 }
8449 }
8450 }
8451 } /* BBs in loop */
8452
8453 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8454 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8455 if (integer_onep (step_vector))
8456 niters_no_overflow = true;
8457 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8458 niters_vector_mult_vf, !niters_no_overflow);
8459
8460 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8461 scale_profile_for_vect_loop (loop, assumed_vf);
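/* (Roughly speaking: with an assumed VF of 4 the vector loop body is
   expected to execute about a quarter as often as the scalar body did,
   and the profile is scaled accordingly.) */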
8462
8463 /* True if the final iteration might not handle a full vector's
8464 worth of scalar iterations. */
8465 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8466 /* The minimum number of iterations performed by the epilogue. This
8467 is 1 when peeling for gaps because we always need a final scalar
8468 iteration. */
8469 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8470 /* +1 to convert latch counts to loop iteration counts,
8471 -min_epilogue_iters to remove iterations that cannot be performed
8472 by the vector code. */
8473 int bias_for_lowest = 1 - min_epilogue_iters;
8474 int bias_for_assumed = bias_for_lowest;
8475 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8476 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8477 {
8478 /* When the amount of peeling is known at compile time, the first
8479 iteration will have exactly alignment_npeels active elements.
8480 In the worst case it will have at least one. */
8481 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8482 bias_for_lowest += lowest_vf - min_first_active;
8483 bias_for_assumed += assumed_vf - min_first_active;
8484 }
8485 /* In these calculations the "- 1" converts loop iteration counts
8486 back to latch counts. */
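/* Worked example (hypothetical values): with LOWEST_VF = 4, no peeling
   for gaps and no full masking, BIAS_FOR_LOWEST is 1, so an upper bound
   of 11 latch iterations (12 scalar iterations) becomes
   (11 + 1) / 4 - 1 = 2 latch iterations of the vector loop. */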
8487 if (loop->any_upper_bound)
8488 loop->nb_iterations_upper_bound
8489 = (final_iter_may_be_partial
8490 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8491 lowest_vf) - 1
8492 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8493 lowest_vf) - 1);
8494 if (loop->any_likely_upper_bound)
8495 loop->nb_iterations_likely_upper_bound
8496 = (final_iter_may_be_partial
8497 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8498 + bias_for_lowest, lowest_vf) - 1
8499 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8500 + bias_for_lowest, lowest_vf) - 1);
8501 if (loop->any_estimate)
8502 loop->nb_iterations_estimate
8503 = (final_iter_may_be_partial
8504 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8505 assumed_vf) - 1
8506 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8507 assumed_vf) - 1);
8508
8509 if (dump_enabled_p ())
8510 {
8511 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8512 {
8513 dump_printf_loc (MSG_NOTE, vect_location,
8514 "LOOP VECTORIZED\n");
8515 if (loop->inner)
8516 dump_printf_loc (MSG_NOTE, vect_location,
8517 "OUTER LOOP VECTORIZED\n");
8518 dump_printf (MSG_NOTE, "\n");
8519 }
8520 else
8521 {
8522 dump_printf_loc (MSG_NOTE, vect_location,
8523 "LOOP EPILOGUE VECTORIZED (VS=");
8524 dump_dec (MSG_NOTE, current_vector_size);
8525 dump_printf (MSG_NOTE, ")\n");
8526 }
8527 }
8528
8529 /* Free SLP instances here because otherwise stmt reference counting
8530 won't work. */
8531 slp_instance instance;
8532 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8533 vect_free_slp_instance (instance, true);
8534 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8535 /* Clear the safelen field since its value is invalid after vectorization:
8536 the vectorized loop can have loop-carried dependences. */
8537 loop->safelen = 0;
8538
8539 /* Don't vectorize the epilogue of an epilogue loop. */
8540 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8541 epilogue = NULL;
8542
8543 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8544 epilogue = NULL;
8545
8546 if (epilogue)
8547 {
8548 auto_vector_sizes vector_sizes;
8549 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8550 unsigned int next_size = 0;
8551
8552 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8553 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8554 && known_eq (vf, lowest_vf))
8555 {
8556 unsigned int eiters
8557 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8558 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8559 eiters = eiters % lowest_vf;
8560 epilogue->nb_iterations_upper_bound = eiters - 1;
8561
8562 unsigned int ratio;
8563 while (next_size < vector_sizes.length ()
8564 && !(constant_multiple_p (current_vector_size,
8565 vector_sizes[next_size], &ratio)
8566 && eiters >= lowest_vf / ratio))
8567 next_size += 1;
8568 }
8569 else
8570 while (next_size < vector_sizes.length ()
8571 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8572 next_size += 1;
8573
8574 if (next_size == vector_sizes.length ())
8575 epilogue = NULL;
8576 }
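/* Illustrative example (assuming the target advertises vector sizes
   { 64, 32, 16 } bytes, as some x86 configurations do): if the main loop
   used 32-byte vectors with LOWEST_VF = 8 and 6 scalar iterations remain,
   then 32 / 16 = 2 gives 8 / 2 = 4 <= 6, so a 16-byte epilogue loop is
   tried next; if no suitable smaller size exists, EPILOGUE is dropped. */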
8577
8578 if (epilogue)
8579 {
8580 epilogue->force_vectorize = loop->force_vectorize;
8581 epilogue->safelen = loop->safelen;
8582 epilogue->dont_vectorize = false;
8583
8584 /* We may need to if-convert the epilogue to vectorize it. */
8585 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8586 tree_if_conversion (epilogue);
8587 }
8588
8589 return epilogue;
8590 }
8591
8592 /* The code below performs a simple optimization - it reverts
8593 if-conversion for masked stores: if the mask of a store is zero, neither
8594 the store nor (where possible) the producers of the stored values are executed.
8595 For example,
8596 for (i=0; i<n; i++)
8597 if (c[i])
8598 {
8599 p1[i] += 1;
8600 p2[i] = p3[i] + 2;
8601 }
8602 this transformation will produce the following semi-hammock:
8603
8604 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8605 {
8606 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8607 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8608 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8609 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8610 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8611 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8612 }
8613 */
8614
8615 void
8616 optimize_mask_stores (struct loop *loop)
8617 {
8618 basic_block *bbs = get_loop_body (loop);
8619 unsigned nbbs = loop->num_nodes;
8620 unsigned i;
8621 basic_block bb;
8622 struct loop *bb_loop;
8623 gimple_stmt_iterator gsi;
8624 gimple *stmt;
8625 auto_vec<gimple *> worklist;
8626
8627 vect_location = find_loop_location (loop);
8628 /* Pick up all masked stores in the loop, if any. */
8629 for (i = 0; i < nbbs; i++)
8630 {
8631 bb = bbs[i];
8632 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8633 gsi_next (&gsi))
8634 {
8635 stmt = gsi_stmt (gsi);
8636 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8637 worklist.safe_push (stmt);
8638 }
8639 }
8640
8641 free (bbs);
8642 if (worklist.is_empty ())
8643 return;
8644
8645 /* Loop has masked stores. */
8646 while (!worklist.is_empty ())
8647 {
8648 gimple *last, *last_store;
8649 edge e, efalse;
8650 tree mask;
8651 basic_block store_bb, join_bb;
8652 gimple_stmt_iterator gsi_to;
8653 tree vdef, new_vdef;
8654 gphi *phi;
8655 tree vectype;
8656 tree zero;
8657
8658 last = worklist.pop ();
8659 mask = gimple_call_arg (last, 2);
8660 bb = gimple_bb (last);
8661 /* Create STORE_BB and the if-then structure in the CFG; STORE_BB
8662 belongs to the same loop as BB. That loop can differ from LOOP when
8663 a two-level loop nest is vectorized and the masked store belongs to
8664 the inner loop. */
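/* Sketch of the CFG built below (illustrative):

       BB:  ...  if (mask == { 0, ... })  --true-->  JOIN_BB
         \                                               ^
          `--false-->  STORE_BB  ------------------------'

   STORE_BB receives the masked stores (and, where possible, the
   producers of the stored values) sunk out of BB. */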
8665 e = split_block (bb, last);
8666 bb_loop = bb->loop_father;
8667 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8668 join_bb = e->dest;
8669 store_bb = create_empty_bb (bb);
8670 add_bb_to_loop (store_bb, bb_loop);
8671 e->flags = EDGE_TRUE_VALUE;
8672 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8673 /* Put STORE_BB on the unlikely (cold) path. */
8674 efalse->probability = profile_probability::unlikely ();
8675 store_bb->count = efalse->count ();
8676 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8677 if (dom_info_available_p (CDI_DOMINATORS))
8678 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8679 if (dump_enabled_p ())
8680 dump_printf_loc (MSG_NOTE, vect_location,
8681 "Create new block %d to sink mask stores.",
8682 store_bb->index);
8683 /* Create vector comparison with boolean result. */
8684 vectype = TREE_TYPE (mask);
8685 zero = build_zero_cst (vectype);
8686 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8687 gsi = gsi_last_bb (bb);
8688 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8689 /* Create new PHI node for vdef of the last masked store:
8690 .MEM_2 = VDEF <.MEM_1>
8691 will be converted to
8692 .MEM.3 = VDEF <.MEM_1>
8693 and new PHI node will be created in join bb
8694 .MEM_2 = PHI <.MEM_1, .MEM_3>
8695 */
8696 vdef = gimple_vdef (last);
8697 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8698 gimple_set_vdef (last, new_vdef);
8699 phi = create_phi_node (vdef, join_bb);
8700 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8701
8702 /* Put all masked stores with the same mask to STORE_BB if possible. */
8703 while (true)
8704 {
8705 gimple_stmt_iterator gsi_from;
8706 gimple *stmt1 = NULL;
8707
8708 /* Move masked store to STORE_BB. */
8709 last_store = last;
8710 gsi = gsi_for_stmt (last);
8711 gsi_from = gsi;
8712 /* Shift GSI to the previous stmt for further traversal. */
8713 gsi_prev (&gsi);
8714 gsi_to = gsi_start_bb (store_bb);
8715 gsi_move_before (&gsi_from, &gsi_to);
8716 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8717 gsi_to = gsi_start_bb (store_bb);
8718 if (dump_enabled_p ())
8719 dump_printf_loc (MSG_NOTE, vect_location,
8720 "Move stmt to created bb\n%G", last);
8721 /* Move all stored value producers if possible. */
8722 while (!gsi_end_p (gsi))
8723 {
8724 tree lhs;
8725 imm_use_iterator imm_iter;
8726 use_operand_p use_p;
8727 bool res;
8728
8729 /* Skip debug statements. */
8730 if (is_gimple_debug (gsi_stmt (gsi)))
8731 {
8732 gsi_prev (&gsi);
8733 continue;
8734 }
8735 stmt1 = gsi_stmt (gsi);
8736 /* Do not consider statements writing to memory or having a
8737 volatile operand. */
8738 if (gimple_vdef (stmt1)
8739 || gimple_has_volatile_ops (stmt1))
8740 break;
8741 gsi_from = gsi;
8742 gsi_prev (&gsi);
8743 lhs = gimple_get_lhs (stmt1);
8744 if (!lhs)
8745 break;
8746
8747 /* LHS of vectorized stmt must be SSA_NAME. */
8748 if (TREE_CODE (lhs) != SSA_NAME)
8749 break;
8750
8751 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8752 {
8753 /* Remove dead scalar statement. */
8754 if (has_zero_uses (lhs))
8755 {
8756 gsi_remove (&gsi_from, true);
8757 continue;
8758 }
8759 }
8760
8761 /* Check that LHS does not have uses outside of STORE_BB. */
8762 res = true;
8763 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8764 {
8765 gimple *use_stmt;
8766 use_stmt = USE_STMT (use_p);
8767 if (is_gimple_debug (use_stmt))
8768 continue;
8769 if (gimple_bb (use_stmt) != store_bb)
8770 {
8771 res = false;
8772 break;
8773 }
8774 }
8775 if (!res)
8776 break;
8777
8778 if (gimple_vuse (stmt1)
8779 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8780 break;
8781
8782 /* Can move STMT1 to STORE_BB. */
8783 if (dump_enabled_p ())
8784 dump_printf_loc (MSG_NOTE, vect_location,
8785 "Move stmt to created bb\n%G", stmt1);
8786 gsi_move_before (&gsi_from, &gsi_to);
8787 /* Shift GSI_TO for further insertion. */
8788 gsi_prev (&gsi_to);
8789 }
8790 /* Put other masked stores with the same mask to STORE_BB. */
8791 if (worklist.is_empty ()
8792 || gimple_call_arg (worklist.last (), 2) != mask
8793 || worklist.last () != stmt1)
8794 break;
8795 last = worklist.pop ();
8796 }
8797 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8798 }
8799 }