[03/11] Remove vect_transform_stmt grouped_store argument
[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
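
/* For illustration only: to decide whether the V8HI addition in the example
   above is supported, the vectorizer in effect performs a query like

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         the target has no V8HI add instruction, so the stmt cannot
         be vectorized.

   This is just a sketch of the optab query pattern described above; the
   real checks are performed during stmt analysis (e.g. in
   tree-vect-stmts.c).  */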
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 {
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 }
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
242 }
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
246 }
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 }
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
256 }
257
258 return true;
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
285
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
315 }
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 {
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
331 }
332
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 }
345 return false;
346 }
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
361 }
362
363 vect_update_max_nunits (&vectorization_factor, vectype);
364 }
365 }
366
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
369 {
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
374 }
375 }
376
377 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
378 if (dump_enabled_p ())
379 {
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
383 }
384
385 if (known_le (vectorization_factor, 1U))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
391 }
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393
394 for (i = 0; i < mask_producers.length (); i++)
395 {
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
401 }
402
403 return true;
404 }
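
/* A worked example of the computation above: for the example loop at the
   head of this file on a target with 16-byte vectors, the only scalar type
   operated on is "short" (2 bytes), so get_vectype_for_scalar_type returns
   a V8HI vector type, TYPE_VECTOR_SUBPARTS is 8, and the vectorization
   factor becomes 8 - each vector iteration covers 8 scalar iterations,
   giving the N/8 loop shown above.  The 16-byte/V8HI numbers are only one
   possible target configuration.  */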
405
406
407 /* Function vect_is_simple_iv_evolution.
408
409 FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
411
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
415 {
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
420
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
425
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
430
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
441 }
442
443 *init = init_expr;
444 *step = step_expr;
445
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
455 {
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
460 }
461
462 return true;
463 }
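
/* For example, the counter of "for (i = 0; i < n; i++)" in loop 1 has the
   access function {0, +, 1}_1, so *INIT is 0 and *STEP is 1.  By contrast,
   an accumulator updated as "p += i" has the access function
   {p_0, +, {0, +, 1}_1}_1: its evolution part is itself a chrec (a degree-2
   polynomial), so it is rejected above.  The chrec notation here is the one
   used by the scalar evolution analyzer (see tree-scalar-evolution.c).  */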
464
465 /* Function vect_analyze_scalar_cycles_1.
466
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
471
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 {
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<stmt_vec_info, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
480
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified; therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 {
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
492
493 if (dump_enabled_p ())
494 {
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
497 }
498
499 /* Skip virtual phis. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
503
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
509 {
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
522 }
523
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
528 {
529 worklist.safe_push (stmt_vinfo);
530 continue;
531 }
532
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
540 }
541
542
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
545 {
546 stmt_vec_info stmt_vinfo = worklist.pop ();
547 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
548 tree def = PHI_RESULT (phi);
549
550 if (dump_enabled_p ())
551 {
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
554 }
555
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
558
559 stmt_vec_info reduc_stmt_info
560 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
561 &double_reduc, false);
562 if (reduc_stmt_info)
563 {
564 if (double_reduc)
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
572 = vect_double_reduction_def;
573 }
574 else
575 {
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 {
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
581
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
584 }
585 else
586 {
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
590
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
593 /* Store the reduction cycles for possible vectorization in
594 loop-aware SLP if it was not detected as a reduction
595 chain. */
596 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
597 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
598 (reduc_stmt_info);
599 }
600 }
601 }
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
606 }
607 }
608
609
610 /* Function vect_analyze_scalar_cycles.
611
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also for its
616 inner-loop, if it exists.
617 Examples for scalar cycles:
618
619 Example1: reduction:
620
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
624
625 Example2: induction:
626
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
630
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
633 {
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
635
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
637
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
646
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
649 }
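
/* As an illustration of the classification done above when an outer loop
   is being considered for vectorization, in

       for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
           sum += a[i][j];

   the outer-loop header phi for "sum" is detected as a double reduction
   (the inner loop reduces into a value that the outer loop accumulates),
   while the phi for "i" is a plain induction.  This is only a sketch of
   the common case; the precise classification is done by
   vect_force_simple_reduction.  */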
650
651 /* Transfer group and reduction information from STMT_INFO to its
652 pattern stmt. */
653
654 static void
655 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
656 {
657 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
658 stmt_vec_info stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
660 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
661 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
662 do
663 {
664 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
665 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
666 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
667 if (stmt_info)
668 REDUC_GROUP_NEXT_ELEMENT (stmtp)
669 = STMT_VINFO_RELATED_STMT (stmt_info);
670 }
671 while (stmt_info);
672 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
673 }
674
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
676
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
679 {
680 stmt_vec_info first;
681 unsigned i;
682
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
687 while (next)
688 {
689 if (! STMT_VINFO_IN_PATTERN_P (next))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (next);
692 }
693 /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
696 {
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (first);
700 }
701 }
702 }
703
704 /* Function vect_get_loop_niters.
705
706 Determine the number of iterations the loop executes and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
710
711 Return the loop exit condition. */
712
713
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
717 {
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
722
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
727
728 if (!exit)
729 return cond;
730
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
737
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
741
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
744
745 if (may_be_zero)
746 {
747 if (COMPARISON_CLASS_P (may_be_zero))
748 {
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
761
762 may_be_zero = NULL_TREE;
763 }
764 else if (integer_nonzerop (may_be_zero))
765 {
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
769 }
770 else
771 return cond;
772 }
773
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
776
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
785
786 return cond;
787 }
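
/* A small worked example of the distinction above: for a loop whose body
   executes n times (n > 0) in the do-while form we require, the latch
   executes n - 1 times, so the niter analysis computes n - 1;
   NUMBER_OF_ITERATIONSM1 is therefore n - 1 and NUMBER_OF_ITERATIONS, the
   number of header executions, is n.  */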
788
789 /* Function bb_in_loop_p
790
791 Used as predicate for dfs order traversal of the loop bbs. */
792
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
795 {
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
800 }
801
802
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
805
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
836 {
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
840 {
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
843
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
845 {
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 add_stmt (phi);
849 }
850
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
852 {
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 add_stmt (stmt);
856 }
857 }
858 free (body);
859
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
864
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
868 }
869
870 /* Free all levels of MASKS. */
871
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
874 {
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
880 }
881
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
884
885 _loop_vec_info::~_loop_vec_info ()
886 {
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
890
891 nbbs = loop->num_nodes;
892 for (j = 0; j < nbbs; j++)
893 {
894 basic_block bb = bbs[j];
895 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
896 {
897 gimple *stmt = gsi_stmt (si);
898
899 /* We may have broken canonical form by moving a constant
900 into RHS1 of a commutative op. Fix such occurrences. */
901 if (operands_swapped && is_gimple_assign (stmt))
902 {
903 enum tree_code code = gimple_assign_rhs_code (stmt);
904
905 if ((code == PLUS_EXPR
906 || code == POINTER_PLUS_EXPR
907 || code == MULT_EXPR)
908 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
909 swap_ssa_operands (stmt,
910 gimple_assign_rhs1_ptr (stmt),
911 gimple_assign_rhs2_ptr (stmt));
912 else if (code == COND_EXPR
913 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
914 {
915 tree cond_expr = gimple_assign_rhs1 (stmt);
916 enum tree_code cond_code = TREE_CODE (cond_expr);
917
918 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
919 {
920 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
921 0));
922 cond_code = invert_tree_comparison (cond_code,
923 honor_nans);
924 if (cond_code != ERROR_MARK)
925 {
926 TREE_SET_CODE (cond_expr, cond_code);
927 swap_ssa_operands (stmt,
928 gimple_assign_rhs2_ptr (stmt),
929 gimple_assign_rhs3_ptr (stmt));
930 }
931 }
932 }
933 }
934 gsi_next (&si);
935 }
936 }
937
938 free (bbs);
939
940 release_vec_loop_masks (&masks);
941 delete ivexpr_map;
942
943 loop->aux = NULL;
944 }
945
946 /* Return an invariant or register for EXPR and emit necessary
947 computations in the LOOP_VINFO loop preheader. */
948
949 tree
950 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 {
952 if (is_gimple_reg (expr)
953 || is_gimple_min_invariant (expr))
954 return expr;
955
956 if (! loop_vinfo->ivexpr_map)
957 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
958 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
959 if (! cached)
960 {
961 gimple_seq stmts = NULL;
962 cached = force_gimple_operand (unshare_expr (expr),
963 &stmts, true, NULL_TREE);
964 if (stmts)
965 {
966 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
967 gsi_insert_seq_on_edge_immediate (e, stmts);
968 }
969 }
970 return cached;
971 }
972
973 /* Return true if we can use CMP_TYPE as the comparison type to produce
974 all masks required to mask LOOP_VINFO. */
975
976 static bool
977 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 {
979 rgroup_masks *rgm;
980 unsigned int i;
981 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
982 if (rgm->mask_type != NULL_TREE
983 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
984 cmp_type, rgm->mask_type,
985 OPTIMIZE_FOR_SPEED))
986 return false;
987 return true;
988 }
989
990 /* Calculate the maximum number of scalars per iteration for every
991 rgroup in LOOP_VINFO. */
992
993 static unsigned int
994 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 {
996 unsigned int res = 1;
997 unsigned int i;
998 rgroup_masks *rgm;
999 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1000 res = MAX (res, rgm->max_nscalars_per_iter);
1001 return res;
1002 }
1003
1004 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1005 whether we can actually generate the masks required. Return true if so,
1006 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1007
1008 static bool
1009 vect_verify_full_masking (loop_vec_info loop_vinfo)
1010 {
1011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1012 unsigned int min_ni_width;
1013
1014 /* Use a normal loop if there are no statements that need masking.
1015 This only happens in rare degenerate cases: it means that the loop
1016 has no loads, no stores, and no live-out values. */
1017 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1018 return false;
1019
1020 /* Get the maximum number of iterations that is representable
1021 in the counter type. */
1022 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1023 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1024
1025 /* Get a more refined estimate for the number of iterations. */
1026 widest_int max_back_edges;
1027 if (max_loop_iterations (loop, &max_back_edges))
1028 max_ni = wi::smin (max_ni, max_back_edges + 1);
1029
1030 /* Account for rgroup masks, in which each bit is replicated N times. */
1031 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1032
1033 /* Work out how many bits we need to represent the limit. */
1034 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1035
1036 /* Find a scalar mode for which WHILE_ULT is supported. */
1037 opt_scalar_int_mode cmp_mode_iter;
1038 tree cmp_type = NULL_TREE;
1039 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1040 {
1041 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1042 if (cmp_bits >= min_ni_width
1043 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1044 {
1045 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1046 if (this_type
1047 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1048 {
1049 /* Although we could stop as soon as we find a valid mode,
1050 it's often better to continue until we hit Pmode, since the
1051 operands to the WHILE are more likely to be reusable in
1052 address calculations. */
1053 cmp_type = this_type;
1054 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1055 break;
1056 }
1057 }
1058 }
1059
1060 if (!cmp_type)
1061 return false;
1062
1063 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1064 return true;
1065 }
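
/* A worked example of the width computation above, with made-up numbers:
   if the loop is known to execute at most 1000 iterations and the widest
   rgroup needs 2 scalars per iteration, max_ni is 2000 and min_ni_width is
   11 bits (2000 < 2^11).  The mode walk then starts at the narrowest
   integer mode of at least 11 bits (a 16-bit mode on typical targets),
   remembers the widest mode seen so far for which WHILE_ULT can produce
   every required mask type, and stops once that candidate is at least as
   wide as Pmode.  */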
1066
1067 /* Calculate the cost of one scalar iteration of the loop. */
1068 static void
1069 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1070 {
1071 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1072 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1073 int nbbs = loop->num_nodes, factor;
1074 int innerloop_iters, i;
1075
1076 /* Gather costs for statements in the scalar loop. */
1077
1078 /* FORNOW. */
1079 innerloop_iters = 1;
1080 if (loop->inner)
1081 innerloop_iters = 50; /* FIXME */
1082
1083 for (i = 0; i < nbbs; i++)
1084 {
1085 gimple_stmt_iterator si;
1086 basic_block bb = bbs[i];
1087
1088 if (bb->loop_father == loop->inner)
1089 factor = innerloop_iters;
1090 else
1091 factor = 1;
1092
1093 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1094 {
1095 gimple *stmt = gsi_stmt (si);
1096 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1097
1098 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1099 continue;
1100
1101 /* Skip stmts that are not vectorized inside the loop. */
1102 if (stmt_info
1103 && !STMT_VINFO_RELEVANT_P (stmt_info)
1104 && (!STMT_VINFO_LIVE_P (stmt_info)
1105 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1106 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1107 continue;
1108
1109 vect_cost_for_stmt kind;
1110 if (STMT_VINFO_DATA_REF (stmt_info))
1111 {
1112 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1113 kind = scalar_load;
1114 else
1115 kind = scalar_store;
1116 }
1117 else
1118 kind = scalar_stmt;
1119
1120 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1121 factor, kind, stmt_info, 0, vect_prologue);
1122 }
1123 }
1124
1125 /* Now accumulate cost. */
1126 void *target_cost_data = init_cost (loop);
1127 stmt_info_for_cost *si;
1128 int j;
1129 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1130 j, si)
1131 (void) add_stmt_cost (target_cost_data, si->count,
1132 si->kind, si->stmt_info, si->misalign,
1133 vect_body);
1134 unsigned dummy, body_cost = 0;
1135 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1136 destroy_cost_data (target_cost_data);
1137 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1138 }
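
/* For illustration: when an outer loop is being costed, a scalar load in
   its inner loop is recorded above with a count of 50 (the FORNOW/FIXME
   weighting), so if the target charges 1 unit per scalar_load, that single
   stmt alone contributes 50 units to the accumulated single scalar
   iteration cost.  The per-stmt charge of 1 is only an assumed example;
   the real value comes from the target cost model via add_stmt_cost.  */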
1139
1140
1141 /* Function vect_analyze_loop_form_1.
1142
1143 Verify that certain CFG restrictions hold, including:
1144 - the loop has a pre-header
1145 - the loop has a single entry and exit
1146 - the loop exit condition is simple enough
1147 - the number of iterations can be analyzed, i.e., a countable loop. The
1148 niter could be analyzed under some assumptions. */
1149
1150 bool
1151 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1152 tree *assumptions, tree *number_of_iterationsm1,
1153 tree *number_of_iterations, gcond **inner_loop_cond)
1154 {
1155 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1156
1157 /* Different restrictions apply when we are considering an inner-most loop,
1158 vs. an outer (nested) loop.
1159 (FORNOW. May want to relax some of these restrictions in the future). */
1160
1161 if (!loop->inner)
1162 {
1163 /* Inner-most loop. We currently require that the number of BBs is
1164 exactly 2 (the header and latch). Vectorizable inner-most loops
1165 look like this:
1166
1167 (pre-header)
1168 |
1169 header <--------+
1170 | | |
1171 | +--> latch --+
1172 |
1173 (exit-bb) */
1174
1175 if (loop->num_nodes != 2)
1176 {
1177 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 "not vectorized: control flow in loop.\n");
1180 return false;
1181 }
1182
1183 if (empty_block_p (loop->header))
1184 {
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "not vectorized: empty loop.\n");
1188 return false;
1189 }
1190 }
1191 else
1192 {
1193 struct loop *innerloop = loop->inner;
1194 edge entryedge;
1195
1196 /* Nested loop. We currently require that the loop is doubly-nested,
1197 contains a single inner loop, and the number of BBs is exactly 5.
1198 Vectorizable outer-loops look like this:
1199
1200 (pre-header)
1201 |
1202 header <---+
1203 | |
1204 inner-loop |
1205 | |
1206 tail ------+
1207 |
1208 (exit-bb)
1209
1210 The inner-loop has the properties expected of inner-most loops
1211 as described above. */
1212
1213 if ((loop->inner)->inner || (loop->inner)->next)
1214 {
1215 if (dump_enabled_p ())
1216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1217 "not vectorized: multiple nested loops.\n");
1218 return false;
1219 }
1220
1221 if (loop->num_nodes != 5)
1222 {
1223 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1225 "not vectorized: control flow in loop.\n");
1226 return false;
1227 }
1228
1229 entryedge = loop_preheader_edge (innerloop);
1230 if (entryedge->src != loop->header
1231 || !single_exit (innerloop)
1232 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1233 {
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: unsupported outerloop form.\n");
1237 return false;
1238 }
1239
1240 /* Analyze the inner-loop. */
1241 tree inner_niterm1, inner_niter, inner_assumptions;
1242 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1243 &inner_assumptions, &inner_niterm1,
1244 &inner_niter, NULL)
1245 /* Don't support analyzing niter under assumptions for inner
1246 loop. */
1247 || !integer_onep (inner_assumptions))
1248 {
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: Bad inner loop.\n");
1252 return false;
1253 }
1254
1255 if (!expr_invariant_in_loop_p (loop, inner_niter))
1256 {
1257 if (dump_enabled_p ())
1258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259 "not vectorized: inner-loop count not"
1260 " invariant.\n");
1261 return false;
1262 }
1263
1264 if (dump_enabled_p ())
1265 dump_printf_loc (MSG_NOTE, vect_location,
1266 "Considering outer-loop vectorization.\n");
1267 }
1268
1269 if (!single_exit (loop)
1270 || EDGE_COUNT (loop->header->preds) != 2)
1271 {
1272 if (dump_enabled_p ())
1273 {
1274 if (!single_exit (loop))
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "not vectorized: multiple exits.\n");
1277 else if (EDGE_COUNT (loop->header->preds) != 2)
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1279 "not vectorized: too many incoming edges.\n");
1280 }
1281 return false;
1282 }
1283
1284 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1285 that the loop is represented as a do-while (with a proper if-guard
1286 before the loop if needed), where the loop header contains all the
1287 executable statements, and the latch is empty. */
1288 if (!empty_block_p (loop->latch)
1289 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1290 {
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: latch block not empty.\n");
1294 return false;
1295 }
1296
1297 /* Make sure the exit is not abnormal. */
1298 edge e = single_exit (loop);
1299 if (e->flags & EDGE_ABNORMAL)
1300 {
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: abnormal loop exit edge.\n");
1304 return false;
1305 }
1306
1307 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1308 number_of_iterationsm1);
1309 if (!*loop_cond)
1310 {
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "not vectorized: complicated exit condition.\n");
1314 return false;
1315 }
1316
1317 if (integer_zerop (*assumptions)
1318 || !*number_of_iterations
1319 || chrec_contains_undetermined (*number_of_iterations))
1320 {
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: number of iterations cannot be "
1324 "computed.\n");
1325 return false;
1326 }
1327
1328 if (integer_zerop (*number_of_iterations))
1329 {
1330 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1332 "not vectorized: number of iterations = 0.\n");
1333 return false;
1334 }
1335
1336 return true;
1337 }
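
/* Source-level sketches of the restrictions checked above (assuming
   if-conversion has already flattened the loop body):

       accepted - a single-exit counted loop with an empty latch:
           for (i = 0; i < n; i++)
             a[i] = b[i] + c[i];

       rejected - an early exit gives the loop two exit edges:
           for (i = 0; i < n; i++)
             if (a[i] == key)
               break;

   These are only approximations of the CFG forms the checks actually
   inspect.  */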
1338
1339 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1340
1341 loop_vec_info
1342 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1343 {
1344 tree assumptions, number_of_iterations, number_of_iterationsm1;
1345 gcond *loop_cond, *inner_loop_cond = NULL;
1346
1347 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1348 &assumptions, &number_of_iterationsm1,
1349 &number_of_iterations, &inner_loop_cond))
1350 return NULL;
1351
1352 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1353 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1354 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1355 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1356 if (!integer_onep (assumptions))
1357 {
1358 /* We consider vectorizing this loop by versioning it under
1359 some assumptions. In order to do this, we need to clear
1360 existing information computed by scev and niter analyzer. */
1361 scev_reset_htab ();
1362 free_numbers_of_iterations_estimates (loop);
1363 /* Also set a flag for this loop so that the following scev and niter
1364 analyses are done under the assumptions. */
1365 loop_constraint_set (loop, LOOP_C_FINITE);
1366 /* Also record the assumptions for versioning. */
1367 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1368 }
1369
1370 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1371 {
1372 if (dump_enabled_p ())
1373 {
1374 dump_printf_loc (MSG_NOTE, vect_location,
1375 "Symbolic number of iterations is ");
1376 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1377 dump_printf (MSG_NOTE, "\n");
1378 }
1379 }
1380
1381 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1382 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1383 if (inner_loop_cond)
1384 {
1385 stmt_vec_info inner_loop_cond_info
1386 = loop_vinfo->lookup_stmt (inner_loop_cond);
1387 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1388 }
1389
1390 gcc_assert (!loop->aux);
1391 loop->aux = loop_vinfo;
1392 return loop_vinfo;
1393 }
1394
1395
1396
1397 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1398 statements, update the vectorization factor. */
1399
1400 static void
1401 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1402 {
1403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1404 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1405 int nbbs = loop->num_nodes;
1406 poly_uint64 vectorization_factor;
1407 int i;
1408
1409 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1410
1411 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1412 gcc_assert (known_ne (vectorization_factor, 0U));
1413
1414 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1415 vectorization factor of the loop is the unrolling factor required by
1416 the SLP instances. If that unrolling factor is 1, we say that we
1417 perform pure SLP on the loop - cross-iteration parallelism is not
1418 exploited. */
1419 bool only_slp_in_loop = true;
1420 for (i = 0; i < nbbs; i++)
1421 {
1422 basic_block bb = bbs[i];
1423 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1424 gsi_next (&si))
1425 {
1426 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1427 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1428 && STMT_VINFO_RELATED_STMT (stmt_info))
1429 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1430 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1431 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1432 && !PURE_SLP_STMT (stmt_info))
1433 /* STMT needs both SLP and loop-based vectorization. */
1434 only_slp_in_loop = false;
1435 }
1436 }
1437
1438 if (only_slp_in_loop)
1439 {
1440 dump_printf_loc (MSG_NOTE, vect_location,
1441 "Loop contains only SLP stmts\n");
1442 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1443 }
1444 else
1445 {
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "Loop contains SLP and non-SLP stmts\n");
1448 /* Both the vectorization factor and unroll factor have the form
1449 current_vector_size * X for some rational X, so they must have
1450 a common multiple. */
1451 vectorization_factor
1452 = force_common_multiple (vectorization_factor,
1453 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1454 }
1455
1456 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1457 if (dump_enabled_p ())
1458 {
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Updating vectorization factor to ");
1461 dump_dec (MSG_NOTE, vectorization_factor);
1462 dump_printf (MSG_NOTE, ".\n");
1463 }
1464 }
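
/* The adjustment above amounts to taking a common multiple (for constant
   factors, the least common multiple): if, say, the non-SLP stmts require
   a vectorization factor of 2 while the SLP instances need an unrolling
   factor of 8, the updated vectorization factor is 8; for factors of 4 and
   6 it would be 12.  The concrete numbers are purely illustrative.  */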
1465
1466 /* Return true if STMT_INFO describes a double reduction phi and if
1467 the other phi in the reduction is also relevant for vectorization.
1468 This rejects cases such as:
1469
1470 outer1:
1471 x_1 = PHI <x_3(outer2), ...>;
1472 ...
1473
1474 inner:
1475 x_2 = ...;
1476 ...
1477
1478 outer2:
1479 x_3 = PHI <x_2(inner)>;
1480
1481 if nothing in x_2 or elsewhere makes x_1 relevant. */
1482
1483 static bool
1484 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1485 {
1486 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1487 return false;
1488
1489 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1490 }
1491
1492 /* Function vect_analyze_loop_operations.
1493
1494 Scan the loop stmts and make sure they are all vectorizable. */
1495
1496 static bool
1497 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1498 {
1499 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1500 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1501 int nbbs = loop->num_nodes;
1502 int i;
1503 stmt_vec_info stmt_info;
1504 bool need_to_vectorize = false;
1505 bool ok;
1506
1507 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1508
1509 stmt_vector_for_cost cost_vec;
1510 cost_vec.create (2);
1511
1512 for (i = 0; i < nbbs; i++)
1513 {
1514 basic_block bb = bbs[i];
1515
1516 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1517 gsi_next (&si))
1518 {
1519 gphi *phi = si.phi ();
1520 ok = true;
1521
1522 stmt_info = loop_vinfo->lookup_stmt (phi);
1523 if (dump_enabled_p ())
1524 {
1525 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1526 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1527 }
1528 if (virtual_operand_p (gimple_phi_result (phi)))
1529 continue;
1530
1531 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1532 (i.e., a phi in the tail of the outer-loop). */
1533 if (! is_loop_header_bb_p (bb))
1534 {
1535 /* FORNOW: we currently don't support the case that these phis
1536 are not used in the outer-loop (unless it is a double reduction,
1537 i.e., this phi is vect_reduction_def), because this case
1538 requires us to actually do something here. */
1539 if (STMT_VINFO_LIVE_P (stmt_info)
1540 && !vect_active_double_reduction_p (stmt_info))
1541 {
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1544 "Unsupported loop-closed phi in "
1545 "outer-loop.\n");
1546 return false;
1547 }
1548
1549 /* If PHI is used in the outer loop, we check that its operand
1550 is defined in the inner loop. */
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1552 {
1553 tree phi_op;
1554
1555 if (gimple_phi_num_args (phi) != 1)
1556 return false;
1557
1558 phi_op = PHI_ARG_DEF (phi, 0);
1559 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1560 if (!op_def_info)
1561 return false;
1562
1563 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1564 && (STMT_VINFO_RELEVANT (op_def_info)
1565 != vect_used_in_outer_by_reduction))
1566 return false;
1567 }
1568
1569 continue;
1570 }
1571
1572 gcc_assert (stmt_info);
1573
1574 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1575 || STMT_VINFO_LIVE_P (stmt_info))
1576 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1577 {
1578 /* A scalar-dependence cycle that we don't support. */
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1581 "not vectorized: scalar dependence cycle.\n");
1582 return false;
1583 }
1584
1585 if (STMT_VINFO_RELEVANT_P (stmt_info))
1586 {
1587 need_to_vectorize = true;
1588 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1589 && ! PURE_SLP_STMT (stmt_info))
1590 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1591 &cost_vec);
1592 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1593 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1594 && ! PURE_SLP_STMT (stmt_info))
1595 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1596 &cost_vec);
1597 }
1598
1599 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1600 if (ok
1601 && STMT_VINFO_LIVE_P (stmt_info)
1602 && !PURE_SLP_STMT (stmt_info))
1603 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1604 &cost_vec);
1605
1606 if (!ok)
1607 {
1608 if (dump_enabled_p ())
1609 {
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "not vectorized: relevant phi not "
1612 "supported: ");
1613 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1614 }
1615 return false;
1616 }
1617 }
1618
1619 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1620 gsi_next (&si))
1621 {
1622 gimple *stmt = gsi_stmt (si);
1623 if (!gimple_clobber_p (stmt)
1624 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1625 &need_to_vectorize,
1626 NULL, NULL, &cost_vec))
1627 return false;
1628 }
1629 } /* bbs */
1630
1631 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1632 cost_vec.release ();
1633
1634 /* All operations in the loop are either irrelevant (they deal with loop
1635 control, or are dead), or only used outside the loop and can be moved
1636 out of the loop (e.g. invariants, inductions). The loop can be
1637 optimized away by scalar optimizations. We're better off not
1638 touching this loop. */
1639 if (!need_to_vectorize)
1640 {
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "All the computation can be taken out of the loop.\n");
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: redundant loop. no profit to "
1647 "vectorize.\n");
1648 return false;
1649 }
1650
1651 return true;
1652 }
1653
1654 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1655 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1656 definitely no, or -1 if it's worth retrying. */
1657
1658 static int
1659 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1660 {
1661 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1662 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1663
1664 /* Only fully-masked loops can have iteration counts less than the
1665 vectorization factor. */
1666 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1667 {
1668 HOST_WIDE_INT max_niter;
1669
1670 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1671 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1672 else
1673 max_niter = max_stmt_executions_int (loop);
1674
1675 if (max_niter != -1
1676 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1677 {
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680 "not vectorized: iteration count smaller than "
1681 "vectorization factor.\n");
1682 return 0;
1683 }
1684 }
1685
1686 int min_profitable_iters, min_profitable_estimate;
1687 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1688 &min_profitable_estimate);
1689
1690 if (min_profitable_iters < 0)
1691 {
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694 "not vectorized: vectorization not profitable.\n");
1695 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 "not vectorized: vector version will never be "
1698 "profitable.\n");
1699 return -1;
1700 }
1701
1702 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1703 * assumed_vf);
1704
1705 /* Use the cost model only if it is more conservative than the
1706 user-specified threshold. */
1707 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1708 min_profitable_iters);
1709
1710 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1711
1712 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1713 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1714 {
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "not vectorized: vectorization not profitable.\n");
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_NOTE, vect_location,
1720 "not vectorized: iteration count smaller than user "
1721 "specified loop bound parameter or minimum profitable "
1722 "iterations (whichever is more conservative).\n");
1723 return 0;
1724 }
1725
1726 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1727 if (estimated_niter == -1)
1728 estimated_niter = likely_max_stmt_executions_int (loop);
1729 if (estimated_niter != -1
1730 && ((unsigned HOST_WIDE_INT) estimated_niter
1731 < MAX (th, (unsigned) min_profitable_estimate)))
1732 {
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "not vectorized: estimated iteration count too "
1736 "small.\n");
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "not vectorized: estimated iteration count smaller "
1740 "than specified loop bound parameter or minimum "
1741 "profitable iterations (whichever is more "
1742 "conservative).\n");
1743 return -1;
1744 }
1745
1746 return 1;
1747 }
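
/* A worked example of the threshold logic above, with made-up numbers:
   with --param min-vect-loop-bound=2 and an assumed vectorization factor
   of 4, min_scalar_loop_bound is 8; if the cost model reports
   min_profitable_iters = 10, the threshold becomes MAX (8, 10) = 10, and a
   loop whose iteration count is known to be 6 is rejected as not
   profitable.  */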
1748
1749 static bool
1750 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1751 vec<data_reference_p> *datarefs,
1752 unsigned int *n_stmts)
1753 {
1754 *n_stmts = 0;
1755 for (unsigned i = 0; i < loop->num_nodes; i++)
1756 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1757 !gsi_end_p (gsi); gsi_next (&gsi))
1758 {
1759 gimple *stmt = gsi_stmt (gsi);
1760 if (is_gimple_debug (stmt))
1761 continue;
1762 ++(*n_stmts);
1763 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1764 {
1765 if (is_gimple_call (stmt) && loop->safelen)
1766 {
1767 tree fndecl = gimple_call_fndecl (stmt), op;
1768 if (fndecl != NULL_TREE)
1769 {
1770 cgraph_node *node = cgraph_node::get (fndecl);
1771 if (node != NULL && node->simd_clones != NULL)
1772 {
1773 unsigned int j, n = gimple_call_num_args (stmt);
1774 for (j = 0; j < n; j++)
1775 {
1776 op = gimple_call_arg (stmt, j);
1777 if (DECL_P (op)
1778 || (REFERENCE_CLASS_P (op)
1779 && get_base_address (op)))
1780 break;
1781 }
1782 op = gimple_call_lhs (stmt);
1783 /* Ignore #pragma omp declare simd functions
1784 if they don't have data references in the
1785 call stmt itself. */
1786 if (j == n
1787 && !(op
1788 && (DECL_P (op)
1789 || (REFERENCE_CLASS_P (op)
1790 && get_base_address (op)))))
1791 continue;
1792 }
1793 }
1794 }
1795 return false;
1796 }
1797 /* If dependence analysis will give up due to the limit on the
1798 number of datarefs, stop here and fail fatally. */
1799 if (datarefs->length ()
1800 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1801 return false;
1802 }
1803 return true;
1804 }
1805
1806 /* Function vect_analyze_loop_2.
1807
1808 Apply a set of analyses on the loop described by LOOP_VINFO. The
1809 different analyses will record information in the loop_vec_info
1810 struct. */
1811 static bool
1812 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1813 {
1814 bool ok;
1815 int res;
1816 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1817 poly_uint64 min_vf = 2;
1818
1819 /* The first group of checks is independent of the vector size. */
1820 fatal = true;
1821
1822 /* Find all data references in the loop (which correspond to vdefs/vuses)
1823 and analyze their evolution in the loop. */
1824
1825 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1826
1827 /* Gather the data references and count stmts in the loop. */
1828 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1829 {
1830 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1831 &LOOP_VINFO_DATAREFS (loop_vinfo),
1832 n_stmts))
1833 {
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1836 "not vectorized: loop contains function "
1837 "calls or data references that cannot "
1838 "be analyzed\n");
1839 return false;
1840 }
1841 loop_vinfo->shared->save_datarefs ();
1842 }
1843 else
1844 loop_vinfo->shared->check_datarefs ();
1845
1846 /* Analyze the data references and also adjust the minimal
1847 vectorization factor according to the loads and stores. */
1848
1849 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data references.\n");
1855 return false;
1856 }
1857
1858 /* Classify all cross-iteration scalar data-flow cycles.
1859 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1860 vect_analyze_scalar_cycles (loop_vinfo);
1861
1862 vect_pattern_recog (loop_vinfo);
1863
1864 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1865
1866 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1867 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1868
1869 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1870 if (!ok)
1871 {
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "bad data access.\n");
1875 return false;
1876 }
1877
1878 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1879
1880 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1881 if (!ok)
1882 {
1883 if (dump_enabled_p ())
1884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885 "unexpected pattern.\n");
1886 return false;
1887 }
1888
1889 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
1890 fatal = false;
1891
1892 /* Analyze data dependences between the data-refs in the loop
1893 and adjust the maximum vectorization factor according to
1894 the dependences.
1895 FORNOW: fail at the first data dependence that we encounter. */
1896
1897 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1898 if (!ok
1899 || (max_vf != MAX_VECTORIZATION_FACTOR
1900 && maybe_lt (max_vf, min_vf)))
1901 {
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "bad data dependence.\n");
1905 return false;
1906 }
1907 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1908
1909 ok = vect_determine_vectorization_factor (loop_vinfo);
1910 if (!ok)
1911 {
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "can't determine vectorization factor.\n");
1915 return false;
1916 }
1917 if (max_vf != MAX_VECTORIZATION_FACTOR
1918 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1919 {
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "bad data dependence.\n");
1923 return false;
1924 }
1925
1926 /* Compute the scalar iteration cost. */
1927 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1928
1929 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1930 unsigned th;
1931
1932 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1933 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1934 if (!ok)
1935 return false;
1936
1937 /* If there are any SLP instances mark them as pure_slp. */
1938 bool slp = vect_make_slp_decision (loop_vinfo);
1939 if (slp)
1940 {
1941 /* Find stmts that need to be both vectorized and SLPed. */
1942 vect_detect_hybrid_slp (loop_vinfo);
1943
1944 /* Update the vectorization factor based on the SLP decision. */
1945 vect_update_vf_for_slp (loop_vinfo);
1946 }
1947
1948 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1949
1950 /* We don't expect to have to roll back to anything other than an empty
1951 set of rgroups. */
1952 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1953
1954 /* This is the point where we can re-start analysis with SLP forced off. */
1955 start_over:
1956
1957 /* Now the vectorization factor is final. */
1958 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959 gcc_assert (known_ne (vectorization_factor, 0U));
1960
1961 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1962 {
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "vectorization_factor = ");
1965 dump_dec (MSG_NOTE, vectorization_factor);
1966 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1967 LOOP_VINFO_INT_NITERS (loop_vinfo));
1968 }
1969
1970 HOST_WIDE_INT max_niter
1971 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1972
1973 /* Analyze the alignment of the data-refs in the loop.
1974 Fail if a data reference is found that cannot be vectorized. */
1975
1976 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1977 if (!ok)
1978 {
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "bad data alignment.\n");
1982 return false;
1983 }
1984
1985 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1986 It is important to call pruning after vect_analyze_data_ref_accesses,
1987 since we use grouping information gathered by interleaving analysis. */
1988 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1989 if (!ok)
1990 return false;
1991
1992 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1993 vectorization. */
1994 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1995 {
1996 /* This pass will decide on using loop versioning and/or loop peeling in
1997 order to enhance the alignment of data references in the loop. */
1998 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1999 if (!ok)
2000 {
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "bad data alignment.\n");
2004 return false;
2005 }
2006 }
2007
2008 if (slp)
2009 {
2010 /* Analyze operations in the SLP instances. Note this may
2011 remove unsupported SLP instances, which makes the SLP kind
2012 detection above invalid. */
2013 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2014 vect_slp_analyze_operations (loop_vinfo);
2015 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2016 goto again;
2017 }
2018
2019 /* Scan all the remaining operations in the loop that are not subject
2020 to SLP and make sure they are vectorizable. */
2021 ok = vect_analyze_loop_operations (loop_vinfo);
2022 if (!ok)
2023 {
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "bad operation or unsupported loop bound.\n");
2027 return false;
2028 }
2029
2030 /* Decide whether to use a fully-masked loop for this vectorization
2031 factor. */
2032 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2033 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2034 && vect_verify_full_masking (loop_vinfo));
2035 if (dump_enabled_p ())
2036 {
2037 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "using a fully-masked loop.\n");
2040 else
2041 dump_printf_loc (MSG_NOTE, vect_location,
2042 "not using a fully-masked loop.\n");
2043 }
2044
2045 /* If epilog loop is required because of data accesses with gaps,
2046 one additional iteration needs to be peeled. Check if there are
2047 enough iterations for vectorization. */
2048 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2049 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2050 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2051 {
2052 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2053 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2054
2055 if (known_lt (wi::to_widest (scalar_niters), vf))
2056 {
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_NOTE, vect_location,
2059 "loop has no enough iterations to support"
2060 " peeling for gaps.\n");
2061 return false;
2062 }
2063 }
2064
2065 /* Check the costings of the loop make vectorizing worthwhile. */
2066 res = vect_analyze_loop_costing (loop_vinfo);
2067 if (res < 0)
2068 goto again;
2069 if (!res)
2070 {
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073 "Loop costings not worthwhile.\n");
2074 return false;
2075 }
2076
2077 /* Decide whether we need to create an epilogue loop to handle
2078 remaining scalar iterations. */
2079 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2080
2081 unsigned HOST_WIDE_INT const_vf;
2082 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2083 /* The main loop handles all iterations. */
2084 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2085 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2086 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2087 {
2088 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2089 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2090 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2091 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2092 }
2093 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2094 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2095 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2096 < (unsigned) exact_log2 (const_vf))
2097 /* In case of versioning, check if the maximum number of
2098 iterations is greater than th. If they are identical,
2099 the epilogue is unnecessary. */
2100 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2101 || ((unsigned HOST_WIDE_INT) max_niter
2102 > (th / const_vf) * const_vf))))
2103 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
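  /* Worked example (illustrative values, assuming no peeling for alignment
     and no versioning): with a constant VF of 8 and NITERS known to be 100,
     tree_ctz (100) == 2 is smaller than exact_log2 (8) == 3, i.e. 100 is
     not a multiple of 8, so an epilogue loop is needed for the remaining
     100 % 8 == 4 scalar iterations.  */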
2104
2105 /* If an epilogue loop is required make sure we can create one. */
2106 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2107 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2108 {
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2111 if (!vect_can_advance_ivs_p (loop_vinfo)
2112 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2113 single_exit (LOOP_VINFO_LOOP
2114 (loop_vinfo))))
2115 {
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118 "not vectorized: can't create required "
2119 "epilog loop\n");
2120 goto again;
2121 }
2122 }
2123
2124 /* During peeling, we need to check whether the number of loop iterations
2125 is enough for both the peeled prolog loop and the vector loop. This
2126 check can be merged with the threshold check of loop versioning, so
2127 increase the threshold for this case if necessary. */
2128 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2129 {
2130 poly_uint64 niters_th = 0;
2131
2132 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2133 {
2134 /* Niters for peeled prolog loop. */
2135 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2136 {
2137 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2138 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2139 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2140 }
2141 else
2142 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2143 }
2144
2145 /* Niters for at least one iteration of vectorized loop. */
2146 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2147 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2148 /* One additional iteration because of peeling for gaps. */
2149 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2150 niters_th += 1;
2151 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2152 }
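  /* Illustrative example only: for a loop that requires versioning, with a
     known alignment peel of 3 iterations, a VF of 8, no full masking and
     peeling for gaps required, the versioning threshold computed above is
     3 + 8 + 1 = 12 iterations.  */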
2153
2154 gcc_assert (known_eq (vectorization_factor,
2155 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2156
2157 /* Ok to vectorize! */
2158 return true;
2159
2160 again:
2161 /* Try again with SLP forced off but if we didn't do any SLP there is
2162 no point in re-trying. */
2163 if (!slp)
2164 return false;
2165
2166 /* If there are reduction chains re-trying will fail anyway. */
2167 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2168 return false;
2169
2170 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2171 via interleaving or lane instructions. */
2172 slp_instance instance;
2173 slp_tree node;
2174 unsigned i, j;
2175 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2176 {
2177 stmt_vec_info vinfo;
2178 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2179 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2180 continue;
2181 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2182 unsigned int size = DR_GROUP_SIZE (vinfo);
2183 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2184 if (! vect_store_lanes_supported (vectype, size, false)
2185 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2186 && ! vect_grouped_store_supported (vectype, size))
2187 return false;
2188 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2189 {
2190 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2191 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2192 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2193 size = DR_GROUP_SIZE (vinfo);
2194 vectype = STMT_VINFO_VECTYPE (vinfo);
2195 if (! vect_load_lanes_supported (vectype, size, false)
2196 && ! vect_grouped_load_supported (vectype, single_element_p,
2197 size))
2198 return false;
2199 }
2200 }
2201
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_NOTE, vect_location,
2204 "re-trying with SLP disabled\n");
2205
2206 /* Roll back state appropriately. No SLP this time. */
2207 slp = false;
2208 /* Restore the vectorization factor as it was without SLP. */
2209 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2210 /* Free the SLP instances. */
2211 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2212 vect_free_slp_instance (instance, false);
2213 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2214 /* Reset SLP type to loop_vect on all stmts. */
2215 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2216 {
2217 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2218 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2219 !gsi_end_p (si); gsi_next (&si))
2220 {
2221 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2222 STMT_SLP_TYPE (stmt_info) = loop_vect;
2223 }
2224 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2225 !gsi_end_p (si); gsi_next (&si))
2226 {
2227 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2228 STMT_SLP_TYPE (stmt_info) = loop_vect;
2229 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2230 {
2231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2233 STMT_SLP_TYPE (stmt_info) = loop_vect;
2234 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2235 !gsi_end_p (pi); gsi_next (&pi))
2236 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2237 = loop_vect;
2238 }
2239 }
2240 }
2241 /* Free optimized alias test DDRS. */
2242 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2243 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2244 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2245 /* Reset target cost data. */
2246 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2247 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2248 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2249 /* Reset accumulated rgroup information. */
2250 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2251 /* Reset assorted flags. */
2252 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2253 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2254 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2255 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2256 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2257
2258 goto start_over;
2259 }
2260
2261 /* Function vect_analyze_loop.
2262
2263 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2264 for it. The different analyses will record information in the
2265 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, then LOOP is
2266 the epilogue of ORIG_LOOP_VINFO's loop and must be vectorized. */
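/* For example, on a target whose autovectorize_vector_sizes hook reports
   several vector sizes (say 64, 32 and 16 bytes -- illustrative values,
   the actual list is target-specific), a non-fatal analysis failure with
   the autodetected size is retried with the remaining sizes in the order
   the hook reports them, giving up after the last one.  */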
2267 loop_vec_info
2268 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2269 vec_info_shared *shared)
2270 {
2271 loop_vec_info loop_vinfo;
2272 auto_vector_sizes vector_sizes;
2273
2274 /* Autodetect first vector size we try. */
2275 current_vector_size = 0;
2276 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2277 unsigned int next_size = 0;
2278
2279 DUMP_VECT_SCOPE ("analyze_loop_nest");
2280
2281 if (loop_outer (loop)
2282 && loop_vec_info_for_loop (loop_outer (loop))
2283 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "outer-loop already vectorized.\n");
2288 return NULL;
2289 }
2290
2291 if (!find_loop_nest (loop, &shared->loop_nest))
2292 {
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2295 "not vectorized: loop nest containing two "
2296 "or more consecutive inner loops cannot be "
2297 "vectorized\n");
2298 return NULL;
2299 }
2300
2301 unsigned n_stmts = 0;
2302 poly_uint64 autodetected_vector_size = 0;
2303 while (1)
2304 {
2305 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2306 loop_vinfo = vect_analyze_loop_form (loop, shared);
2307 if (!loop_vinfo)
2308 {
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad loop form.\n");
2312 return NULL;
2313 }
2314
2315 bool fatal = false;
2316
2317 if (orig_loop_vinfo)
2318 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2319
2320 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2321 {
2322 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2323
2324 return loop_vinfo;
2325 }
2326
2327 delete loop_vinfo;
2328
2329 if (next_size == 0)
2330 autodetected_vector_size = current_vector_size;
2331
2332 if (next_size < vector_sizes.length ()
2333 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2334 next_size += 1;
2335
2336 if (fatal
2337 || next_size == vector_sizes.length ()
2338 || known_eq (current_vector_size, 0U))
2339 return NULL;
2340
2341 /* Try the next biggest vector size. */
2342 current_vector_size = vector_sizes[next_size++];
2343 if (dump_enabled_p ())
2344 {
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "***** Re-trying analysis with "
2347 "vector size ");
2348 dump_dec (MSG_NOTE, current_vector_size);
2349 dump_printf (MSG_NOTE, "\n");
2350 }
2351 }
2352 }
2353
2354 /* Return true if there is an in-order reduction function for CODE, storing
2355 it in *REDUC_FN if so. */
2356
2357 static bool
2358 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2359 {
2360 switch (code)
2361 {
2362 case PLUS_EXPR:
2363 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2364 return true;
2365
2366 default:
2367 return false;
2368 }
2369 }
2370
2371 /* Function reduction_fn_for_scalar_code
2372
2373 Input:
2374 CODE - tree_code of a reduction operation.
2375
2376 Output:
2377 REDUC_FN - the corresponding internal function to be used to reduce the
2378 vector of partial results into a single scalar result, or IFN_LAST
2379 if the operation is a supported reduction operation, but does not have
2380 such an internal function.
2381
2382 Return FALSE if CODE currently cannot be vectorized as a reduction. */
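/* For example (illustrative only), a scalar accumulation

       for (i = 0; i < n; i++)
	 s += a[i];

   uses PLUS_EXPR, so the vector of partial sums is reduced in the epilogue
   with IFN_REDUC_PLUS, whereas MULT_EXPR is accepted but mapped to
   IFN_LAST, meaning no single internal function performs the final
   reduction.  */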
2383
2384 static bool
2385 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2386 {
2387 switch (code)
2388 {
2389 case MAX_EXPR:
2390 *reduc_fn = IFN_REDUC_MAX;
2391 return true;
2392
2393 case MIN_EXPR:
2394 *reduc_fn = IFN_REDUC_MIN;
2395 return true;
2396
2397 case PLUS_EXPR:
2398 *reduc_fn = IFN_REDUC_PLUS;
2399 return true;
2400
2401 case BIT_AND_EXPR:
2402 *reduc_fn = IFN_REDUC_AND;
2403 return true;
2404
2405 case BIT_IOR_EXPR:
2406 *reduc_fn = IFN_REDUC_IOR;
2407 return true;
2408
2409 case BIT_XOR_EXPR:
2410 *reduc_fn = IFN_REDUC_XOR;
2411 return true;
2412
2413 case MULT_EXPR:
2414 case MINUS_EXPR:
2415 *reduc_fn = IFN_LAST;
2416 return true;
2417
2418 default:
2419 return false;
2420 }
2421 }
2422
2423 /* If there is a neutral value X such that SLP reduction SLP_NODE would not
2424 be affected by the introduction of additional X elements, return that X,
2425 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2426 is true if the SLP statements perform a single reduction, false if each
2427 statement performs an independent reduction. */
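/* For instance, padding an addition or bitwise IOR/XOR reduction with extra
   zero elements, a multiplication with extra one elements, or a bitwise AND
   with all-ones elements leaves the final result unchanged; those are the
   neutral values returned by the switch below (informal summary).  */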
2428
2429 static tree
2430 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2431 bool reduc_chain)
2432 {
2433 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2434 stmt_vec_info stmt_vinfo = stmts[0];
2435 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2436 tree scalar_type = TREE_TYPE (vector_type);
2437 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2438 gcc_assert (loop);
2439
2440 switch (code)
2441 {
2442 case WIDEN_SUM_EXPR:
2443 case DOT_PROD_EXPR:
2444 case SAD_EXPR:
2445 case PLUS_EXPR:
2446 case MINUS_EXPR:
2447 case BIT_IOR_EXPR:
2448 case BIT_XOR_EXPR:
2449 return build_zero_cst (scalar_type);
2450
2451 case MULT_EXPR:
2452 return build_one_cst (scalar_type);
2453
2454 case BIT_AND_EXPR:
2455 return build_all_ones_cst (scalar_type);
2456
2457 case MAX_EXPR:
2458 case MIN_EXPR:
2459 /* For MIN/MAX the initial values are neutral. A reduction chain
2460 has only a single initial value, so that value is neutral for
2461 all statements. */
2462 if (reduc_chain)
2463 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2464 loop_preheader_edge (loop));
2465 return NULL_TREE;
2466
2467 default:
2468 return NULL_TREE;
2469 }
2470 }
2471
2472 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2473 STMT is printed with a message MSG. */
2474
2475 static void
2476 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2477 {
2478 dump_printf_loc (msg_type, vect_location, "%s", msg);
2479 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2480 }
2481
2482 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2483 operation. Return true if the results of DEF_STMT_INFO are something
2484 that can be accumulated by such a reduction. */
2485
2486 static bool
2487 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2488 {
2489 return (is_gimple_assign (def_stmt_info->stmt)
2490 || is_gimple_call (def_stmt_info->stmt)
2491 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2492 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2493 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2494 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2495 }
2496
2497 /* Detect SLP reduction of the form:
2498
2499 #a1 = phi <a5, a0>
2500 a2 = operation (a1)
2501 a3 = operation (a2)
2502 a4 = operation (a3)
2503 a5 = operation (a4)
2504
2505 #a = phi <a5>
2506
2507 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2508 FIRST_STMT is the first reduction stmt in the chain
2509 (a2 = operation (a1)).
2510
2511 Return TRUE if a reduction chain was detected. */
2512
2513 static bool
2514 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2515 gimple *first_stmt)
2516 {
2517 struct loop *loop = (gimple_bb (phi))->loop_father;
2518 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2519 enum tree_code code;
2520 gimple *loop_use_stmt = NULL;
2521 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2522 tree lhs;
2523 imm_use_iterator imm_iter;
2524 use_operand_p use_p;
2525 int nloop_uses, size = 0, n_out_of_loop_uses;
2526 bool found = false;
2527
2528 if (loop != vect_loop)
2529 return false;
2530
2531 lhs = PHI_RESULT (phi);
2532 code = gimple_assign_rhs_code (first_stmt);
2533 while (1)
2534 {
2535 nloop_uses = 0;
2536 n_out_of_loop_uses = 0;
2537 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2538 {
2539 gimple *use_stmt = USE_STMT (use_p);
2540 if (is_gimple_debug (use_stmt))
2541 continue;
2542
2543 /* Check if we got back to the reduction phi. */
2544 if (use_stmt == phi)
2545 {
2546 loop_use_stmt = use_stmt;
2547 found = true;
2548 break;
2549 }
2550
2551 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2552 {
2553 loop_use_stmt = use_stmt;
2554 nloop_uses++;
2555 }
2556 else
2557 n_out_of_loop_uses++;
2558
2559 /* There can be either a single use in the loop or two uses in
2560 phi nodes. */
2561 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2562 return false;
2563 }
2564
2565 if (found)
2566 break;
2567
2568 /* We reached a statement with no loop uses. */
2569 if (nloop_uses == 0)
2570 return false;
2571
2572 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2573 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2574 return false;
2575
2576 if (!is_gimple_assign (loop_use_stmt)
2577 || code != gimple_assign_rhs_code (loop_use_stmt)
2578 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2579 return false;
2580
2581 /* Insert USE_STMT into reduction chain. */
2582 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2583 if (current_stmt_info)
2584 {
2585 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2586 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2587 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2588 }
2589 else
2590 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2591
2592 lhs = gimple_assign_lhs (loop_use_stmt);
2593 current_stmt_info = use_stmt_info;
2594 size++;
2595 }
2596
2597 if (!found || loop_use_stmt != phi || size < 2)
2598 return false;
2599
2600 /* Swap the operands, if needed, to make the reduction operand the second
2601 operand. */
2602 lhs = PHI_RESULT (phi);
2603 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2604 while (next_stmt_info)
2605 {
2606 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2607 if (gimple_assign_rhs2 (next_stmt) == lhs)
2608 {
2609 tree op = gimple_assign_rhs1 (next_stmt);
2610 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2611
2612 /* Check that the other def is either defined in the loop
2613 ("vect_internal_def"), or it's an induction (defined by a
2614 loop-header phi-node). */
2615 if (def_stmt_info
2616 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2617 && vect_valid_reduction_input_p (def_stmt_info))
2618 {
2619 lhs = gimple_assign_lhs (next_stmt);
2620 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2621 continue;
2622 }
2623
2624 return false;
2625 }
2626 else
2627 {
2628 tree op = gimple_assign_rhs2 (next_stmt);
2629 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2630
2631 /* Check that the other def is either defined in the loop
2632 ("vect_internal_def"), or it's an induction (defined by a
2633 loop-header phi-node). */
2634 if (def_stmt_info
2635 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2636 && vect_valid_reduction_input_p (def_stmt_info))
2637 {
2638 if (dump_enabled_p ())
2639 {
2640 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2641 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2642 }
2643
2644 swap_ssa_operands (next_stmt,
2645 gimple_assign_rhs1_ptr (next_stmt),
2646 gimple_assign_rhs2_ptr (next_stmt));
2647 update_stmt (next_stmt);
2648
2649 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2650 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2651 }
2652 else
2653 return false;
2654 }
2655
2656 lhs = gimple_assign_lhs (next_stmt);
2657 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2658 }
2659
2660 /* Save the chain for further analysis in SLP detection. */
2661 stmt_vec_info first_stmt_info
2662 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2663 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2664 REDUC_GROUP_SIZE (first_stmt_info) = size;
2665
2666 return true;
2667 }
2668
2669 /* Return true if we need an in-order reduction for operation CODE
2670 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2671 overflow must wrap. */
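/* For example, without -fassociative-math (e.g. without -ffast-math) a
   float accumulation such as

       float s = 0.0f;
       for (i = 0; i < n; i++)
	 s += x[i];

   must preserve the original left-to-right order of the additions, so it
   needs an in-order (fold-left) reduction rather than a reassociating
   one.  */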
2672
2673 static bool
2674 needs_fold_left_reduction_p (tree type, tree_code code,
2675 bool need_wrapping_integral_overflow)
2676 {
2677 /* CHECKME: check for !flag_finite_math_only too? */
2678 if (SCALAR_FLOAT_TYPE_P (type))
2679 switch (code)
2680 {
2681 case MIN_EXPR:
2682 case MAX_EXPR:
2683 return false;
2684
2685 default:
2686 return !flag_associative_math;
2687 }
2688
2689 if (INTEGRAL_TYPE_P (type))
2690 {
2691 if (!operation_no_trapping_overflow (type, code))
2692 return true;
2693 if (need_wrapping_integral_overflow
2694 && !TYPE_OVERFLOW_WRAPS (type)
2695 && operation_can_overflow (code))
2696 return true;
2697 return false;
2698 }
2699
2700 if (SAT_FIXED_POINT_TYPE_P (type))
2701 return true;
2702
2703 return false;
2704 }
2705
2706 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2707 reduction operation CODE has a handled computation expression. */
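/* For example, for the cycle

       s_1 = PHI <s_0, s_4>
       s_2 = s_1 + x_1;
       s_3 = s_2 + x_2;
       s_4 = s_3 + x_3;

   the path from s_4 back to s_1 consists solely of single-use PLUS_EXPR
   statements, so the reduction is accepted; a MINUS_EXPR along the path is
   only tolerated for a PLUS_EXPR reduction, and only if the accumulated
   value is not left negated overall (illustrative sketch).  */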
2708
2709 bool
2710 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2711 tree loop_arg, enum tree_code code)
2712 {
2713 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2714 auto_bitmap visited;
2715 tree lookfor = PHI_RESULT (phi);
2716 ssa_op_iter curri;
2717 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2718 while (USE_FROM_PTR (curr) != loop_arg)
2719 curr = op_iter_next_use (&curri);
2720 curri.i = curri.numops;
2721 do
2722 {
2723 path.safe_push (std::make_pair (curri, curr));
2724 tree use = USE_FROM_PTR (curr);
2725 if (use == lookfor)
2726 break;
2727 gimple *def = SSA_NAME_DEF_STMT (use);
2728 if (gimple_nop_p (def)
2729 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2730 {
2731 pop:
2732 do
2733 {
2734 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2735 curri = x.first;
2736 curr = x.second;
2737 do
2738 curr = op_iter_next_use (&curri);
2739 /* Skip already visited or non-SSA operands (from iterating
2740 over PHI args). */
2741 while (curr != NULL_USE_OPERAND_P
2742 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2743 || ! bitmap_set_bit (visited,
2744 SSA_NAME_VERSION
2745 (USE_FROM_PTR (curr)))));
2746 }
2747 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2748 if (curr == NULL_USE_OPERAND_P)
2749 break;
2750 }
2751 else
2752 {
2753 if (gimple_code (def) == GIMPLE_PHI)
2754 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2755 else
2756 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2757 while (curr != NULL_USE_OPERAND_P
2758 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2759 || ! bitmap_set_bit (visited,
2760 SSA_NAME_VERSION
2761 (USE_FROM_PTR (curr)))))
2762 curr = op_iter_next_use (&curri);
2763 if (curr == NULL_USE_OPERAND_P)
2764 goto pop;
2765 }
2766 }
2767 while (1);
2768 if (dump_file && (dump_flags & TDF_DETAILS))
2769 {
2770 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2771 unsigned i;
2772 std::pair<ssa_op_iter, use_operand_p> *x;
2773 FOR_EACH_VEC_ELT (path, i, x)
2774 {
2775 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2776 dump_printf (MSG_NOTE, " ");
2777 }
2778 dump_printf (MSG_NOTE, "\n");
2779 }
2780
2781 /* Check whether the reduction path detected is valid. */
2782 bool fail = path.length () == 0;
2783 bool neg = false;
2784 for (unsigned i = 1; i < path.length (); ++i)
2785 {
2786 gimple *use_stmt = USE_STMT (path[i].second);
2787 tree op = USE_FROM_PTR (path[i].second);
2788 if (! has_single_use (op)
2789 || ! is_gimple_assign (use_stmt))
2790 {
2791 fail = true;
2792 break;
2793 }
2794 if (gimple_assign_rhs_code (use_stmt) != code)
2795 {
2796 if (code == PLUS_EXPR
2797 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2798 {
2799 /* Track whether we negate the reduction value each iteration. */
2800 if (gimple_assign_rhs2 (use_stmt) == op)
2801 neg = ! neg;
2802 }
2803 else
2804 {
2805 fail = true;
2806 break;
2807 }
2808 }
2809 }
2810 return ! fail && ! neg;
2811 }
2812
2813
2814 /* Function vect_is_simple_reduction
2815
2816 (1) Detect a cross-iteration def-use cycle that represents a simple
2817 reduction computation. We look for the following pattern:
2818
2819 loop_header:
2820 a1 = phi < a0, a2 >
2821 a3 = ...
2822 a2 = operation (a3, a1)
2823
2824 or
2825
2826 a3 = ...
2827 loop_header:
2828 a1 = phi < a0, a2 >
2829 a2 = operation (a3, a1)
2830
2831 such that:
2832 1. operation is commutative and associative and it is safe to
2833 change the order of the computation
2834 2. no uses for a2 in the loop (a2 is used out of the loop)
2835 3. no uses of a1 in the loop besides the reduction operation
2836 4. no uses of a1 outside the loop.
2837
2838 Conditions 1,4 are tested here.
2839 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2840
2841 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2842 nested cycles.
2843
2844 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2845 reductions:
2846
2847 a1 = phi < a0, a2 >
2848 inner loop (def of a3)
2849 a2 = phi < a3 >
2850
2851 (4) Detect condition expressions, i.e.:
2852 for (int i = 0; i < N; i++)
2853 if (a[i] < val)
2854 ret_val = a[i];
2855
2856 */
2857
2858 static stmt_vec_info
2859 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2860 bool *double_reduc,
2861 bool need_wrapping_integral_overflow,
2862 enum vect_reduction_type *v_reduc_type)
2863 {
2864 gphi *phi = as_a <gphi *> (phi_info->stmt);
2865 struct loop *loop = (gimple_bb (phi))->loop_father;
2866 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2867 gimple *phi_use_stmt = NULL;
2868 enum tree_code orig_code, code;
2869 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2870 tree type;
2871 int nloop_uses;
2872 tree name;
2873 imm_use_iterator imm_iter;
2874 use_operand_p use_p;
2875 bool phi_def;
2876
2877 *double_reduc = false;
2878 *v_reduc_type = TREE_CODE_REDUCTION;
2879
2880 tree phi_name = PHI_RESULT (phi);
2881 /* ??? If there are no uses of the PHI result the inner loop reduction
2882 won't be detected as possibly double-reduction by vectorizable_reduction
2883 because that tries to walk the PHI arg from the preheader edge which
2884 can be constant. See PR60382. */
2885 if (has_zero_uses (phi_name))
2886 return NULL;
2887 nloop_uses = 0;
2888 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2889 {
2890 gimple *use_stmt = USE_STMT (use_p);
2891 if (is_gimple_debug (use_stmt))
2892 continue;
2893
2894 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2895 {
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898 "intermediate value used outside loop.\n");
2899
2900 return NULL;
2901 }
2902
2903 nloop_uses++;
2904 if (nloop_uses > 1)
2905 {
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "reduction value used in loop.\n");
2909 return NULL;
2910 }
2911
2912 phi_use_stmt = use_stmt;
2913 }
2914
2915 edge latch_e = loop_latch_edge (loop);
2916 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2917 if (TREE_CODE (loop_arg) != SSA_NAME)
2918 {
2919 if (dump_enabled_p ())
2920 {
2921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2922 "reduction: not ssa_name: ");
2923 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2924 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2925 }
2926 return NULL;
2927 }
2928
2929 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2930 if (!def_stmt_info)
2931 return NULL;
2932
2933 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2934 {
2935 name = gimple_assign_lhs (def_stmt);
2936 phi_def = false;
2937 }
2938 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2939 {
2940 name = PHI_RESULT (def_stmt);
2941 phi_def = true;
2942 }
2943 else
2944 {
2945 if (dump_enabled_p ())
2946 {
2947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2948 "reduction: unhandled reduction operation: ");
2949 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2950 def_stmt_info->stmt, 0);
2951 }
2952 return NULL;
2953 }
2954
2955 nloop_uses = 0;
2956 auto_vec<gphi *, 3> lcphis;
2957 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2958 {
2959 gimple *use_stmt = USE_STMT (use_p);
2960 if (is_gimple_debug (use_stmt))
2961 continue;
2962 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2963 nloop_uses++;
2964 else
2965 /* We can have more than one loop-closed PHI. */
2966 lcphis.safe_push (as_a <gphi *> (use_stmt));
2967 if (nloop_uses > 1)
2968 {
2969 if (dump_enabled_p ())
2970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2971 "reduction used in loop.\n");
2972 return NULL;
2973 }
2974 }
2975
2976 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2977 defined in the inner loop. */
2978 if (phi_def)
2979 {
2980 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2981 op1 = PHI_ARG_DEF (def_stmt, 0);
2982
2983 if (gimple_phi_num_args (def_stmt) != 1
2984 || TREE_CODE (op1) != SSA_NAME)
2985 {
2986 if (dump_enabled_p ())
2987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2988 "unsupported phi node definition.\n");
2989
2990 return NULL;
2991 }
2992
2993 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2994 if (gimple_bb (def1)
2995 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2996 && loop->inner
2997 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2998 && is_gimple_assign (def1)
2999 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3000 {
3001 if (dump_enabled_p ())
3002 report_vect_op (MSG_NOTE, def_stmt,
3003 "detected double reduction: ");
3004
3005 *double_reduc = true;
3006 return def_stmt_info;
3007 }
3008
3009 return NULL;
3010 }
3011
3012 /* If we are vectorizing an inner reduction, it is executed in its original
3013 order, so we only need to check whether reordering is safe when its
3014 result is also used outside the vectorized loop (a double reduction). */
3015 bool check_reduction = true;
3016 if (flow_loop_nested_p (vect_loop, loop))
3017 {
3018 gphi *lcphi;
3019 unsigned i;
3020 check_reduction = false;
3021 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3022 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3023 {
3024 gimple *use_stmt = USE_STMT (use_p);
3025 if (is_gimple_debug (use_stmt))
3026 continue;
3027 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3028 check_reduction = true;
3029 }
3030 }
3031
3032 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3033 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3034 code = orig_code = gimple_assign_rhs_code (def_stmt);
3035
3036 /* We can handle "res -= x[i]", which is non-associative, by
3037 simply rewriting it into "res += -x[i]". Avoid changing the
3038 gimple instruction for the first simple tests and only do this
3039 if we're allowed to change code at all. */
3040 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3041 code = PLUS_EXPR;
3042
3043 if (code == COND_EXPR)
3044 {
3045 if (! nested_in_vect_loop)
3046 *v_reduc_type = COND_REDUCTION;
3047
3048 op3 = gimple_assign_rhs1 (def_stmt);
3049 if (COMPARISON_CLASS_P (op3))
3050 {
3051 op4 = TREE_OPERAND (op3, 1);
3052 op3 = TREE_OPERAND (op3, 0);
3053 }
3054 if (op3 == phi_name || op4 == phi_name)
3055 {
3056 if (dump_enabled_p ())
3057 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3058 "reduction: condition depends on previous"
3059 " iteration: ");
3060 return NULL;
3061 }
3062
3063 op1 = gimple_assign_rhs2 (def_stmt);
3064 op2 = gimple_assign_rhs3 (def_stmt);
3065 }
3066 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3067 {
3068 if (dump_enabled_p ())
3069 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3070 "reduction: not commutative/associative: ");
3071 return NULL;
3072 }
3073 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3074 {
3075 op1 = gimple_assign_rhs1 (def_stmt);
3076 op2 = gimple_assign_rhs2 (def_stmt);
3077 }
3078 else
3079 {
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082 "reduction: not handled operation: ");
3083 return NULL;
3084 }
3085
3086 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3087 {
3088 if (dump_enabled_p ())
3089 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3090 "reduction: both uses not ssa_names: ");
3091
3092 return NULL;
3093 }
3094
3095 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3096 if ((TREE_CODE (op1) == SSA_NAME
3097 && !types_compatible_p (type,TREE_TYPE (op1)))
3098 || (TREE_CODE (op2) == SSA_NAME
3099 && !types_compatible_p (type, TREE_TYPE (op2)))
3100 || (op3 && TREE_CODE (op3) == SSA_NAME
3101 && !types_compatible_p (type, TREE_TYPE (op3)))
3102 || (op4 && TREE_CODE (op4) == SSA_NAME
3103 && !types_compatible_p (type, TREE_TYPE (op4))))
3104 {
3105 if (dump_enabled_p ())
3106 {
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "reduction: multiple types: operation type: ");
3109 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3110 dump_printf (MSG_NOTE, ", operands types: ");
3111 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3112 TREE_TYPE (op1));
3113 dump_printf (MSG_NOTE, ",");
3114 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3115 TREE_TYPE (op2));
3116 if (op3)
3117 {
3118 dump_printf (MSG_NOTE, ",");
3119 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3120 TREE_TYPE (op3));
3121 }
3122
3123 if (op4)
3124 {
3125 dump_printf (MSG_NOTE, ",");
3126 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3127 TREE_TYPE (op4));
3128 }
3129 dump_printf (MSG_NOTE, "\n");
3130 }
3131
3132 return NULL;
3133 }
3134
3135 /* Check whether it's ok to change the order of the computation.
3136 Generally, when vectorizing a reduction we change the order of the
3137 computation. This may change the behavior of the program in some
3138 cases, so we need to check that this is ok. One exception is when
3139 vectorizing an outer-loop: the inner-loop is executed sequentially,
3140 and therefore vectorizing reductions in the inner-loop during
3141 outer-loop vectorization is safe. */
3142 if (check_reduction
3143 && *v_reduc_type == TREE_CODE_REDUCTION
3144 && needs_fold_left_reduction_p (type, code,
3145 need_wrapping_integral_overflow))
3146 *v_reduc_type = FOLD_LEFT_REDUCTION;
3147
3148 /* Reduction is safe. We're dealing with one of the following:
3149 1) integer arithmetic and no trapv
3150 2) floating point arithmetic, and special flags permit this optimization
3151 3) nested cycle (i.e., outer loop vectorization). */
3152 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3153 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3154 if (code != COND_EXPR && !def1_info && !def2_info)
3155 {
3156 if (dump_enabled_p ())
3157 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3158 return NULL;
3159 }
3160
3161 /* Check that one def is the reduction def, defined by PHI,
3162 the other def is either defined in the loop ("vect_internal_def"),
3163 or it's an induction (defined by a loop-header phi-node). */
3164
3165 if (def2_info
3166 && def2_info->stmt == phi
3167 && (code == COND_EXPR
3168 || !def1_info
3169 || vect_valid_reduction_input_p (def1_info)))
3170 {
3171 if (dump_enabled_p ())
3172 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3173 return def_stmt_info;
3174 }
3175
3176 if (def1_info
3177 && def1_info->stmt == phi
3178 && (code == COND_EXPR
3179 || !def2_info
3180 || vect_valid_reduction_input_p (def2_info)))
3181 {
3182 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3183 {
3184 /* Check if we can swap operands (just for simplicity - so that
3185 the rest of the code can assume that the reduction variable
3186 is always the last (second) argument). */
3187 if (code == COND_EXPR)
3188 {
3189 /* Swap cond_expr by inverting the condition. */
3190 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3191 enum tree_code invert_code = ERROR_MARK;
3192 enum tree_code cond_code = TREE_CODE (cond_expr);
3193
3194 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3195 {
3196 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3197 invert_code = invert_tree_comparison (cond_code, honor_nans);
3198 }
3199 if (invert_code != ERROR_MARK)
3200 {
3201 TREE_SET_CODE (cond_expr, invert_code);
3202 swap_ssa_operands (def_stmt,
3203 gimple_assign_rhs2_ptr (def_stmt),
3204 gimple_assign_rhs3_ptr (def_stmt));
3205 }
3206 else
3207 {
3208 if (dump_enabled_p ())
3209 report_vect_op (MSG_NOTE, def_stmt,
3210 "detected reduction: cannot swap operands "
3211 "for cond_expr");
3212 return NULL;
3213 }
3214 }
3215 else
3216 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3217 gimple_assign_rhs2_ptr (def_stmt));
3218
3219 if (dump_enabled_p ())
3220 report_vect_op (MSG_NOTE, def_stmt,
3221 "detected reduction: need to swap operands: ");
3222
3223 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3224 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3225 }
3226 else
3227 {
3228 if (dump_enabled_p ())
3229 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3230 }
3231
3232 return def_stmt_info;
3233 }
3234
3235 /* Try to find SLP reduction chain. */
3236 if (! nested_in_vect_loop
3237 && code != COND_EXPR
3238 && orig_code != MINUS_EXPR
3239 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3240 {
3241 if (dump_enabled_p ())
3242 report_vect_op (MSG_NOTE, def_stmt,
3243 "reduction: detected reduction chain: ");
3244
3245 return def_stmt_info;
3246 }
3247
3248 /* Dissolve any group that vect_is_slp_reduction may have half-built. */
3249 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3250 while (first)
3251 {
3252 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3253 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3254 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3255 first = next;
3256 }
3257
3258 /* Look for the expression computing loop_arg from loop PHI result. */
3259 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3260 return def_stmt_info;
3261
3262 if (dump_enabled_p ())
3263 {
3264 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3265 "reduction: unknown pattern: ");
3266 }
3267
3268 return NULL;
3269 }
3270
3271 /* Wrapper around vect_is_simple_reduction, which will modify code
3272 in-place if it enables detection of more reductions. Arguments
3273 as there. */
3274
3275 stmt_vec_info
3276 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3277 bool *double_reduc,
3278 bool need_wrapping_integral_overflow)
3279 {
3280 enum vect_reduction_type v_reduc_type;
3281 stmt_vec_info def_info
3282 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3283 need_wrapping_integral_overflow,
3284 &v_reduc_type);
3285 if (def_info)
3286 {
3287 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3288 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3289 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3290 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3291 }
3292 return def_info;
3293 }
3294
3295 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
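/* For example (illustrative numbers): with an assumed VF of 8, a known
   iteration count of 23 and a prologue peel of 3 iterations, the epilogue
   peels (23 - 3) % 8 == 4 iterations, and each scalar statement's cost is
   counted 3 times in the prologue cost vector and 4 times in the epilogue
   cost vector.  */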
3296 int
3297 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3298 int *peel_iters_epilogue,
3299 stmt_vector_for_cost *scalar_cost_vec,
3300 stmt_vector_for_cost *prologue_cost_vec,
3301 stmt_vector_for_cost *epilogue_cost_vec)
3302 {
3303 int retval = 0;
3304 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3305
3306 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3307 {
3308 *peel_iters_epilogue = assumed_vf / 2;
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_NOTE, vect_location,
3311 "cost model: epilogue peel iters set to vf/2 "
3312 "because loop iterations are unknown .\n");
3313
3314 /* If peeled iterations are known but the number of scalar loop
3315 iterations is unknown, count a taken branch per peeled loop. */
3316 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3317 NULL, 0, vect_prologue);
3318 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3319 NULL, 0, vect_epilogue);
3320 }
3321 else
3322 {
3323 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3324 peel_iters_prologue = niters < peel_iters_prologue ?
3325 niters : peel_iters_prologue;
3326 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3327 /* If we need to peel for gaps but the epilogue would otherwise peel no
3328 iterations, we have to peel VF iterations instead. */
3329 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3330 *peel_iters_epilogue = assumed_vf;
3331 }
3332
3333 stmt_info_for_cost *si;
3334 int j;
3335 if (peel_iters_prologue)
3336 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3337 retval += record_stmt_cost (prologue_cost_vec,
3338 si->count * peel_iters_prologue,
3339 si->kind, si->stmt_info, si->misalign,
3340 vect_prologue);
3341 if (*peel_iters_epilogue)
3342 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3343 retval += record_stmt_cost (epilogue_cost_vec,
3344 si->count * *peel_iters_epilogue,
3345 si->kind, si->stmt_info, si->misalign,
3346 vect_epilogue);
3347
3348 return retval;
3349 }
3350
3351 /* Function vect_estimate_min_profitable_iters
3352
3353 Return the number of iterations required for the vector version of the
3354 loop to be profitable relative to the cost of the scalar version of the
3355 loop.
3356
3357 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3358 of iterations for vectorization. A value of -1 means loop vectorization
3359 is not profitable. This returned value may be used for a dynamic
3360 profitability check.
3361
3362 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3363 for static check against estimated number of iterations. */
3364
3365 static void
3366 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3367 int *ret_min_profitable_niters,
3368 int *ret_min_profitable_estimate)
3369 {
3370 int min_profitable_iters;
3371 int min_profitable_estimate;
3372 int peel_iters_prologue;
3373 int peel_iters_epilogue;
3374 unsigned vec_inside_cost = 0;
3375 int vec_outside_cost = 0;
3376 unsigned vec_prologue_cost = 0;
3377 unsigned vec_epilogue_cost = 0;
3378 int scalar_single_iter_cost = 0;
3379 int scalar_outside_cost = 0;
3380 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3381 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3382 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3383
3384 /* Cost model disabled. */
3385 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3386 {
3387 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3388 *ret_min_profitable_niters = 0;
3389 *ret_min_profitable_estimate = 0;
3390 return;
3391 }
3392
3393 /* Requires loop versioning tests to handle misalignment. */
3394 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3395 {
3396 /* FIXME: Make cost depend on complexity of individual check. */
3397 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3398 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3399 vect_prologue);
3400 dump_printf (MSG_NOTE,
3401 "cost model: Adding cost of checks for loop "
3402 "versioning to treat misalignment.\n");
3403 }
3404
3405 /* Requires loop versioning with alias checks. */
3406 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3407 {
3408 /* FIXME: Make cost depend on complexity of individual check. */
3409 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3410 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3411 vect_prologue);
3412 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3413 if (len)
3414 /* Count LEN - 1 ANDs and LEN comparisons. */
3415 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3416 NULL, 0, vect_prologue);
3417 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3418 if (len)
3419 {
3420 /* Count LEN - 1 ANDs and LEN comparisons. */
3421 unsigned int nstmts = len * 2 - 1;
3422 /* +1 for each bias that needs adding. */
3423 for (unsigned int i = 0; i < len; ++i)
3424 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3425 nstmts += 1;
3426 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3427 NULL, 0, vect_prologue);
3428 }
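	  /* E.g. (illustrative count): three lower-bound checks give
	     3 * 2 - 1 == 5 statements (2 ANDs and 3 comparisons), plus one
	     extra statement for each bound that is signed and therefore
	     needs a bias added.  */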
3429 dump_printf (MSG_NOTE,
3430 "cost model: Adding cost of checks for loop "
3431 "versioning aliasing.\n");
3432 }
3433
3434 /* Requires loop versioning with niter checks. */
3435 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3436 {
3437 /* FIXME: Make cost depend on complexity of individual check. */
3438 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3439 vect_prologue);
3440 dump_printf (MSG_NOTE,
3441 "cost model: Adding cost of checks for loop "
3442 "versioning niters.\n");
3443 }
3444
3445 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3446 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3447 vect_prologue);
3448
3449 /* Count statements in scalar loop. Using this as scalar cost for a single
3450 iteration for now.
3451
3452 TODO: Add outer loop support.
3453
3454 TODO: Consider assigning different costs to different scalar
3455 statements. */
3456
3457 scalar_single_iter_cost
3458 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3459
3460 /* Add additional cost for the peeled instructions in prologue and epilogue
3461 loop. (For fully-masked loops there will be no peeling.)
3462
3463 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3464 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3465
3466 TODO: Build an expression that represents peel_iters for prologue and
3467 epilogue to be used in a run-time test. */
3468
3469 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3470 {
3471 peel_iters_prologue = 0;
3472 peel_iters_epilogue = 0;
3473
3474 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3475 {
3476 /* We need to peel exactly one iteration. */
3477 peel_iters_epilogue += 1;
3478 stmt_info_for_cost *si;
3479 int j;
3480 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3481 j, si)
3482 (void) add_stmt_cost (target_cost_data, si->count,
3483 si->kind, si->stmt_info, si->misalign,
3484 vect_epilogue);
3485 }
3486 }
3487 else if (npeel < 0)
3488 {
3489 peel_iters_prologue = assumed_vf / 2;
3490 dump_printf (MSG_NOTE, "cost model: "
3491 "prologue peel iters set to vf/2.\n");
3492
3493 /* If peeling for alignment is unknown, the loop bound of the main loop
3494 becomes unknown. */
3495 peel_iters_epilogue = assumed_vf / 2;
3496 dump_printf (MSG_NOTE, "cost model: "
3497 "epilogue peel iters set to vf/2 because "
3498 "peeling for alignment is unknown.\n");
3499
3500 /* If peeled iterations are unknown, count a taken branch and a not taken
3501 branch per peeled loop. Even if scalar loop iterations are known,
3502 vector iterations are not known since peeled prologue iterations are
3503 not known. Hence guards remain the same. */
3504 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3505 NULL, 0, vect_prologue);
3506 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3507 NULL, 0, vect_prologue);
3508 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3509 NULL, 0, vect_epilogue);
3510 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3511 NULL, 0, vect_epilogue);
3512 stmt_info_for_cost *si;
3513 int j;
3514 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3515 {
3516 (void) add_stmt_cost (target_cost_data,
3517 si->count * peel_iters_prologue,
3518 si->kind, si->stmt_info, si->misalign,
3519 vect_prologue);
3520 (void) add_stmt_cost (target_cost_data,
3521 si->count * peel_iters_epilogue,
3522 si->kind, si->stmt_info, si->misalign,
3523 vect_epilogue);
3524 }
3525 }
3526 else
3527 {
3528 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3529 stmt_info_for_cost *si;
3530 int j;
3531 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3532
3533 prologue_cost_vec.create (2);
3534 epilogue_cost_vec.create (2);
3535 peel_iters_prologue = npeel;
3536
3537 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3538 &peel_iters_epilogue,
3539 &LOOP_VINFO_SCALAR_ITERATION_COST
3540 (loop_vinfo),
3541 &prologue_cost_vec,
3542 &epilogue_cost_vec);
3543
3544 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3545 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3546 si->misalign, vect_prologue);
3547
3548 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3549 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3550 si->misalign, vect_epilogue);
3551
3552 prologue_cost_vec.release ();
3553 epilogue_cost_vec.release ();
3554 }
3555
3556 /* FORNOW: The scalar outside cost is incremented in one of the
3557 following ways:
3558
3559 1. The vectorizer checks for alignment and aliasing and generates
3560 a condition that allows dynamic vectorization. A cost model
3561      check is ANDed with the versioning condition.  Hence the scalar
3562      code path now has the added cost of the versioning check.
3563
3564 if (cost > th & versioning_check)
3565 jmp to vector code
3566
3567      Hence the run-time scalar cost is incremented by a not-taken branch cost.
3568
3569 2. The vectorizer then checks if a prologue is required. If the
3570 cost model check was not done before during versioning, it has to
3571 be done before the prologue check.
3572
3573 if (cost <= th)
3574 prologue = scalar_iters
3575 if (prologue == 0)
3576 jmp to vector code
3577 else
3578 execute prologue
3579 if (prologue == num_iters)
3580 go to exit
3581
3582      Hence the run-time scalar cost is incremented by two taken branches
3583      plus a not-taken branch.
3584
3585 3. The vectorizer then checks if an epilogue is required. If the
3586 cost model check was not done before during prologue check, it
3587 has to be done with the epilogue check.
3588
3589 if (prologue == 0)
3590 jmp to vector code
3591 else
3592 execute prologue
3593 if (prologue == num_iters)
3594 go to exit
3595 vector code:
3596 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3597 jmp to epilogue
3598
3599 Hence the run-time scalar cost should be incremented by 2 taken
3600 branches.
3601
3602      TODO: The back end may reorder the BBs differently and reverse
3603 conditions/branch directions. Change the estimates below to
3604 something more reasonable. */
3605
3606 /* If the number of iterations is known and we do not do versioning, we can
3607 decide whether to vectorize at compile time. Hence the scalar version
3608      does not carry cost model guard costs.  */
3609 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3610 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3611 {
3612 /* Cost model check occurs at versioning. */
3613 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3614 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3615 else
3616 {
3617 /* Cost model check occurs at prologue generation. */
3618 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3619 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3620 + vect_get_stmt_cost (cond_branch_not_taken);
3621 /* Cost model check occurs at epilogue generation. */
3622 else
3623 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3624 }
3625 }
3626
3627 /* Complete the target-specific cost calculations. */
3628 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3629 &vec_inside_cost, &vec_epilogue_cost);
3630
3631 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3632
3633 if (dump_enabled_p ())
3634 {
3635 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3636 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3637 vec_inside_cost);
3638 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3639 vec_prologue_cost);
3640 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3641 vec_epilogue_cost);
3642 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3643 scalar_single_iter_cost);
3644 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3645 scalar_outside_cost);
3646 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3647 vec_outside_cost);
3648 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3649 peel_iters_prologue);
3650 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3651 peel_iters_epilogue);
3652 }
3653
3654 /* Calculate number of iterations required to make the vector version
3655 profitable, relative to the loop bodies only. The following condition
3656 must hold true:
3657 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3658 where
3659 SIC = scalar iteration cost, VIC = vector iteration cost,
3660 VOC = vector outside cost, VF = vectorization factor,
3661 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3662 SOC = scalar outside cost for run time cost model check. */
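     A sketch of the algebra behind the computation below: multiplying the
     condition above by VF and rearranging (assuming SIC * VF > VIC, the
     case handled here) gives

       niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
		/ (SIC * VF - VIC)

     The code performs the truncating division and then increments the
     result once more if the inequality is still not satisfied.  */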
3663
3664 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3665 {
3666 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3667 * assumed_vf
3668 - vec_inside_cost * peel_iters_prologue
3669 - vec_inside_cost * peel_iters_epilogue);
3670 if (min_profitable_iters <= 0)
3671 min_profitable_iters = 0;
3672 else
3673 {
3674 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3675 - vec_inside_cost);
3676
3677 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3678 <= (((int) vec_inside_cost * min_profitable_iters)
3679 + (((int) vec_outside_cost - scalar_outside_cost)
3680 * assumed_vf)))
3681 min_profitable_iters++;
3682 }
3683 }
3684   /* The vector version will never be profitable.  */
3685 else
3686 {
3687 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3688 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3689 "vectorization did not happen for a simd loop");
3690
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3693 "cost model: the vector iteration cost = %d "
3694 "divided by the scalar iteration cost = %d "
3695 			 "is greater than or equal to the vectorization factor = %d"
3696 ".\n",
3697 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3698 *ret_min_profitable_niters = -1;
3699 *ret_min_profitable_estimate = -1;
3700 return;
3701 }
3702
3703 dump_printf (MSG_NOTE,
3704 " Calculated minimum iters for profitability: %d\n",
3705 min_profitable_iters);
3706
3707 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3708 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3709 /* We want the vectorized loop to execute at least once. */
3710 min_profitable_iters = assumed_vf + peel_iters_prologue;
3711
3712 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location,
3714 " Runtime profitability threshold = %d\n",
3715 min_profitable_iters);
3716
3717 *ret_min_profitable_niters = min_profitable_iters;
3718
3719 /* Calculate number of iterations required to make the vector version
3720 profitable, relative to the loop bodies only.
3721
3722      The non-vectorized variant costs SIC * niters and it must win over the
3723      vector variant on the expected loop trip count.  The following must hold:
3724 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
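  /* As with the runtime threshold above, rearranging this condition
     (assuming SIC * VF > VIC) gives

       niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
		/ (SIC * VF - VIC)

     which is the quotient computed in the else branch below.  */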
3725
3726 if (vec_outside_cost <= 0)
3727 min_profitable_estimate = 0;
3728 else
3729 {
3730 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3731 * assumed_vf
3732 - vec_inside_cost * peel_iters_prologue
3733 - vec_inside_cost * peel_iters_epilogue)
3734 / ((scalar_single_iter_cost * assumed_vf)
3735 - vec_inside_cost);
3736 }
3737 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 " Static estimate profitability threshold = %d\n",
3741 min_profitable_estimate);
3742
3743 *ret_min_profitable_estimate = min_profitable_estimate;
3744 }
3745
3746 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3747 vector elements (not bits) for a vector with NELT elements. */
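/* For instance, OFFSET == 2 with NELT == 8 pushes {2, 3, 4}, which the
   single stepped encoding extends to the selector {2, 3, 4, 5, 6, 7, 8, 9};
   indices 8 and 9 then select elements of the second input when the mask is
   used for a two-input permutation, as in have_whole_vector_shift below.  */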
3748 static void
3749 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3750 vec_perm_builder *sel)
3751 {
3752 /* The encoding is a single stepped pattern. Any wrap-around is handled
3753 by vec_perm_indices. */
3754 sel->new_vector (nelt, 1, 3);
3755 for (unsigned int i = 0; i < 3; i++)
3756 sel->quick_push (i + offset);
3757 }
3758
3759 /* Checks whether the target supports whole-vector shifts for vectors of mode
3760 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3761 it supports vec_perm_const with masks for all necessary shift amounts. */
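/* Note that only the power-of-two shift amounts NELT/2, NELT/4, ..., 1 are
   checked below; those are the offsets needed by the log2-step reduction
   epilogue.  */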
3762 static bool
3763 have_whole_vector_shift (machine_mode mode)
3764 {
3765 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3766 return true;
3767
3768 /* Variable-length vectors should be handled via the optab. */
3769 unsigned int nelt;
3770 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3771 return false;
3772
3773 vec_perm_builder sel;
3774 vec_perm_indices indices;
3775 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3776 {
3777 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3778 indices.new_vector (sel, 2, nelt);
3779 if (!can_vec_perm_const_p (mode, indices, false))
3780 return false;
3781 }
3782 return true;
3783 }
3784
3785 /* TODO: There is a close dependency between the vect_model_*_cost and
3786    vectorizable_* functions; design this better to avoid maintenance issues.  */
3787
3788 /* Function vect_model_reduction_cost.
3789
3790 Models cost for a reduction operation, including the vector ops
3791 generated within the strip-mine loop, the initial definition before
3792 the loop, and the epilogue code that must be generated. */
3793
3794 static void
3795 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3796 int ncopies, stmt_vector_for_cost *cost_vec)
3797 {
3798 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3799 enum tree_code code;
3800 optab optab;
3801 tree vectype;
3802 machine_mode mode;
3803 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3804 struct loop *loop = NULL;
3805
3806 if (loop_vinfo)
3807 loop = LOOP_VINFO_LOOP (loop_vinfo);
3808
3809 /* Condition reductions generate two reductions in the loop. */
3810 vect_reduction_type reduction_type
3811 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3812 if (reduction_type == COND_REDUCTION)
3813 ncopies *= 2;
3814
3815 vectype = STMT_VINFO_VECTYPE (stmt_info);
3816 mode = TYPE_MODE (vectype);
3817 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3818
3819 if (!orig_stmt_info)
3820 orig_stmt_info = stmt_info;
3821
3822 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3823
3824 if (reduction_type == EXTRACT_LAST_REDUCTION
3825 || reduction_type == FOLD_LEFT_REDUCTION)
3826 {
3827 /* No extra instructions needed in the prologue. */
3828 prologue_cost = 0;
3829
3830 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3831 /* Count one reduction-like operation per vector. */
3832 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3833 stmt_info, 0, vect_body);
3834 else
3835 {
3836 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3837 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3838 inside_cost = record_stmt_cost (cost_vec, nelements,
3839 vec_to_scalar, stmt_info, 0,
3840 vect_body);
3841 inside_cost += record_stmt_cost (cost_vec, nelements,
3842 scalar_stmt, stmt_info, 0,
3843 vect_body);
3844 }
3845 }
3846 else
3847 {
3848 /* Add in cost for initial definition.
3849 For cond reduction we have four vectors: initial index, step,
3850 initial result of the data reduction, initial value of the index
3851 reduction. */
3852 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3853 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3854 scalar_to_vec, stmt_info, 0,
3855 vect_prologue);
3856
3857 /* Cost of reduction op inside loop. */
3858 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3859 stmt_info, 0, vect_body);
3860 }
3861
3862 /* Determine cost of epilogue code.
3863
3864 We have a reduction operator that will reduce the vector in one statement.
3865 Also requires scalar extract. */
3866
3867 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3868 {
3869 if (reduc_fn != IFN_LAST)
3870 {
3871 if (reduction_type == COND_REDUCTION)
3872 {
3873 	      /* An EQ stmt and a COND_EXPR stmt.  */
3874 epilogue_cost += record_stmt_cost (cost_vec, 2,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3877 /* Reduction of the max index and a reduction of the found
3878 values. */
3879 epilogue_cost += record_stmt_cost (cost_vec, 2,
3880 vec_to_scalar, stmt_info, 0,
3881 vect_epilogue);
3882 /* A broadcast of the max value. */
3883 epilogue_cost += record_stmt_cost (cost_vec, 1,
3884 scalar_to_vec, stmt_info, 0,
3885 vect_epilogue);
3886 }
3887 else
3888 {
3889 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3890 stmt_info, 0, vect_epilogue);
3891 epilogue_cost += record_stmt_cost (cost_vec, 1,
3892 vec_to_scalar, stmt_info, 0,
3893 vect_epilogue);
3894 }
3895 }
3896 else if (reduction_type == COND_REDUCTION)
3897 {
3898 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3899 /* Extraction of scalar elements. */
3900 epilogue_cost += record_stmt_cost (cost_vec,
3901 2 * estimated_nunits,
3902 vec_to_scalar, stmt_info, 0,
3903 vect_epilogue);
3904 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3905 epilogue_cost += record_stmt_cost (cost_vec,
3906 2 * estimated_nunits - 3,
3907 scalar_stmt, stmt_info, 0,
3908 vect_epilogue);
3909 }
3910 else if (reduction_type == EXTRACT_LAST_REDUCTION
3911 || reduction_type == FOLD_LEFT_REDUCTION)
3912 	/* No extra instructions are needed in the epilogue.  */
3913 ;
3914 else
3915 {
3916 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3917 tree bitsize =
3918 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3919 int element_bitsize = tree_to_uhwi (bitsize);
3920 int nelements = vec_size_in_bits / element_bitsize;
3921
3922 if (code == COND_EXPR)
3923 code = MAX_EXPR;
3924
3925 optab = optab_for_tree_code (code, vectype, optab_default);
3926
3927 /* We have a whole vector shift available. */
3928 if (optab != unknown_optab
3929 && VECTOR_MODE_P (mode)
3930 && optab_handler (optab, mode) != CODE_FOR_nothing
3931 && have_whole_vector_shift (mode))
3932 {
3933 /* Final reduction via vector shifts and the reduction operator.
3934 Also requires scalar extract. */
3935 epilogue_cost += record_stmt_cost (cost_vec,
3936 exact_log2 (nelements) * 2,
3937 vector_stmt, stmt_info, 0,
3938 vect_epilogue);
3939 epilogue_cost += record_stmt_cost (cost_vec, 1,
3940 vec_to_scalar, stmt_info, 0,
3941 vect_epilogue);
3942 }
3943 else
3944 /* Use extracts and reduction op for final reduction. For N
3945 elements, we have N extracts and N-1 reduction ops. */
3946 epilogue_cost += record_stmt_cost (cost_vec,
3947 nelements + nelements - 1,
3948 vector_stmt, stmt_info, 0,
3949 vect_epilogue);
3950 }
3951 }
3952
3953 if (dump_enabled_p ())
3954 dump_printf (MSG_NOTE,
3955 "vect_model_reduction_cost: inside_cost = %d, "
3956 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3957 prologue_cost, epilogue_cost);
3958 }
3959
3960
3961 /* Function vect_model_induction_cost.
3962
3963 Models cost for induction operations. */
3964
3965 static void
3966 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3967 stmt_vector_for_cost *cost_vec)
3968 {
3969 unsigned inside_cost, prologue_cost;
3970
3971 if (PURE_SLP_STMT (stmt_info))
3972 return;
3973
3974 /* loop cost for vec_loop. */
3975 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3976 stmt_info, 0, vect_body);
3977
3978 /* prologue cost for vec_init and vec_step. */
3979 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3980 stmt_info, 0, vect_prologue);
3981
3982 if (dump_enabled_p ())
3983 dump_printf_loc (MSG_NOTE, vect_location,
3984 "vect_model_induction_cost: inside_cost = %d, "
3985 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3986 }
3987
3988
3989
3990 /* Function get_initial_def_for_reduction
3991
3992 Input:
3993 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3994 INIT_VAL - the initial value of the reduction variable
3995
3996 Output:
3997 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3998 of the reduction (used for adjusting the epilog - see below).
3999 Return a vector variable, initialized according to the operation that
4000 STMT_VINFO performs. This vector will be used as the initial value
4001 of the vector of partial results.
4002
4003 Option1 (adjust in epilog): Initialize the vector as follows:
4004 add/bit or/xor: [0,0,...,0,0]
4005 mult/bit and: [1,1,...,1,1]
4006 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4007 and when necessary (e.g. add/mult case) let the caller know
4008 that it needs to adjust the result by init_val.
4009
4010 Option2: Initialize the vector as follows:
4011 add/bit or/xor: [init_val,0,0,...,0]
4012 mult/bit and: [init_val,1,1,...,1]
4013 min/max/cond_expr: [init_val,init_val,...,init_val]
4014 and no adjustments are needed.
4015
4016 For example, for the following code:
4017
4018 s = init_val;
4019 for (i=0;i<n;i++)
4020 s = s + a[i];
4021
4022 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4023 For a vector of 4 units, we want to return either [0,0,0,init_val],
4024 or [0,0,0,0] and let the caller know that it needs to adjust
4025 the result at the end by 'init_val'.
4026
4027    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4028    is not NULL, because this way the initialization vector is simpler (same
4029    element in all entries), and Option2 otherwise.
4030
4031 A cost model should help decide between these two schemes. */
4032
4033 tree
4034 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4035 tree *adjustment_def)
4036 {
4037 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4038 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4039 tree scalar_type = TREE_TYPE (init_val);
4040 tree vectype = get_vectype_for_scalar_type (scalar_type);
4041 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4042 tree def_for_init;
4043 tree init_def;
4044 REAL_VALUE_TYPE real_init_val = dconst0;
4045 int int_init_val = 0;
4046 gimple_seq stmts = NULL;
4047
4048 gcc_assert (vectype);
4049
4050 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4051 || SCALAR_FLOAT_TYPE_P (scalar_type));
4052
4053 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4054 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4055
4056 vect_reduction_type reduction_type
4057 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4058
4059 switch (code)
4060 {
4061 case WIDEN_SUM_EXPR:
4062 case DOT_PROD_EXPR:
4063 case SAD_EXPR:
4064 case PLUS_EXPR:
4065 case MINUS_EXPR:
4066 case BIT_IOR_EXPR:
4067 case BIT_XOR_EXPR:
4068 case MULT_EXPR:
4069 case BIT_AND_EXPR:
4070 {
4071 /* ADJUSTMENT_DEF is NULL when called from
4072 vect_create_epilog_for_reduction to vectorize double reduction. */
4073 if (adjustment_def)
4074 *adjustment_def = init_val;
4075
4076 if (code == MULT_EXPR)
4077 {
4078 real_init_val = dconst1;
4079 int_init_val = 1;
4080 }
4081
4082 if (code == BIT_AND_EXPR)
4083 int_init_val = -1;
4084
4085 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4086 def_for_init = build_real (scalar_type, real_init_val);
4087 else
4088 def_for_init = build_int_cst (scalar_type, int_init_val);
4089
4090 if (adjustment_def)
4091 /* Option1: the first element is '0' or '1' as well. */
4092 init_def = gimple_build_vector_from_val (&stmts, vectype,
4093 def_for_init);
4094 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4095 {
4096 /* Option2 (variable length): the first element is INIT_VAL. */
4097 init_def = gimple_build_vector_from_val (&stmts, vectype,
4098 def_for_init);
4099 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4100 vectype, init_def, init_val);
4101 }
4102 else
4103 {
4104 /* Option2: the first element is INIT_VAL. */
4105 tree_vector_builder elts (vectype, 1, 2);
4106 elts.quick_push (init_val);
4107 elts.quick_push (def_for_init);
4108 init_def = gimple_build_vector (&stmts, &elts);
4109 }
4110 }
4111 break;
4112
4113 case MIN_EXPR:
4114 case MAX_EXPR:
4115 case COND_EXPR:
4116 {
4117 if (adjustment_def)
4118 {
4119 *adjustment_def = NULL_TREE;
4120 if (reduction_type != COND_REDUCTION
4121 && reduction_type != EXTRACT_LAST_REDUCTION)
4122 {
4123 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4124 break;
4125 }
4126 }
4127 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4128 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4129 }
4130 break;
4131
4132 default:
4133 gcc_unreachable ();
4134 }
4135
4136 if (stmts)
4137 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4138 return init_def;
4139 }
4140
4141 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4142 NUMBER_OF_VECTORS is the number of vector defs to create.
4143 If NEUTRAL_OP is nonnull, introducing extra elements of that
4144 value will not change the result. */
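/* For instance, 0 is such a neutral value for a PLUS reduction and 1 for a
   MULT reduction.  MIN and MAX have no universal neutral value; in that case
   NEUTRAL_OP is null and the initial value from each PHI is used instead
   (see the else branch below).  */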
4145
4146 static void
4147 get_initial_defs_for_reduction (slp_tree slp_node,
4148 vec<tree> *vec_oprnds,
4149 unsigned int number_of_vectors,
4150 bool reduc_chain, tree neutral_op)
4151 {
4152 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4153 stmt_vec_info stmt_vinfo = stmts[0];
4154 unsigned HOST_WIDE_INT nunits;
4155 unsigned j, number_of_places_left_in_vector;
4156 tree vector_type;
4157 tree vop;
4158 int group_size = stmts.length ();
4159 unsigned int vec_num, i;
4160 unsigned number_of_copies = 1;
4161 vec<tree> voprnds;
4162 voprnds.create (number_of_vectors);
4163 struct loop *loop;
4164 auto_vec<tree, 16> permute_results;
4165
4166 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4167
4168 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4169
4170 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4171 gcc_assert (loop);
4172 edge pe = loop_preheader_edge (loop);
4173
4174 gcc_assert (!reduc_chain || neutral_op);
4175
4176 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4177 created vectors. It is greater than 1 if unrolling is performed.
4178
4179 For example, we have two scalar operands, s1 and s2 (e.g., group of
4180 strided accesses of size two), while NUNITS is four (i.e., four scalars
4181 of this type can be packed in a vector). The output vector will contain
4182 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4183 will be 2).
4184
4185 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4186 vectors containing the operands.
4187
4188 For example, NUNITS is four as before, and the group size is 8
4189 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4190 {s5, s6, s7, s8}. */
4191
4192 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4193 nunits = group_size;
4194
4195 number_of_copies = nunits * number_of_vectors / group_size;
4196
4197 number_of_places_left_in_vector = nunits;
4198 bool constant_p = true;
4199 tree_vector_builder elts (vector_type, nunits, 1);
4200 elts.quick_grow (nunits);
4201 for (j = 0; j < number_of_copies; j++)
4202 {
4203 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4204 {
4205 tree op;
4206 	  /* Get the def before the loop.  In a reduction chain we have only
4207 one initial value. */
4208 if ((j != (number_of_copies - 1)
4209 || (reduc_chain && i != 0))
4210 && neutral_op)
4211 op = neutral_op;
4212 else
4213 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4214
4215 /* Create 'vect_ = {op0,op1,...,opn}'. */
4216 number_of_places_left_in_vector--;
4217 elts[number_of_places_left_in_vector] = op;
4218 if (!CONSTANT_CLASS_P (op))
4219 constant_p = false;
4220
4221 if (number_of_places_left_in_vector == 0)
4222 {
4223 gimple_seq ctor_seq = NULL;
4224 tree init;
4225 if (constant_p && !neutral_op
4226 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4227 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4228 /* Build the vector directly from ELTS. */
4229 init = gimple_build_vector (&ctor_seq, &elts);
4230 else if (neutral_op)
4231 {
4232 /* Build a vector of the neutral value and shift the
4233 other elements into place. */
4234 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4235 neutral_op);
4236 int k = nunits;
4237 while (k > 0 && elts[k - 1] == neutral_op)
4238 k -= 1;
4239 while (k > 0)
4240 {
4241 k -= 1;
4242 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4243 vector_type, init, elts[k]);
4244 }
4245 }
4246 else
4247 {
4248 /* First time round, duplicate ELTS to fill the
4249 required number of vectors, then cherry pick the
4250 appropriate result for each iteration. */
4251 if (vec_oprnds->is_empty ())
4252 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4253 number_of_vectors,
4254 permute_results);
4255 init = permute_results[number_of_vectors - j - 1];
4256 }
4257 if (ctor_seq != NULL)
4258 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4259 voprnds.quick_push (init);
4260
4261 number_of_places_left_in_vector = nunits;
4262 elts.new_vector (vector_type, nunits, 1);
4263 elts.quick_grow (nunits);
4264 constant_p = true;
4265 }
4266 }
4267 }
4268
4269 /* Since the vectors are created in the reverse order, we should invert
4270 them. */
4271 vec_num = voprnds.length ();
4272 for (j = vec_num; j != 0; j--)
4273 {
4274 vop = voprnds[j - 1];
4275 vec_oprnds->quick_push (vop);
4276 }
4277
4278 voprnds.release ();
4279
4280 /* In case that VF is greater than the unrolling factor needed for the SLP
4281 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4282 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4283 to replicate the vectors. */
4284 tree neutral_vec = NULL;
4285 while (number_of_vectors > vec_oprnds->length ())
4286 {
4287 if (neutral_op)
4288 {
4289 if (!neutral_vec)
4290 {
4291 gimple_seq ctor_seq = NULL;
4292 neutral_vec = gimple_build_vector_from_val
4293 (&ctor_seq, vector_type, neutral_op);
4294 if (ctor_seq != NULL)
4295 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4296 }
4297 vec_oprnds->quick_push (neutral_vec);
4298 }
4299 else
4300 {
4301 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4302 vec_oprnds->quick_push (vop);
4303 }
4304 }
4305 }
4306
4307
4308 /* Function vect_create_epilog_for_reduction
4309
4310 Create code at the loop-epilog to finalize the result of a reduction
4311 computation.
4312
4313 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4314 reduction statements.
4315 STMT_INFO is the scalar reduction stmt that is being vectorized.
4316 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4317 number of elements that we can fit in a vectype (nunits). In this case
4318 we have to generate more than one vector stmt - i.e - we need to "unroll"
4319 the vector stmt by a factor VF/nunits. For more details see documentation
4320 in vectorizable_operation.
4321 REDUC_FN is the internal function for the epilog reduction.
4322 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4323 computation.
4324 REDUC_INDEX is the index of the operand in the right hand side of the
4325 statement that is defined by REDUCTION_PHI.
4326 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4327 SLP_NODE is an SLP node containing a group of reduction statements. The
4328 first one in this group is STMT_INFO.
4329 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4330 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4331 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4332 any value of the IV in the loop.
4333 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4334 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4335 null if this is not an SLP reduction
4336
4337 This function:
4338 1. Creates the reduction def-use cycles: sets the arguments for
4339 REDUCTION_PHIS:
4340 The loop-entry argument is the vectorized initial-value of the reduction.
4341 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4342 sums.
4343 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4344 by calling the function specified by REDUC_FN if available, or by
4345 other means (whole-vector shifts or a scalar loop).
4346 The function also creates a new phi node at the loop exit to preserve
4347 loop-closed form, as illustrated below.
4348
4349 The flow at the entry to this function:
4350
4351 loop:
4352 vec_def = phi <null, null> # REDUCTION_PHI
4353 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4354 s_loop = scalar_stmt # (scalar) STMT_INFO
4355 loop_exit:
4356 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4357 use <s_out0>
4358 use <s_out0>
4359
4360 The above is transformed by this function into:
4361
4362 loop:
4363 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4364 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4365 s_loop = scalar_stmt # (scalar) STMT_INFO
4366 loop_exit:
4367 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4368 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4369 v_out2 = reduce <v_out1>
4370 s_out3 = extract_field <v_out2, 0>
4371 s_out4 = adjust_result <s_out3>
4372 use <s_out4>
4373 use <s_out4>
4374 */
4375
4376 static void
4377 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4378 stmt_vec_info stmt_info,
4379 gimple *reduc_def_stmt,
4380 int ncopies, internal_fn reduc_fn,
4381 vec<stmt_vec_info> reduction_phis,
4382 bool double_reduc,
4383 slp_tree slp_node,
4384 slp_instance slp_node_instance,
4385 tree induc_val, enum tree_code induc_code,
4386 tree neutral_op)
4387 {
4388 stmt_vec_info prev_phi_info;
4389 tree vectype;
4390 machine_mode mode;
4391 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4392 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4393 basic_block exit_bb;
4394 tree scalar_dest;
4395 tree scalar_type;
4396 gimple *new_phi = NULL, *phi;
4397 stmt_vec_info phi_info;
4398 gimple_stmt_iterator exit_gsi;
4399 tree vec_dest;
4400 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4401 gimple *epilog_stmt = NULL;
4402 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4403 gimple *exit_phi;
4404 tree bitsize;
4405 tree adjustment_def = NULL;
4406 tree vec_initial_def = NULL;
4407 tree expr, def, initial_def = NULL;
4408 tree orig_name, scalar_result;
4409 imm_use_iterator imm_iter, phi_imm_iter;
4410 use_operand_p use_p, phi_use_p;
4411 gimple *use_stmt;
4412 stmt_vec_info reduction_phi_info = NULL;
4413 bool nested_in_vect_loop = false;
4414 auto_vec<gimple *> new_phis;
4415 auto_vec<stmt_vec_info> inner_phis;
4416 int j, i;
4417 auto_vec<tree> scalar_results;
4418 unsigned int group_size = 1, k, ratio;
4419 auto_vec<tree> vec_initial_defs;
4420 auto_vec<gimple *> phis;
4421 bool slp_reduc = false;
4422 bool direct_slp_reduc;
4423 tree new_phi_result;
4424 stmt_vec_info inner_phi = NULL;
4425 tree induction_index = NULL_TREE;
4426
4427 if (slp_node)
4428 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4429
4430 if (nested_in_vect_loop_p (loop, stmt_info))
4431 {
4432 outer_loop = loop;
4433 loop = loop->inner;
4434 nested_in_vect_loop = true;
4435 gcc_assert (!slp_node);
4436 }
4437
4438 vectype = STMT_VINFO_VECTYPE (stmt_info);
4439 gcc_assert (vectype);
4440 mode = TYPE_MODE (vectype);
4441
4442 /* 1. Create the reduction def-use cycle:
4443 Set the arguments of REDUCTION_PHIS, i.e., transform
4444
4445 loop:
4446 vec_def = phi <null, null> # REDUCTION_PHI
4447 VECT_DEF = vector_stmt # vectorized form of STMT
4448 ...
4449
4450 into:
4451
4452 loop:
4453 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4454 VECT_DEF = vector_stmt # vectorized form of STMT
4455 ...
4456
4457 (in case of SLP, do it for all the phis). */
4458
4459 /* Get the loop-entry arguments. */
4460 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4461 if (slp_node)
4462 {
4463 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4464 vec_initial_defs.reserve (vec_num);
4465 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4466 &vec_initial_defs, vec_num,
4467 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4468 neutral_op);
4469 }
4470 else
4471 {
4472 /* Get at the scalar def before the loop, that defines the initial value
4473 of the reduction variable. */
4474 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4475 loop_preheader_edge (loop));
4476 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4477 and we can't use zero for induc_val, use initial_def. Similarly
4478 for REDUC_MIN and initial_def larger than the base. */
4479 if (TREE_CODE (initial_def) == INTEGER_CST
4480 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4481 == INTEGER_INDUC_COND_REDUCTION)
4482 && !integer_zerop (induc_val)
4483 && ((induc_code == MAX_EXPR
4484 && tree_int_cst_lt (initial_def, induc_val))
4485 || (induc_code == MIN_EXPR
4486 && tree_int_cst_lt (induc_val, initial_def))))
4487 induc_val = initial_def;
4488
4489 if (double_reduc)
4490 /* In case of double reduction we only create a vector variable
4491 to be put in the reduction phi node. The actual statement
4492 creation is done later in this function. */
4493 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4494 else if (nested_in_vect_loop)
4495 {
4496 /* Do not use an adjustment def as that case is not supported
4497 correctly if ncopies is not one. */
4498 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4499 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4500 stmt_info);
4501 }
4502 else
4503 vec_initial_def
4504 = get_initial_def_for_reduction (stmt_info, initial_def,
4505 &adjustment_def);
4506 vec_initial_defs.create (1);
4507 vec_initial_defs.quick_push (vec_initial_def);
4508 }
4509
4510 /* Set phi nodes arguments. */
4511 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4512 {
4513 tree vec_init_def = vec_initial_defs[i];
4514 tree def = vect_defs[i];
4515 for (j = 0; j < ncopies; j++)
4516 {
4517 if (j != 0)
4518 {
4519 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4520 if (nested_in_vect_loop)
4521 vec_init_def
4522 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4523 }
4524
4525 /* Set the loop-entry arg of the reduction-phi. */
4526
4527 gphi *phi = as_a <gphi *> (phi_info->stmt);
4528 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4529 == INTEGER_INDUC_COND_REDUCTION)
4530 {
4531 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4532 		 initial values from interfering with the reduction op.  */
4533 gcc_assert (ncopies == 1);
4534 gcc_assert (i == 0);
4535
4536 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4537 tree induc_val_vec
4538 = build_vector_from_val (vec_init_def_type, induc_val);
4539
4540 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4541 UNKNOWN_LOCATION);
4542 }
4543 else
4544 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4545 UNKNOWN_LOCATION);
4546
4547 /* Set the loop-latch arg for the reduction-phi. */
4548 if (j > 0)
4549 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4550
4551 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4552
4553 if (dump_enabled_p ())
4554 {
4555 dump_printf_loc (MSG_NOTE, vect_location,
4556 "transform reduction: created def-use cycle: ");
4557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4558 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4559 }
4560 }
4561 }
4562
4563 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4564 which is updated with the current index of the loop for every match of
4565 the original loop's cond_expr (VEC_STMT). This results in a vector
4566 containing the last time the condition passed for that vector lane.
4567 The first match will be a 1 to allow 0 to be used for non-matching
4568 indexes. If there are no matches at all then the vector will be all
4569 zeroes. */
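  /* As an illustrative sketch with a 4-lane vector: the index IV below
     produces {1,2,3,4} in the first vector iteration and {5,6,7,8} in the
     second.  If lane 0 matches in both iterations, lane 1 only in the first,
     lane 3 only in the second and lane 2 never, the phi holds {5, 2, 0, 8}
     after two iterations.  */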
4570 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4571 {
4572 tree indx_before_incr, indx_after_incr;
4573 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4574
4575 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4576 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4577
4578 int scalar_precision
4579 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4580 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4581 tree cr_index_vector_type = build_vector_type
4582 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4583
4584 /* First we create a simple vector induction variable which starts
4585 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4586 vector size (STEP). */
4587
4588 /* Create a {1,2,3,...} vector. */
4589 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4590
4591 /* Create a vector of the step value. */
4592 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4593 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4594
4595 /* Create an induction variable. */
4596 gimple_stmt_iterator incr_gsi;
4597 bool insert_after;
4598 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4599 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4600 insert_after, &indx_before_incr, &indx_after_incr);
4601
4602 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4603 filled with zeros (VEC_ZERO). */
4604
4605 /* Create a vector of 0s. */
4606 tree zero = build_zero_cst (cr_index_scalar_type);
4607 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4608
4609 /* Create a vector phi node. */
4610 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4611 new_phi = create_phi_node (new_phi_tree, loop->header);
4612 loop_vinfo->add_stmt (new_phi);
4613 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4614 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4615
4616       /* Now take the condition from the loop's original cond_expr
4617 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4618 every match uses values from the induction variable
4619 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4620 (NEW_PHI_TREE).
4621 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4622 the new cond_expr (INDEX_COND_EXPR). */
4623
4624 /* Duplicate the condition from vec_stmt. */
4625 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4626
4627       /* Create a conditional, where the condition is taken from vec_stmt
4628 	 (CCOMPARE), the then value is the induction index (INDEX_BEFORE_INCR)
4629 	 and the else value is the phi (NEW_PHI_TREE).  */
4630 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4631 ccompare, indx_before_incr,
4632 new_phi_tree);
4633 induction_index = make_ssa_name (cr_index_vector_type);
4634 gimple *index_condition = gimple_build_assign (induction_index,
4635 index_cond_expr);
4636 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4637 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4638 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4639
4640 /* Update the phi with the vec cond. */
4641 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4642 loop_latch_edge (loop), UNKNOWN_LOCATION);
4643 }
4644
4645 /* 2. Create epilog code.
4646 The reduction epilog code operates across the elements of the vector
4647 of partial results computed by the vectorized loop.
4648 The reduction epilog code consists of:
4649
4650 step 1: compute the scalar result in a vector (v_out2)
4651 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4652 step 3: adjust the scalar result (s_out3) if needed.
4653
4654      Step 1 can be accomplished using one of the following three schemes:
4655 (scheme 1) using reduc_fn, if available.
4656 (scheme 2) using whole-vector shifts, if available.
4657 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4658 combined.
4659
4660 The overall epilog code looks like this:
4661
4662 s_out0 = phi <s_loop> # original EXIT_PHI
4663 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4664 v_out2 = reduce <v_out1> # step 1
4665 s_out3 = extract_field <v_out2, 0> # step 2
4666 s_out4 = adjust_result <s_out3> # step 3
4667
4668 (step 3 is optional, and steps 1 and 2 may be combined).
4669 Lastly, the uses of s_out0 are replaced by s_out4. */
4670
4671
4672 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4673 v_out1 = phi <VECT_DEF>
4674 Store them in NEW_PHIS. */
4675
4676 exit_bb = single_exit (loop)->dest;
4677 prev_phi_info = NULL;
4678 new_phis.create (vect_defs.length ());
4679 FOR_EACH_VEC_ELT (vect_defs, i, def)
4680 {
4681 for (j = 0; j < ncopies; j++)
4682 {
4683 tree new_def = copy_ssa_name (def);
4684 phi = create_phi_node (new_def, exit_bb);
4685 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4686 if (j == 0)
4687 new_phis.quick_push (phi);
4688 else
4689 {
4690 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4691 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4692 }
4693
4694 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4695 prev_phi_info = phi_info;
4696 }
4697 }
4698
4699 /* The epilogue is created for the outer-loop, i.e., for the loop being
4700 vectorized. Create exit phis for the outer loop. */
4701 if (double_reduc)
4702 {
4703 loop = outer_loop;
4704 exit_bb = single_exit (loop)->dest;
4705 inner_phis.create (vect_defs.length ());
4706 FOR_EACH_VEC_ELT (new_phis, i, phi)
4707 {
4708 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4709 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4710 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4711 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4712 PHI_RESULT (phi));
4713 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4714 inner_phis.quick_push (phi_info);
4715 new_phis[i] = outer_phi;
4716 while (STMT_VINFO_RELATED_STMT (phi_info))
4717 {
4718 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4719 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4720 outer_phi = create_phi_node (new_result, exit_bb);
4721 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4722 PHI_RESULT (phi_info->stmt));
4723 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4724 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4725 prev_phi_info = outer_phi_info;
4726 }
4727 }
4728 }
4729
4730 exit_gsi = gsi_after_labels (exit_bb);
4731
4732 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4733 (i.e. when reduc_fn is not available) and in the final adjustment
4734 code (if needed). Also get the original scalar reduction variable as
4735      defined in the loop.  In case STMT is a "pattern-stmt" (i.e. it
4736 represents a reduction pattern), the tree-code and scalar-def are
4737 taken from the original stmt that the pattern-stmt (STMT) replaces.
4738 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4739 are taken from STMT. */
4740
4741 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4742 if (!orig_stmt_info)
4743 {
4744 /* Regular reduction */
4745 orig_stmt_info = stmt_info;
4746 }
4747 else
4748 {
4749 /* Reduction pattern */
4750 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4751 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4752 }
4753
4754 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4755 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4756 partial results are added and not subtracted. */
4757 if (code == MINUS_EXPR)
4758 code = PLUS_EXPR;
4759
4760 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4761 scalar_type = TREE_TYPE (scalar_dest);
4762 scalar_results.create (group_size);
4763 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4764 bitsize = TYPE_SIZE (scalar_type);
4765
4766 /* In case this is a reduction in an inner-loop while vectorizing an outer
4767 loop - we don't need to extract a single scalar result at the end of the
4768 inner-loop (unless it is double reduction, i.e., the use of reduction is
4769 outside the outer-loop). The final vector of partial results will be used
4770 in the vectorized outer-loop, or reduced to a scalar result at the end of
4771 the outer-loop. */
4772 if (nested_in_vect_loop && !double_reduc)
4773 goto vect_finalize_reduction;
4774
4775 /* SLP reduction without reduction chain, e.g.,
4776 # a1 = phi <a2, a0>
4777 # b1 = phi <b2, b0>
4778 a2 = operation (a1)
4779 b2 = operation (b1) */
4780 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4781
4782 /* True if we should implement SLP_REDUC using native reduction operations
4783 instead of scalar operations. */
4784 direct_slp_reduc = (reduc_fn != IFN_LAST
4785 && slp_reduc
4786 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4787
4788 /* In case of reduction chain, e.g.,
4789 # a1 = phi <a3, a0>
4790 a2 = operation (a1)
4791 a3 = operation (a2),
4792
4793 we may end up with more than one vector result. Here we reduce them to
4794 one vector. */
4795 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4796 {
4797 tree first_vect = PHI_RESULT (new_phis[0]);
4798 gassign *new_vec_stmt = NULL;
4799 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4800 for (k = 1; k < new_phis.length (); k++)
4801 {
4802 gimple *next_phi = new_phis[k];
4803 tree second_vect = PHI_RESULT (next_phi);
4804 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4805 new_vec_stmt = gimple_build_assign (tem, code,
4806 first_vect, second_vect);
4807 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4808 first_vect = tem;
4809 }
4810
4811 new_phi_result = first_vect;
4812 if (new_vec_stmt)
4813 {
4814 new_phis.truncate (0);
4815 new_phis.safe_push (new_vec_stmt);
4816 }
4817 }
4818   /* Likewise if we couldn't use a single def-use cycle.  */
4819 else if (ncopies > 1)
4820 {
4821 gcc_assert (new_phis.length () == 1);
4822 tree first_vect = PHI_RESULT (new_phis[0]);
4823 gassign *new_vec_stmt = NULL;
4824 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4825 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4826 for (int k = 1; k < ncopies; ++k)
4827 {
4828 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4829 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4830 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4831 new_vec_stmt = gimple_build_assign (tem, code,
4832 first_vect, second_vect);
4833 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4834 first_vect = tem;
4835 }
4836 new_phi_result = first_vect;
4837 new_phis.truncate (0);
4838 new_phis.safe_push (new_vec_stmt);
4839 }
4840 else
4841 new_phi_result = PHI_RESULT (new_phis[0]);
4842
4843 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4844 && reduc_fn != IFN_LAST)
4845 {
4846 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4847 various data values where the condition matched and another vector
4848 (INDUCTION_INDEX) containing all the indexes of those matches. We
4849 need to extract the last matching index (which will be the index with
4850 highest value) and use this to index into the data vector.
4851 For the case where there were no matches, the data vector will contain
4852 all default values and the index vector will be all zeros. */
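      /* Continuing the illustrative sketch above: with data vector
	 {d0, d1, d2, d3} and index vector {5, 2, 0, 8}, the REDUC_MAX over
	 the indexes yields 8, so the code below selects d3, the data value
	 recorded by the lane whose condition held last.  */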
4853
4854 /* Get various versions of the type of the vector of indexes. */
4855 tree index_vec_type = TREE_TYPE (induction_index);
4856 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4857 tree index_scalar_type = TREE_TYPE (index_vec_type);
4858 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4859 (index_vec_type);
4860
4861 /* Get an unsigned integer version of the type of the data vector. */
4862 int scalar_precision
4863 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4864 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4865 tree vectype_unsigned = build_vector_type
4866 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4867
4868 /* First we need to create a vector (ZERO_VEC) of zeros and another
4869 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4870 can create using a MAX reduction and then expanding.
4871 In the case where the loop never made any matches, the max index will
4872 be zero. */
4873
4874 /* Vector of {0, 0, 0,...}. */
4875 tree zero_vec = make_ssa_name (vectype);
4876 tree zero_vec_rhs = build_zero_cst (vectype);
4877 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4878 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4879
4880 /* Find maximum value from the vector of found indexes. */
4881 tree max_index = make_ssa_name (index_scalar_type);
4882 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4883 1, induction_index);
4884 gimple_call_set_lhs (max_index_stmt, max_index);
4885 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4886
4887 /* Vector of {max_index, max_index, max_index,...}. */
4888 tree max_index_vec = make_ssa_name (index_vec_type);
4889 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4890 max_index);
4891 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4892 max_index_vec_rhs);
4893 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4894
4895 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4896 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4897 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4898 otherwise. Only one value should match, resulting in a vector
4899 (VEC_COND) with one data value and the rest zeros.
4900 In the case where the loop never made any matches, every index will
4901 match, resulting in a vector with all data values (which will all be
4902 the default value). */
4903
4904 /* Compare the max index vector to the vector of found indexes to find
4905 the position of the max value. */
4906 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4907 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4908 induction_index,
4909 max_index_vec);
4910 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4911
4912 /* Use the compare to choose either values from the data vector or
4913 zero. */
4914 tree vec_cond = make_ssa_name (vectype);
4915 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4916 vec_compare, new_phi_result,
4917 zero_vec);
4918 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4919
4920 /* Finally we need to extract the data value from the vector (VEC_COND)
4921 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4922 reduction, but because this doesn't exist, we can use a MAX reduction
4923 instead. The data value might be signed or a float so we need to cast
4924 it first.
4925 In the case where the loop never made any matches, the data values are
4926 all identical, and so will reduce down correctly. */
4927
4928 /* Make the matched data values unsigned. */
4929 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4930 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4931 vec_cond);
4932 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4933 VIEW_CONVERT_EXPR,
4934 vec_cond_cast_rhs);
4935 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4936
4937 /* Reduce down to a scalar value. */
4938 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4939 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4940 1, vec_cond_cast);
4941 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4942 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4943
4944 /* Convert the reduced value back to the result type and set as the
4945 result. */
4946 gimple_seq stmts = NULL;
4947 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4948 data_reduc);
4949 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4950 scalar_results.safe_push (new_temp);
4951 }
4952 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4953 && reduc_fn == IFN_LAST)
4954 {
4955 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4956 idx = 0;
4957 idx_val = induction_index[0];
4958 val = data_reduc[0];
4959 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4960 if (induction_index[i] > idx_val)
4961 val = data_reduc[i], idx_val = induction_index[i];
4962 return val; */
4963
4964 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4965 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4966 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4967 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4968 /* Enforced by vectorizable_reduction, which ensures we have target
4969 support before allowing a conditional reduction on variable-length
4970 vectors. */
4971 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4972 tree idx_val = NULL_TREE, val = NULL_TREE;
4973 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4974 {
4975 tree old_idx_val = idx_val;
4976 tree old_val = val;
4977 idx_val = make_ssa_name (idx_eltype);
4978 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4979 build3 (BIT_FIELD_REF, idx_eltype,
4980 induction_index,
4981 bitsize_int (el_size),
4982 bitsize_int (off)));
4983 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4984 val = make_ssa_name (data_eltype);
4985 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4986 build3 (BIT_FIELD_REF,
4987 data_eltype,
4988 new_phi_result,
4989 bitsize_int (el_size),
4990 bitsize_int (off)));
4991 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4992 if (off != 0)
4993 {
4994 tree new_idx_val = idx_val;
4995 tree new_val = val;
4996 if (off != v_size - el_size)
4997 {
4998 new_idx_val = make_ssa_name (idx_eltype);
4999 epilog_stmt = gimple_build_assign (new_idx_val,
5000 MAX_EXPR, idx_val,
5001 old_idx_val);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5003 }
5004 new_val = make_ssa_name (data_eltype);
5005 epilog_stmt = gimple_build_assign (new_val,
5006 COND_EXPR,
5007 build2 (GT_EXPR,
5008 boolean_type_node,
5009 idx_val,
5010 old_idx_val),
5011 val, old_val);
5012 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5013 idx_val = new_idx_val;
5014 val = new_val;
5015 }
5016 }
5017 /* Convert the reduced value back to the result type and set as the
5018 result. */
5019 gimple_seq stmts = NULL;
5020 val = gimple_convert (&stmts, scalar_type, val);
5021 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5022 scalar_results.safe_push (val);
5023 }
5024
5025 /* 2.3 Create the reduction code, using one of the three schemes described
5026 above. In SLP we simply need to extract all the elements from the
5027 vector (without reducing them), so we use scalar shifts. */
5028 else if (reduc_fn != IFN_LAST && !slp_reduc)
5029 {
5030 tree tmp;
5031 tree vec_elem_type;
5032
5033 /* Case 1: Create:
5034 v_out2 = reduc_expr <v_out1> */
5035
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE, vect_location,
5038 "Reduce using direct vector reduction.\n");
5039
5040 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5041 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5042 {
5043 tree tmp_dest
5044 = vect_create_destination_var (scalar_dest, vec_elem_type);
5045 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5046 new_phi_result);
5047 gimple_set_lhs (epilog_stmt, tmp_dest);
5048 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5049 gimple_set_lhs (epilog_stmt, new_temp);
5050 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5051
5052 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5053 new_temp);
5054 }
5055 else
5056 {
5057 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5058 new_phi_result);
5059 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5060 }
5061
5062 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5063 gimple_set_lhs (epilog_stmt, new_temp);
5064 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5065
5066 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5067 == INTEGER_INDUC_COND_REDUCTION)
5068 && !operand_equal_p (initial_def, induc_val, 0))
5069 {
5070 /* Earlier we set the initial value to be a vector of induc_val
5071 values. Check the result and if it is induc_val then replace
5072 it with the original initial value, unless induc_val is
5073 the same as initial_def already. */
5074 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5075 induc_val);
5076
5077 tmp = make_ssa_name (new_scalar_dest);
5078 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5079 initial_def, new_temp);
5080 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5081 new_temp = tmp;
5082 }
5083
5084 scalar_results.safe_push (new_temp);
5085 }
5086 else if (direct_slp_reduc)
5087 {
5088 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5089 with the elements for other SLP statements replaced with the
5090 neutral value. We can then do a normal reduction on each vector. */
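/* For example (an illustrative sketch only): with REDUC_GROUP_SIZE 2 and
   a vector accumulator <a0, b0, a1, b1>, the code below builds
   <a0, neutral, a1, neutral> for the first SLP statement and
   <neutral, b0, neutral, b1> for the second, and then reduces each of
   those vectors with REDUC_FN.  */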
5091
5092 /* Enforced by vectorizable_reduction. */
5093 gcc_assert (new_phis.length () == 1);
5094 gcc_assert (pow2p_hwi (group_size));
5095
5096 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5097 vec<stmt_vec_info> orig_phis
5098 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5099 gimple_seq seq = NULL;
5100
5101 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5102 and the same element size as VECTYPE. */
5103 tree index = build_index_vector (vectype, 0, 1);
5104 tree index_type = TREE_TYPE (index);
5105 tree index_elt_type = TREE_TYPE (index_type);
5106 tree mask_type = build_same_sized_truth_vector_type (index_type);
5107
5108 /* Create a vector that, for each element, identifies which of
5109 the REDUC_GROUP_SIZE results should use it. */
5110 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5111 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5112 build_vector_from_val (index_type, index_mask));
5113
5114 /* Get a neutral vector value. This is simply a splat of the neutral
5115 scalar value if we have one, otherwise the initial scalar value
5116 is itself a neutral value. */
5117 tree vector_identity = NULL_TREE;
5118 if (neutral_op)
5119 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5120 neutral_op);
5121 for (unsigned int i = 0; i < group_size; ++i)
5122 {
5123 /* If there's no universal neutral value, we can use the
5124 initial scalar value from the original PHI. This is used
5125 for MIN and MAX reduction, for example. */
5126 if (!neutral_op)
5127 {
5128 tree scalar_value
5129 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5130 loop_preheader_edge (loop));
5131 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5132 scalar_value);
5133 }
5134
5135 /* Calculate the equivalent of:
5136
5137 sel[j] = (index[j] == i);
5138
5139 which selects the elements of NEW_PHI_RESULT that should
5140 be included in the result. */
5141 tree compare_val = build_int_cst (index_elt_type, i);
5142 compare_val = build_vector_from_val (index_type, compare_val);
5143 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5144 index, compare_val);
5145
5146 /* Calculate the equivalent of:
5147
5148 vec = sel ? new_phi_result : vector_identity;
5149
5150 VEC is now suitable for a full vector reduction. */
5151 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5152 sel, new_phi_result, vector_identity);
5153
5154 /* Do the reduction and convert it to the appropriate type. */
5155 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5156 TREE_TYPE (vectype), vec);
5157 scalar = gimple_convert (&seq, scalar_type, scalar);
5158 scalar_results.safe_push (scalar);
5159 }
5160 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5161 }
5162 else
5163 {
5164 bool reduce_with_shift;
5165 tree vec_temp;
5166
5167 /* COND reductions all do the final reduction with MAX_EXPR
5168 or MIN_EXPR. */
5169 if (code == COND_EXPR)
5170 {
5171 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 == INTEGER_INDUC_COND_REDUCTION)
5173 code = induc_code;
5174 else
5175 code = MAX_EXPR;
5176 }
5177
5178 /* See if the target wants to do the final (shift) reduction
5179 in a vector mode of smaller size and first reduce upper/lower
5180 halves against each other. */
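/* For instance (illustrative only, not a statement about any particular
   target): a target could request that a 256-bit V8SI accumulator be
   split into two 128-bit V4SI halves, which are combined with a single
   vector operation before the shift-based reduction below continues on
   the narrower vector.  */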
5181 enum machine_mode mode1 = mode;
5182 tree vectype1 = vectype;
5183 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5184 unsigned sz1 = sz;
5185 if (!slp_reduc
5186 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5187 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5188
5189 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5190 reduce_with_shift = have_whole_vector_shift (mode1);
5191 if (!VECTOR_MODE_P (mode1))
5192 reduce_with_shift = false;
5193 else
5194 {
5195 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5196 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5197 reduce_with_shift = false;
5198 }
5199
5200 /* First reduce the vector to the size we want to do the final shift
5201 reduction on, by repeatedly combining upper and lower halves. */
5202 new_temp = new_phi_result;
5203 while (sz > sz1)
5204 {
5205 gcc_assert (!slp_reduc);
5206 sz /= 2;
5207 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5208
5209 /* The target has to make sure we support lowpart/highpart
5210 extraction, either via direct vector extract or through
5211 an integer mode punning. */
5212 tree dst1, dst2;
5213 if (convert_optab_handler (vec_extract_optab,
5214 TYPE_MODE (TREE_TYPE (new_temp)),
5215 TYPE_MODE (vectype1))
5216 != CODE_FOR_nothing)
5217 {
5218 /* Extract sub-vectors directly once vec_extract becomes
5219 a conversion optab. */
5220 dst1 = make_ssa_name (vectype1);
5221 epilog_stmt
5222 = gimple_build_assign (dst1, BIT_FIELD_REF,
5223 build3 (BIT_FIELD_REF, vectype1,
5224 new_temp, TYPE_SIZE (vectype1),
5225 bitsize_int (0)));
5226 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5227 dst2 = make_ssa_name (vectype1);
5228 epilog_stmt
5229 = gimple_build_assign (dst2, BIT_FIELD_REF,
5230 build3 (BIT_FIELD_REF, vectype1,
5231 new_temp, TYPE_SIZE (vectype1),
5232 bitsize_int (sz * BITS_PER_UNIT)));
5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 }
5235 else
5236 {
5237 /* Extract via punning to appropriately sized integer mode
5238 vector. */
5239 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5240 1);
5241 tree etype = build_vector_type (eltype, 2);
5242 gcc_assert (convert_optab_handler (vec_extract_optab,
5243 TYPE_MODE (etype),
5244 TYPE_MODE (eltype))
5245 != CODE_FOR_nothing);
5246 tree tem = make_ssa_name (etype);
5247 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5248 build1 (VIEW_CONVERT_EXPR,
5249 etype, new_temp));
5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 new_temp = tem;
5252 tem = make_ssa_name (eltype);
5253 epilog_stmt
5254 = gimple_build_assign (tem, BIT_FIELD_REF,
5255 build3 (BIT_FIELD_REF, eltype,
5256 new_temp, TYPE_SIZE (eltype),
5257 bitsize_int (0)));
5258 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259 dst1 = make_ssa_name (vectype1);
5260 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5261 build1 (VIEW_CONVERT_EXPR,
5262 vectype1, tem));
5263 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5264 tem = make_ssa_name (eltype);
5265 epilog_stmt
5266 = gimple_build_assign (tem, BIT_FIELD_REF,
5267 build3 (BIT_FIELD_REF, eltype,
5268 new_temp, TYPE_SIZE (eltype),
5269 bitsize_int (sz * BITS_PER_UNIT)));
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 dst2 = make_ssa_name (vectype1);
5272 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5273 build1 (VIEW_CONVERT_EXPR,
5274 vectype1, tem));
5275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5276 }
5277
5278 new_temp = make_ssa_name (vectype1);
5279 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5281 }
5282
5283 if (reduce_with_shift && !slp_reduc)
5284 {
5285 int element_bitsize = tree_to_uhwi (bitsize);
5286 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5287 for variable-length vectors and also requires direct target support
5288 for loop reductions. */
5289 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5290 int nelements = vec_size_in_bits / element_bitsize;
5291 vec_perm_builder sel;
5292 vec_perm_indices indices;
5293
5294 int elt_offset;
5295
5296 tree zero_vec = build_zero_cst (vectype1);
5297 /* Case 2: Create:
5298 for (offset = nelements/2; offset >= 1; offset/=2)
5299 {
5300 Create: va' = vec_shift <va, offset>
5301 Create: va = vop <va, va'>
5302 } */
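/* As a concrete illustration (values invented), for a four-element
   vector va = <a, b, c, d> and a PLUS_EXPR reduction this produces:

     va' = <c, d, 0, 0>        va = <a+c, b+d, c, d>
     va' = <b+d, c, d, 0>      va = <a+b+c+d, ...>

   after which only element 0 of VA is meaningful and is extracted
   below as the scalar result.  */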
5303
5304 tree rhs;
5305
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE, vect_location,
5308 "Reduce using vector shifts\n");
5309
5310 mode1 = TYPE_MODE (vectype1);
5311 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5312 for (elt_offset = nelements / 2;
5313 elt_offset >= 1;
5314 elt_offset /= 2)
5315 {
5316 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5317 indices.new_vector (sel, 2, nelements);
5318 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5319 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5320 new_temp, zero_vec, mask);
5321 new_name = make_ssa_name (vec_dest, epilog_stmt);
5322 gimple_assign_set_lhs (epilog_stmt, new_name);
5323 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5324
5325 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5326 new_temp);
5327 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5328 gimple_assign_set_lhs (epilog_stmt, new_temp);
5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5330 }
5331
5332 /* 2.4 Extract the final scalar result. Create:
5333 s_out3 = extract_field <v_out2, bitpos> */
5334
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_NOTE, vect_location,
5337 "extract scalar result\n");
5338
5339 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5340 bitsize, bitsize_zero_node);
5341 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5342 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5343 gimple_assign_set_lhs (epilog_stmt, new_temp);
5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 scalar_results.safe_push (new_temp);
5346 }
5347 else
5348 {
5349 /* Case 3: Create:
5350 s = extract_field <v_out2, 0>
5351 for (offset = element_size;
5352 offset < vector_size;
5353 offset += element_size;)
5354 {
5355 Create: s' = extract_field <v_out2, offset>
5356 Create: s = op <s, s'> // For non SLP cases
5357 } */
5358
5359 if (dump_enabled_p ())
5360 dump_printf_loc (MSG_NOTE, vect_location,
5361 "Reduce using scalar code.\n");
5362
5363 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5364 int element_bitsize = tree_to_uhwi (bitsize);
5365 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5366 {
5367 int bit_offset;
5368 if (gimple_code (new_phi) == GIMPLE_PHI)
5369 vec_temp = PHI_RESULT (new_phi);
5370 else
5371 vec_temp = gimple_assign_lhs (new_phi);
5372 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5373 bitsize_zero_node);
5374 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376 gimple_assign_set_lhs (epilog_stmt, new_temp);
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378
5379 /* In SLP we don't need to apply the reduction operation, so we just
5380 collect s' values in SCALAR_RESULTS. */
5381 if (slp_reduc)
5382 scalar_results.safe_push (new_temp);
5383
5384 for (bit_offset = element_bitsize;
5385 bit_offset < vec_size_in_bits;
5386 bit_offset += element_bitsize)
5387 {
5388 tree bitpos = bitsize_int (bit_offset);
5389 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5390 bitsize, bitpos);
5391
5392 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5393 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5394 gimple_assign_set_lhs (epilog_stmt, new_name);
5395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396
5397 if (slp_reduc)
5398 {
5399 /* In SLP we don't need to apply the reduction operation, so
5400 we just collect s' values in SCALAR_RESULTS. */
5401 new_temp = new_name;
5402 scalar_results.safe_push (new_name);
5403 }
5404 else
5405 {
5406 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5407 new_name, new_temp);
5408 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5409 gimple_assign_set_lhs (epilog_stmt, new_temp);
5410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411 }
5412 }
5413 }
5414
5415 /* The only case where we need to reduce scalar results in SLP is
5416 unrolling. If the size of SCALAR_RESULTS is greater than
5417 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5418 REDUC_GROUP_SIZE. */
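/* For example (illustrative): with REDUC_GROUP_SIZE 2 and four scalar
   results s0, s1, s2, s3 from an unrolled SLP reduction, the loop below
   combines s0 with s2 and s1 with s3, leaving the two final values in
   scalar_results[0] and scalar_results[1].  */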
5419 if (slp_reduc)
5420 {
5421 tree res, first_res, new_res;
5422 gimple *new_stmt;
5423
5424 /* Reduce multiple scalar results in case of SLP unrolling. */
5425 for (j = group_size; scalar_results.iterate (j, &res);
5426 j++)
5427 {
5428 first_res = scalar_results[j % group_size];
5429 new_stmt = gimple_build_assign (new_scalar_dest, code,
5430 first_res, res);
5431 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5432 gimple_assign_set_lhs (new_stmt, new_res);
5433 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5434 scalar_results[j % group_size] = new_res;
5435 }
5436 }
5437 else
5438 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5439 scalar_results.safe_push (new_temp);
5440 }
5441
5442 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5443 == INTEGER_INDUC_COND_REDUCTION)
5444 && !operand_equal_p (initial_def, induc_val, 0))
5445 {
5446 /* Earlier we set the initial value to be a vector of induc_val
5447 values. Check the result and if it is induc_val then replace
5448 it with the original initial value, unless induc_val is
5449 the same as initial_def already. */
5450 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5451 induc_val);
5452
5453 tree tmp = make_ssa_name (new_scalar_dest);
5454 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5455 initial_def, new_temp);
5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457 scalar_results[0] = tmp;
5458 }
5459 }
5460
5461 vect_finalize_reduction:
5462
5463 if (double_reduc)
5464 loop = loop->inner;
5465
5466 /* 2.5 Adjust the final result by the initial value of the reduction
5467 variable. (When such adjustment is not needed, then
5468 'adjustment_def' is zero). For example, if code is PLUS we create:
5469 new_temp = loop_exit_def + adjustment_def */
5470
5471 if (adjustment_def)
5472 {
5473 gcc_assert (!slp_reduc);
5474 if (nested_in_vect_loop)
5475 {
5476 new_phi = new_phis[0];
5477 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5478 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5479 new_dest = vect_create_destination_var (scalar_dest, vectype);
5480 }
5481 else
5482 {
5483 new_temp = scalar_results[0];
5484 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5485 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5486 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5487 }
5488
5489 epilog_stmt = gimple_build_assign (new_dest, expr);
5490 new_temp = make_ssa_name (new_dest, epilog_stmt);
5491 gimple_assign_set_lhs (epilog_stmt, new_temp);
5492 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5493 if (nested_in_vect_loop)
5494 {
5495 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5496 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5497 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5498
5499 if (!double_reduc)
5500 scalar_results.quick_push (new_temp);
5501 else
5502 scalar_results[0] = new_temp;
5503 }
5504 else
5505 scalar_results[0] = new_temp;
5506
5507 new_phis[0] = epilog_stmt;
5508 }
5509
5510 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5511 phis with new adjusted scalar results, i.e., replace use <s_out0>
5512 with use <s_out4>.
5513
5514 Transform:
5515 loop_exit:
5516 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5517 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5518 v_out2 = reduce <v_out1>
5519 s_out3 = extract_field <v_out2, 0>
5520 s_out4 = adjust_result <s_out3>
5521 use <s_out0>
5522 use <s_out0>
5523
5524 into:
5525
5526 loop_exit:
5527 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5528 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5529 v_out2 = reduce <v_out1>
5530 s_out3 = extract_field <v_out2, 0>
5531 s_out4 = adjust_result <s_out3>
5532 use <s_out4>
5533 use <s_out4> */
5534
5535
5536 /* In an SLP reduction chain we reduce the vector results into one vector
5537 if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5538 LHS of the last stmt in the reduction chain, since we are looking for
5539 the loop exit phi node. */
5540 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5541 {
5542 stmt_vec_info dest_stmt_info
5543 = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5544 /* Handle reduction patterns. */
5545 if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5546 dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5547
5548 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5549 group_size = 1;
5550 }
5551
5552 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5553 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5554 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5555 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5556 correspond to the first vector stmt, etc.
5557 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
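/* For example (illustrative): with REDUC_GROUP_SIZE 4 and two statements
   in NEW_PHIS, RATIO is 2, so scalar results 0 and 1 are taken from the
   first vector statement and scalar results 2 and 3 from the second.  */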
5558 if (group_size > new_phis.length ())
5559 {
5560 ratio = group_size / new_phis.length ();
5561 gcc_assert (!(group_size % new_phis.length ()));
5562 }
5563 else
5564 ratio = 1;
5565
5566 stmt_vec_info epilog_stmt_info = NULL;
5567 for (k = 0; k < group_size; k++)
5568 {
5569 if (k % ratio == 0)
5570 {
5571 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5572 reduction_phi_info = reduction_phis[k / ratio];
5573 if (double_reduc)
5574 inner_phi = inner_phis[k / ratio];
5575 }
5576
5577 if (slp_reduc)
5578 {
5579 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5580
5581 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5582 /* SLP statements can't participate in patterns. */
5583 gcc_assert (!orig_stmt_info);
5584 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5585 }
5586
5587 phis.create (3);
5588 /* Find the loop-closed-use at the loop exit of the original scalar
5589 result. (The reduction result is expected to have two immediate uses -
5590 one at the latch block, and one at the loop exit). */
5591 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5592 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5593 && !is_gimple_debug (USE_STMT (use_p)))
5594 phis.safe_push (USE_STMT (use_p));
5595
5596 /* While we expect to have found an exit_phi because of loop-closed-ssa
5597 form, we can end up without one if the scalar cycle is dead. */
5598
5599 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5600 {
5601 if (outer_loop)
5602 {
5603 stmt_vec_info exit_phi_vinfo
5604 = loop_vinfo->lookup_stmt (exit_phi);
5605 gphi *vect_phi;
5606
5607 /* FORNOW. Currently not supporting the case that an inner-loop
5608 reduction is not used in the outer-loop (but only outside the
5609 outer-loop), unless it is a double reduction. */
5610 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5611 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5612 || double_reduc);
5613
5614 if (double_reduc)
5615 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5616 else
5617 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5618 if (!double_reduc
5619 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5620 != vect_double_reduction_def)
5621 continue;
5622
5623 /* Handle double reduction:
5624
5625 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5626 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5627 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5628 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5629
5630 At that point the regular reduction (stmt2 and stmt3) is
5631 already vectorized, as well as the exit phi node, stmt4.
5632 Here we vectorize the phi node of double reduction, stmt1, and
5633 update all relevant statements. */
5634
5635 /* Go through all the uses of s2 to find double reduction phi
5636 node, i.e., stmt1 above. */
5637 orig_name = PHI_RESULT (exit_phi);
5638 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5639 {
5640 stmt_vec_info use_stmt_vinfo;
5641 tree vect_phi_init, preheader_arg, vect_phi_res;
5642 basic_block bb = gimple_bb (use_stmt);
5643
5644 /* Check that USE_STMT is really a double reduction phi
5645 node. */
5646 if (gimple_code (use_stmt) != GIMPLE_PHI
5647 || gimple_phi_num_args (use_stmt) != 2
5648 || bb->loop_father != outer_loop)
5649 continue;
5650 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5651 if (!use_stmt_vinfo
5652 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5653 != vect_double_reduction_def)
5654 continue;
5655
5656 /* Create vector phi node for double reduction:
5657 vs1 = phi <vs0, vs2>
5658 vs1 was created previously in this function by a call to
5659 vect_get_vec_def_for_operand and is stored in
5660 vec_initial_def;
5661 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5662 vs0 is created here. */
5663
5664 /* Create vector phi node. */
5665 vect_phi = create_phi_node (vec_initial_def, bb);
5666 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5667
5668 /* Create vs0 - initial def of the double reduction phi. */
5669 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5670 loop_preheader_edge (outer_loop));
5671 vect_phi_init = get_initial_def_for_reduction
5672 (stmt_info, preheader_arg, NULL);
5673
5674 /* Update phi node arguments with vs0 and vs2. */
5675 add_phi_arg (vect_phi, vect_phi_init,
5676 loop_preheader_edge (outer_loop),
5677 UNKNOWN_LOCATION);
5678 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5679 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5680 if (dump_enabled_p ())
5681 {
5682 dump_printf_loc (MSG_NOTE, vect_location,
5683 "created double reduction phi node: ");
5684 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5685 }
5686
5687 vect_phi_res = PHI_RESULT (vect_phi);
5688
5689 /* Replace the use, i.e., set the correct vs1 in the regular
5690 reduction phi node. FORNOW, NCOPIES is always 1, so the
5691 loop is redundant. */
5692 stmt_vec_info use_info = reduction_phi_info;
5693 for (j = 0; j < ncopies; j++)
5694 {
5695 edge pr_edge = loop_preheader_edge (loop);
5696 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5697 pr_edge->dest_idx, vect_phi_res);
5698 use_info = STMT_VINFO_RELATED_STMT (use_info);
5699 }
5700 }
5701 }
5702 }
5703
5704 phis.release ();
5705 if (nested_in_vect_loop)
5706 {
5707 if (double_reduc)
5708 loop = outer_loop;
5709 else
5710 continue;
5711 }
5712
5713 phis.create (3);
5714 /* Find the loop-closed-use at the loop exit of the original scalar
5715 result. (The reduction result is expected to have two immediate uses,
5716 one at the latch block, and one at the loop exit). For double
5717 reductions we are looking for exit phis of the outer loop. */
5718 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5719 {
5720 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5721 {
5722 if (!is_gimple_debug (USE_STMT (use_p)))
5723 phis.safe_push (USE_STMT (use_p));
5724 }
5725 else
5726 {
5727 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5728 {
5729 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5730
5731 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5732 {
5733 if (!flow_bb_inside_loop_p (loop,
5734 gimple_bb (USE_STMT (phi_use_p)))
5735 && !is_gimple_debug (USE_STMT (phi_use_p)))
5736 phis.safe_push (USE_STMT (phi_use_p));
5737 }
5738 }
5739 }
5740 }
5741
5742 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5743 {
5744 /* Replace the uses: */
5745 orig_name = PHI_RESULT (exit_phi);
5746 scalar_result = scalar_results[k];
5747 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5748 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5749 SET_USE (use_p, scalar_result);
5750 }
5751
5752 phis.release ();
5753 }
5754 }
5755
5756 /* Return a vector of type VECTYPE that is equal to the vector select
5757 operation "MASK ? VEC : IDENTITY". Insert the select statements
5758 before GSI. */
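/* Illustrative use (a sketch, not a quote from an actual caller): in a
   fully-masked loop, merge_with_identity (gsi, loop_mask, vectype,
   vec_def, zero_vec) yields a VEC_COND_EXPR whose inactive lanes take
   the value of ZERO_VEC, so that they do not affect an in-order
   reduction.  */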
5759
5760 static tree
5761 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5762 tree vec, tree identity)
5763 {
5764 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5765 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5766 mask, vec, identity);
5767 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5768 return cond;
5769 }
5770
5771 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5772 order, starting with LHS. Insert the extraction statements before GSI and
5773 associate the new scalar SSA names with variable SCALAR_DEST.
5774 Return the SSA name for the result. */
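/* As an illustrative sketch (names invented, not actual generated IL),
   for a four-element VECTOR_RHS V and a PLUS_EXPR this expands to:

     s_0 = BIT_FIELD_REF <V, sz, 0*sz>;    lhs_1 = LHS + s_0;
     s_1 = BIT_FIELD_REF <V, sz, 1*sz>;    lhs_2 = lhs_1 + s_1;
     s_2 = BIT_FIELD_REF <V, sz, 2*sz>;    lhs_3 = lhs_2 + s_2;
     s_3 = BIT_FIELD_REF <V, sz, 3*sz>;    lhs_4 = lhs_3 + s_3;

   where SZ is the element size in bits, i.e. a strictly left-to-right
   fold whose result is lhs_4.  */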
5775
5776 static tree
5777 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5778 tree_code code, tree lhs, tree vector_rhs)
5779 {
5780 tree vectype = TREE_TYPE (vector_rhs);
5781 tree scalar_type = TREE_TYPE (vectype);
5782 tree bitsize = TYPE_SIZE (scalar_type);
5783 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5784 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5785
5786 for (unsigned HOST_WIDE_INT bit_offset = 0;
5787 bit_offset < vec_size_in_bits;
5788 bit_offset += element_bitsize)
5789 {
5790 tree bitpos = bitsize_int (bit_offset);
5791 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5792 bitsize, bitpos);
5793
5794 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5795 rhs = make_ssa_name (scalar_dest, stmt);
5796 gimple_assign_set_lhs (stmt, rhs);
5797 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5798
5799 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5800 tree new_name = make_ssa_name (scalar_dest, stmt);
5801 gimple_assign_set_lhs (stmt, new_name);
5802 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5803 lhs = new_name;
5804 }
5805 return lhs;
5806 }
5807
5808 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5809 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5810 statement. CODE is the operation performed by STMT_INFO and OPS are
5811 its scalar operands. REDUC_INDEX is the index of the operand in
5812 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5813 implements in-order reduction, or IFN_LAST if we should open-code it.
5814 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5815 that should be used to control the operation in a fully-masked loop. */
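/* A rough sketch of what this generates for a single non-SLP statement
   when REDUC_FN is available (SSA names are illustrative only):

     loop:
       x_1 = PHI <x_init, x_2>
       ...
       x_2 = .FOLD_LEFT_PLUS (x_1, vec_def);

   so each vector statement consumes the scalar result of the previous
   one, preserving the original in-order association.  */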
5816
5817 static bool
5818 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5819 gimple_stmt_iterator *gsi,
5820 stmt_vec_info *vec_stmt, slp_tree slp_node,
5821 gimple *reduc_def_stmt,
5822 tree_code code, internal_fn reduc_fn,
5823 tree ops[3], tree vectype_in,
5824 int reduc_index, vec_loop_masks *masks)
5825 {
5826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5827 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5828 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5829 stmt_vec_info new_stmt_info = NULL;
5830
5831 int ncopies;
5832 if (slp_node)
5833 ncopies = 1;
5834 else
5835 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5836
5837 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5838 gcc_assert (ncopies == 1);
5839 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5840 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5841 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5842 == FOLD_LEFT_REDUCTION);
5843
5844 if (slp_node)
5845 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5846 TYPE_VECTOR_SUBPARTS (vectype_in)));
5847
5848 tree op0 = ops[1 - reduc_index];
5849
5850 int group_size = 1;
5851 stmt_vec_info scalar_dest_def_info;
5852 auto_vec<tree> vec_oprnds0;
5853 if (slp_node)
5854 {
5855 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5856 slp_node);
5857 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5858 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5859 }
5860 else
5861 {
5862 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5863 vec_oprnds0.create (1);
5864 vec_oprnds0.quick_push (loop_vec_def0);
5865 scalar_dest_def_info = stmt_info;
5866 }
5867
5868 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5869 tree scalar_type = TREE_TYPE (scalar_dest);
5870 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5871
5872 int vec_num = vec_oprnds0.length ();
5873 gcc_assert (vec_num == 1 || slp_node);
5874 tree vec_elem_type = TREE_TYPE (vectype_out);
5875 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5876
5877 tree vector_identity = NULL_TREE;
5878 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5879 vector_identity = build_zero_cst (vectype_out);
5880
5881 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5882 int i;
5883 tree def0;
5884 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5885 {
5886 gimple *new_stmt;
5887 tree mask = NULL_TREE;
5888 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5889 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5890
5891 /* Handle MINUS by adding the negative. */
5892 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5893 {
5894 tree negated = make_ssa_name (vectype_out);
5895 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5896 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5897 def0 = negated;
5898 }
5899
5900 if (mask)
5901 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5902 vector_identity);
5903
5904 /* On the first iteration the input is simply the scalar phi
5905 result, and for subsequent iterations it is the output of
5906 the preceding operation. */
5907 if (reduc_fn != IFN_LAST)
5908 {
5909 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5910 /* For chained SLP reductions the output of the previous reduction
5911 operation serves as the input of the next. For the final statement
5912 the output cannot be a temporary - we reuse the original
5913 scalar destination of the last statement. */
5914 if (i != vec_num - 1)
5915 {
5916 gimple_set_lhs (new_stmt, scalar_dest_var);
5917 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5918 gimple_set_lhs (new_stmt, reduc_var);
5919 }
5920 }
5921 else
5922 {
5923 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5924 reduc_var, def0);
5925 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5926 /* Remove the statement, so that we can use the same code paths
5927 as for statements that we've just created. */
5928 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5929 gsi_remove (&tmp_gsi, false);
5930 }
5931
5932 if (i == vec_num - 1)
5933 {
5934 gimple_set_lhs (new_stmt, scalar_dest);
5935 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5936 new_stmt);
5937 }
5938 else
5939 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5940 new_stmt, gsi);
5941
5942 if (slp_node)
5943 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5944 }
5945
5946 if (!slp_node)
5947 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5948
5949 return true;
5950 }
5951
5952 /* Function is_nonwrapping_integer_induction.
5953
5954 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5955 does not cause overflow. */
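/* For instance (illustrative): an unsigned 8-bit induction with base 0
   and step 1 in a loop that may execute 300 iterations can reach the
   value 300, which needs 9 bits, so the function returns false; if the
   loop executes at most 200 iterations the maximum value fits in 8 bits
   and it returns true.  */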
5956
5957 static bool
5958 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5959 {
5960 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5961 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5962 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5963 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5964 widest_int ni, max_loop_value, lhs_max;
5965 wi::overflow_type overflow = wi::OVF_NONE;
5966
5967 /* Make sure the loop is integer based. */
5968 if (TREE_CODE (base) != INTEGER_CST
5969 || TREE_CODE (step) != INTEGER_CST)
5970 return false;
5971
5972 /* Check that the max size of the loop will not wrap. */
5973
5974 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5975 return true;
5976
5977 if (! max_stmt_executions (loop, &ni))
5978 return false;
5979
5980 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5981 &overflow);
5982 if (overflow)
5983 return false;
5984
5985 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5986 TYPE_SIGN (lhs_type), &overflow);
5987 if (overflow)
5988 return false;
5989
5990 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5991 <= TYPE_PRECISION (lhs_type));
5992 }
5993
5994 /* Function vectorizable_reduction.
5995
5996 Check if STMT_INFO performs a reduction operation that can be vectorized.
5997 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5998 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5999 Return true if STMT_INFO is vectorizable in this way.
6000
6001 This function also handles reduction idioms (patterns) that have been
6002 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6003 may be of this form:
6004 X = pattern_expr (arg0, arg1, ..., X)
6005 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6006 sequence that had been detected and replaced by the pattern-stmt
6007 (STMT_INFO).
6008
6009 This function also handles reduction of condition expressions, for example:
6010 for (int i = 0; i < N; i++)
6011 if (a[i] < value)
6012 last = a[i];
6013 This is handled by vectorising the loop and creating an additional vector
6014 containing the loop indexes for which "a[i] < value" was true. In the
6015 function epilogue this is reduced to a single max value and then used to
6016 index into the vector of results.
6017
6018 In some cases of reduction patterns, the type of the reduction variable X is
6019 different than the type of the other arguments of STMT_INFO.
6020 In such cases, the vectype that is used when transforming STMT_INFO into
6021 a vector stmt is different than the vectype that is used to determine the
6022 vectorization factor, because it consists of a different number of elements
6023 than the actual number of elements that are being operated upon in parallel.
6024
6025 For example, consider an accumulation of shorts into an int accumulator.
6026 On some targets it's possible to vectorize this pattern operating on 8
6027 shorts at a time (hence, the vectype for purposes of determining the
6028 vectorization factor should be V8HI); on the other hand, the vectype that
6029 is used to create the vector form is actually V4SI (the type of the result).
6030
6031 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6032 indicates what is the actual level of parallelism (V8HI in the example), so
6033 that the right vectorization factor would be derived. This vectype
6034 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6035 be used to create the vectorized stmt. The right vectype for the vectorized
6036 stmt is obtained from the type of the result X:
6037 get_vectype_for_scalar_type (TREE_TYPE (X))
6038
6039 This means that, contrary to "regular" reductions (or "regular" stmts in
6040 general), the following equation:
6041 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6042 does *NOT* necessarily hold for reduction patterns. */
6043
6044 bool
6045 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6046 stmt_vec_info *vec_stmt, slp_tree slp_node,
6047 slp_instance slp_node_instance,
6048 stmt_vector_for_cost *cost_vec)
6049 {
6050 tree vec_dest;
6051 tree scalar_dest;
6052 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6053 tree vectype_in = NULL_TREE;
6054 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6055 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6056 enum tree_code code, orig_code;
6057 internal_fn reduc_fn;
6058 machine_mode vec_mode;
6059 int op_type;
6060 optab optab;
6061 tree new_temp = NULL_TREE;
6062 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6063 stmt_vec_info cond_stmt_vinfo = NULL;
6064 enum tree_code cond_reduc_op_code = ERROR_MARK;
6065 tree scalar_type;
6066 bool is_simple_use;
6067 int i;
6068 int ncopies;
6069 int epilog_copies;
6070 stmt_vec_info prev_stmt_info, prev_phi_info;
6071 bool single_defuse_cycle = false;
6072 stmt_vec_info new_stmt_info = NULL;
6073 int j;
6074 tree ops[3];
6075 enum vect_def_type dts[3];
6076 bool nested_cycle = false, found_nested_cycle_def = false;
6077 bool double_reduc = false;
6078 basic_block def_bb;
6079 struct loop * def_stmt_loop;
6080 tree def_arg;
6081 auto_vec<tree> vec_oprnds0;
6082 auto_vec<tree> vec_oprnds1;
6083 auto_vec<tree> vec_oprnds2;
6084 auto_vec<tree> vect_defs;
6085 auto_vec<stmt_vec_info> phis;
6086 int vec_num;
6087 tree def0, tem;
6088 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6089 tree cond_reduc_val = NULL_TREE;
6090
6091 /* Make sure it was already recognized as a reduction computation. */
6092 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6093 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6094 return false;
6095
6096 if (nested_in_vect_loop_p (loop, stmt_info))
6097 {
6098 loop = loop->inner;
6099 nested_cycle = true;
6100 }
6101
6102 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6103 gcc_assert (slp_node
6104 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6105
6106 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6107 {
6108 tree phi_result = gimple_phi_result (phi);
6109 /* Analysis is fully done on the reduction stmt invocation. */
6110 if (! vec_stmt)
6111 {
6112 if (slp_node)
6113 slp_node_instance->reduc_phis = slp_node;
6114
6115 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6116 return true;
6117 }
6118
6119 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6120 /* Leave the scalar phi in place. Note that checking
6121 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6122 for reductions involving a single statement. */
6123 return true;
6124
6125 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6126 if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6127 reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6128
6129 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6130 == EXTRACT_LAST_REDUCTION)
6131 /* Leave the scalar phi in place. */
6132 return true;
6133
6134 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6135 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6136 {
6137 tree op = gimple_op (reduc_stmt, k);
6138 if (op == phi_result)
6139 continue;
6140 if (k == 1
6141 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6142 continue;
6143 if (!vectype_in
6144 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6145 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6146 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6147 break;
6148 }
6149 gcc_assert (vectype_in);
6150
6151 if (slp_node)
6152 ncopies = 1;
6153 else
6154 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6155
6156 stmt_vec_info use_stmt_info;
6157 if (ncopies > 1
6158 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6159 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6160 && (use_stmt_info == reduc_stmt_info
6161 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt_info))
6162 single_defuse_cycle = true;
6163
6164 /* Create the destination vector */
6165 scalar_dest = gimple_assign_lhs (reduc_stmt);
6166 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6167
6168 if (slp_node)
6169 /* The size vect_schedule_slp_instance computes is off for us. */
6170 vec_num = vect_get_num_vectors
6171 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6172 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6173 vectype_in);
6174 else
6175 vec_num = 1;
6176
6177 /* Generate the reduction PHIs upfront. */
6178 prev_phi_info = NULL;
6179 for (j = 0; j < ncopies; j++)
6180 {
6181 if (j == 0 || !single_defuse_cycle)
6182 {
6183 for (i = 0; i < vec_num; i++)
6184 {
6185 /* Create the reduction-phi that defines the reduction
6186 operand. */
6187 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6188 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6189
6190 if (slp_node)
6191 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6192 else
6193 {
6194 if (j == 0)
6195 STMT_VINFO_VEC_STMT (stmt_info)
6196 = *vec_stmt = new_phi_info;
6197 else
6198 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6199 prev_phi_info = new_phi_info;
6200 }
6201 }
6202 }
6203 }
6204
6205 return true;
6206 }
6207
6208 /* 1. Is vectorizable reduction? */
6209 /* Not supportable if the reduction variable is used in the loop, unless
6210 it's a reduction chain. */
6211 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6212 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6213 return false;
6214
6215 /* Reductions that are not used even in an enclosing outer-loop
6216 are expected to be "live" (used out of the loop). */
6217 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6218 && !STMT_VINFO_LIVE_P (stmt_info))
6219 return false;
6220
6221 /* 2. Has this been recognized as a reduction pattern?
6222
6223 Check if STMT represents a pattern that has been recognized
6224 in earlier analysis stages. For stmts that represent a pattern,
6225 the STMT_VINFO_RELATED_STMT field records the last stmt in
6226 the original sequence that constitutes the pattern. */
6227
6228 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6229 if (orig_stmt_info)
6230 {
6231 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6232 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6233 }
6234
6235 /* 3. Check the operands of the operation. The first operands are defined
6236 inside the loop body. The last operand is the reduction variable,
6237 which is defined by the loop-header-phi. */
6238
6239 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6240
6241 /* Flatten RHS. */
6242 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6243 {
6244 case GIMPLE_BINARY_RHS:
6245 code = gimple_assign_rhs_code (stmt);
6246 op_type = TREE_CODE_LENGTH (code);
6247 gcc_assert (op_type == binary_op);
6248 ops[0] = gimple_assign_rhs1 (stmt);
6249 ops[1] = gimple_assign_rhs2 (stmt);
6250 break;
6251
6252 case GIMPLE_TERNARY_RHS:
6253 code = gimple_assign_rhs_code (stmt);
6254 op_type = TREE_CODE_LENGTH (code);
6255 gcc_assert (op_type == ternary_op);
6256 ops[0] = gimple_assign_rhs1 (stmt);
6257 ops[1] = gimple_assign_rhs2 (stmt);
6258 ops[2] = gimple_assign_rhs3 (stmt);
6259 break;
6260
6261 case GIMPLE_UNARY_RHS:
6262 return false;
6263
6264 default:
6265 gcc_unreachable ();
6266 }
6267
6268 if (code == COND_EXPR && slp_node)
6269 return false;
6270
6271 scalar_dest = gimple_assign_lhs (stmt);
6272 scalar_type = TREE_TYPE (scalar_dest);
6273 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6274 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6275 return false;
6276
6277 /* Do not try to vectorize bit-precision reductions. */
6278 if (!type_has_mode_precision_p (scalar_type))
6279 return false;
6280
6281 /* All uses but the last are expected to be defined in the loop.
6282 The last use is the reduction variable. In case of a nested cycle this
6283 assumption is not true: we use reduc_index to record the index of the
6284 reduction variable. */
6285 stmt_vec_info reduc_def_info = NULL;
6286 int reduc_index = -1;
6287 for (i = 0; i < op_type; i++)
6288 {
6289 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6290 if (i == 0 && code == COND_EXPR)
6291 continue;
6292
6293 stmt_vec_info def_stmt_info;
6294 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6295 &def_stmt_info);
6296 dt = dts[i];
6297 gcc_assert (is_simple_use);
6298 if (dt == vect_reduction_def)
6299 {
6300 reduc_def_info = def_stmt_info;
6301 reduc_index = i;
6302 continue;
6303 }
6304 else if (tem)
6305 {
6306 /* To properly compute ncopies we are interested in the widest
6307 input type in case we're looking at a widening accumulation. */
6308 if (!vectype_in
6309 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6310 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6311 vectype_in = tem;
6312 }
6313
6314 if (dt != vect_internal_def
6315 && dt != vect_external_def
6316 && dt != vect_constant_def
6317 && dt != vect_induction_def
6318 && !(dt == vect_nested_cycle && nested_cycle))
6319 return false;
6320
6321 if (dt == vect_nested_cycle)
6322 {
6323 found_nested_cycle_def = true;
6324 reduc_def_info = def_stmt_info;
6325 reduc_index = i;
6326 }
6327
6328 if (i == 1 && code == COND_EXPR)
6329 {
6330 /* Record how value of COND_EXPR is defined. */
6331 if (dt == vect_constant_def)
6332 {
6333 cond_reduc_dt = dt;
6334 cond_reduc_val = ops[i];
6335 }
6336 if (dt == vect_induction_def
6337 && def_stmt_info
6338 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6339 {
6340 cond_reduc_dt = dt;
6341 cond_stmt_vinfo = def_stmt_info;
6342 }
6343 }
6344 }
6345
6346 if (!vectype_in)
6347 vectype_in = vectype_out;
6348
6349 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6350 directly used in stmt. */
6351 if (reduc_index == -1)
6352 {
6353 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6354 {
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "in-order reduction chain without SLP.\n");
6358 return false;
6359 }
6360
6361 if (orig_stmt_info)
6362 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6363 else
6364 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6365 }
6366
6367 if (! reduc_def_info)
6368 return false;
6369
6370 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6371 if (!reduc_def_phi)
6372 return false;
6373
6374 if (!(reduc_index == -1
6375 || dts[reduc_index] == vect_reduction_def
6376 || dts[reduc_index] == vect_nested_cycle
6377 || ((dts[reduc_index] == vect_internal_def
6378 || dts[reduc_index] == vect_external_def
6379 || dts[reduc_index] == vect_constant_def
6380 || dts[reduc_index] == vect_induction_def)
6381 && nested_cycle && found_nested_cycle_def)))
6382 {
6383 /* For pattern recognized stmts, orig_stmt might be a reduction,
6384 but some helper statements for the pattern might not, or
6385 might be COND_EXPRs with reduction uses in the condition. */
6386 gcc_assert (orig_stmt_info);
6387 return false;
6388 }
6389
6390 /* PHIs should not participate in patterns. */
6391 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6392 enum vect_reduction_type v_reduc_type
6393 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6394 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6395
6396 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6397 /* If we have a condition reduction, see if we can simplify it further. */
6398 if (v_reduc_type == COND_REDUCTION)
6399 {
6400 /* TODO: We can't yet handle reduction chains, since we need to treat
6401 each COND_EXPR in the chain specially, not just the last one.
6402 E.g. for:
6403
6404 x_1 = PHI <x_3, ...>
6405 x_2 = a_2 ? ... : x_1;
6406 x_3 = a_3 ? ... : x_2;
6407
6408 we're interested in the last element in x_3 for which a_2 || a_3
6409 is true, whereas the current reduction chain handling would
6410 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6411 as a reduction operation. */
6412 if (reduc_index == -1)
6413 {
6414 if (dump_enabled_p ())
6415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6416 "conditional reduction chains not supported\n");
6417 return false;
6418 }
6419
6420 /* vect_is_simple_reduction ensured that operand 2 is the
6421 loop-carried operand. */
6422 gcc_assert (reduc_index == 2);
6423
6424 /* Loop peeling modifies the initial value of the reduction PHI, which
6425 makes the reduction stmt to be transformed differ from the
6426 original stmt that was analyzed. We need to record the reduction code
6427 of a CONST_COND_REDUCTION type reduction at analysis time, so that
6428 it can be used directly at transform time. */
6429 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6430 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6431 {
6432 /* Also set the reduction type to CONST_COND_REDUCTION. */
6433 gcc_assert (cond_reduc_dt == vect_constant_def);
6434 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6435 }
6436 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6437 vectype_in, OPTIMIZE_FOR_SPEED))
6438 {
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6441 "optimizing condition reduction with"
6442 " FOLD_EXTRACT_LAST.\n");
6443 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6444 }
6445 else if (cond_reduc_dt == vect_induction_def)
6446 {
6447 tree base
6448 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6449 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6450
6451 gcc_assert (TREE_CODE (base) == INTEGER_CST
6452 && TREE_CODE (step) == INTEGER_CST);
6453 cond_reduc_val = NULL_TREE;
6454 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6455 MIN_EXPR; for now punt if BASE is the minimum value of the type
6456 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6457 if (tree_int_cst_sgn (step) == -1)
6458 {
6459 cond_reduc_op_code = MIN_EXPR;
6460 if (tree_int_cst_sgn (base) == -1)
6461 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6462 else if (tree_int_cst_lt (base,
6463 TYPE_MAX_VALUE (TREE_TYPE (base))))
6464 cond_reduc_val
6465 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6466 }
6467 else
6468 {
6469 cond_reduc_op_code = MAX_EXPR;
6470 if (tree_int_cst_sgn (base) == 1)
6471 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6472 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6473 base))
6474 cond_reduc_val
6475 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6476 }
6477 if (cond_reduc_val)
6478 {
6479 if (dump_enabled_p ())
6480 dump_printf_loc (MSG_NOTE, vect_location,
6481 "condition expression based on "
6482 "integer induction.\n");
6483 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6484 = INTEGER_INDUC_COND_REDUCTION;
6485 }
6486 }
6487 else if (cond_reduc_dt == vect_constant_def)
6488 {
6489 enum vect_def_type cond_initial_dt;
6490 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6491 tree cond_initial_val
6492 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6493
6494 gcc_assert (cond_reduc_val != NULL_TREE);
6495 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6496 if (cond_initial_dt == vect_constant_def
6497 && types_compatible_p (TREE_TYPE (cond_initial_val),
6498 TREE_TYPE (cond_reduc_val)))
6499 {
6500 tree e = fold_binary (LE_EXPR, boolean_type_node,
6501 cond_initial_val, cond_reduc_val);
6502 if (e && (integer_onep (e) || integer_zerop (e)))
6503 {
6504 if (dump_enabled_p ())
6505 dump_printf_loc (MSG_NOTE, vect_location,
6506 "condition expression based on "
6507 "compile time constant.\n");
6508 /* Record reduction code at analysis stage. */
6509 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6510 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6511 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6512 = CONST_COND_REDUCTION;
6513 }
6514 }
6515 }
6516 }
6517
6518 if (orig_stmt_info)
6519 gcc_assert (tmp == orig_stmt_info
6520 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6521 else
6522 /* We changed STMT to be the first stmt in the reduction chain, hence we
6523 check that in this case the first element in the chain is STMT. */
6524 gcc_assert (tmp == stmt_info
6525 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6526
6527 if (STMT_VINFO_LIVE_P (reduc_def_info))
6528 return false;
6529
6530 if (slp_node)
6531 ncopies = 1;
6532 else
6533 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6534
6535 gcc_assert (ncopies >= 1);
6536
6537 vec_mode = TYPE_MODE (vectype_in);
6538 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6539
6540 if (code == COND_EXPR)
6541 {
6542 /* Only call during the analysis stage, otherwise we'll lose
6543 STMT_VINFO_TYPE. */
6544 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6545 ops[reduc_index], 0, NULL,
6546 cost_vec))
6547 {
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6550 "unsupported condition in reduction\n");
6551 return false;
6552 }
6553 }
6554 else
6555 {
6556 /* 4. Supportable by target? */
6557
6558 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6559 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6560 {
6561 /* Shifts and rotates are only supported by vectorizable_shifts,
6562 not vectorizable_reduction. */
6563 if (dump_enabled_p ())
6564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6565 "unsupported shift or rotation.\n");
6566 return false;
6567 }
6568
6569 /* 4.1. check support for the operation in the loop */
6570 optab = optab_for_tree_code (code, vectype_in, optab_default);
6571 if (!optab)
6572 {
6573 if (dump_enabled_p ())
6574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6575 "no optab.\n");
6576
6577 return false;
6578 }
6579
6580 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6581 {
6582 if (dump_enabled_p ())
6583 dump_printf (MSG_NOTE, "op not supported by target.\n");
6584
6585 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6586 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6587 return false;
6588
6589 if (dump_enabled_p ())
6590 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6591 }
6592
6593 /* Worthwhile without SIMD support? */
6594 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6595 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6596 {
6597 if (dump_enabled_p ())
6598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6599 "not worthwhile without SIMD support.\n");
6600
6601 return false;
6602 }
6603 }
6604
6605 /* 4.2. Check support for the epilog operation.
6606
6607 If STMT represents a reduction pattern, then the type of the
6608 reduction variable may be different than the type of the rest
6609 of the arguments. For example, consider the case of accumulation
6610 of shorts into an int accumulator; The original code:
6611 S1: int_a = (int) short_a;
6612 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6613
6614 was replaced with:
6615 STMT: int_acc = widen_sum <short_a, int_acc>
6616
6617 This means that:
6618 1. The tree-code that is used to create the vector operation in the
6619 epilog code (that reduces the partial results) is not the
6620 tree-code of STMT, but is rather the tree-code of the original
6621 stmt from the pattern that STMT is replacing. I.e, in the example
6622 above we want to use 'widen_sum' in the loop, but 'plus' in the
6623 epilog.
6624 2. The type (mode) we use to check available target support
6625 for the vector operation to be created in the *epilog*, is
6626 determined by the type of the reduction variable (in the example
6627 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6628 However the type (mode) we use to check available target support
6629 for the vector operation to be created *inside the loop*, is
6630 determined by the type of the other arguments to STMT (in the
6631 example we'd check this: optab_handler (widen_sum_optab,
6632 vect_short_mode)).
6633
6634 This is contrary to "regular" reductions, in which the types of all
6635 the arguments are the same as the type of the reduction variable.
6636 For "regular" reductions we can therefore use the same vector type
6637 (and also the same tree-code) when generating the epilog code and
6638 when generating the code inside the loop. */
6639
6640 vect_reduction_type reduction_type
6641 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6642 if (orig_stmt_info
6643 && (reduction_type == TREE_CODE_REDUCTION
6644 || reduction_type == FOLD_LEFT_REDUCTION))
6645 {
6646 /* This is a reduction pattern: get the vectype from the type of the
6647 reduction variable, and get the tree-code from orig_stmt. */
6648 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6649 gcc_assert (vectype_out);
6650 vec_mode = TYPE_MODE (vectype_out);
6651 }
6652 else
6653 {
6654 /* Regular reduction: the same vectype and tree-code that are used for
6655 the vector code inside the loop can also be used for the epilog code. */
6656 orig_code = code;
6657
6658 if (code == MINUS_EXPR)
6659 orig_code = PLUS_EXPR;
6660
6661 /* For simple condition reductions, replace with the actual expression
6662 we want to base our reduction around. */
6663 if (reduction_type == CONST_COND_REDUCTION)
6664 {
6665 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6666 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6667 }
6668 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6669 orig_code = cond_reduc_op_code;
6670 }
6671
6672 if (nested_cycle)
6673 {
6674 def_bb = gimple_bb (reduc_def_phi);
6675 def_stmt_loop = def_bb->loop_father;
6676 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6677 loop_preheader_edge (def_stmt_loop));
6678 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6679 if (def_arg_stmt_info
6680 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6681 == vect_double_reduction_def))
6682 double_reduc = true;
6683 }
6684
6685 reduc_fn = IFN_LAST;
6686
6687 if (reduction_type == TREE_CODE_REDUCTION
6688 || reduction_type == FOLD_LEFT_REDUCTION
6689 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6690 || reduction_type == CONST_COND_REDUCTION)
6691 {
6692 if (reduction_type == FOLD_LEFT_REDUCTION
6693 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6694 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6695 {
6696 if (reduc_fn != IFN_LAST
6697 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6698 OPTIMIZE_FOR_SPEED))
6699 {
6700 if (dump_enabled_p ())
6701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6702 "reduc op not supported by target.\n");
6703
6704 reduc_fn = IFN_LAST;
6705 }
6706 }
6707 else
6708 {
6709 if (!nested_cycle || double_reduc)
6710 {
6711 if (dump_enabled_p ())
6712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6713 "no reduc code for scalar code.\n");
6714
6715 return false;
6716 }
6717 }
6718 }
6719 else if (reduction_type == COND_REDUCTION)
6720 {
6721 int scalar_precision
6722 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6723 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6724 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6725 nunits_out);
6726
6727 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6728 OPTIMIZE_FOR_SPEED))
6729 reduc_fn = IFN_REDUC_MAX;
6730 }
6731
6732 if (reduction_type != EXTRACT_LAST_REDUCTION
6733 && reduc_fn == IFN_LAST
6734 && !nunits_out.is_constant ())
6735 {
6736 if (dump_enabled_p ())
6737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6738 "missing target support for reduction on"
6739 " variable-length vectors.\n");
6740 return false;
6741 }
6742
6743 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6744 && ncopies > 1)
6745 {
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6748 "multiple types in double reduction or condition "
6749 "reduction.\n");
6750 return false;
6751 }
6752
6753 /* For SLP reductions, see if there is a neutral value we can use. */
6754 tree neutral_op = NULL_TREE;
6755 if (slp_node)
6756 neutral_op = neutral_op_for_slp_reduction
6757 (slp_node_instance->reduc_phis, code,
6758 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6759
6760 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6761 {
6762 /* We can't support in-order reductions of code such as this:
6763
6764 for (int i = 0; i < n1; ++i)
6765 for (int j = 0; j < n2; ++j)
6766 l += a[j];
6767
6768 since GCC effectively transforms the loop when vectorizing:
6769
6770 for (int i = 0; i < n1 / VF; ++i)
6771 for (int j = 0; j < n2; ++j)
6772 for (int k = 0; k < VF; ++k)
6773 l += a[j];
6774
6775 which is a reassociation of the original operation. */
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "in-order double reduction not supported.\n");
6779
6780 return false;
6781 }
6782
6783 if (reduction_type == FOLD_LEFT_REDUCTION
6784 && slp_node
6785 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6786 {
6787 /* We cannot use in-order reductions in this case because there is
6788 an implicit reassociation of the operations involved. */
6789 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 "in-order unchained SLP reductions not supported.\n");
6792 return false;
6793 }
6794
6795 /* For double reductions, and for SLP reductions with a neutral value,
6796 we construct a variable-length initial vector by loading a vector
6797 full of the neutral value and then shift-and-inserting the start
6798 values into the low-numbered elements. */
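      /* A hedged sketch (fixed-length analogue, not from the original
	 comments): for a sum reduction with start value INIT and neutral
	 value 0 on a 4-element vector the construction described above
	 amounts to

	   vec = { 0, 0, 0, 0 };            splat of the neutral value
	   vec = shl_insert (vec, INIT);    -> { INIT, 0, 0, 0 }

	 where shl_insert shifts the existing lanes up by one element and
	 writes the scalar into element 0.  For variable-length vectors this
	 splat + shift-and-insert sequence is the lane-count-agnostic way of
	 placing the start value(s), hence the IFN_VEC_SHL_INSERT requirement
	 checked just below.  */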
6799 if ((double_reduc || neutral_op)
6800 && !nunits_out.is_constant ()
6801 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6802 vectype_out, OPTIMIZE_FOR_SPEED))
6803 {
6804 if (dump_enabled_p ())
6805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6806 "reduction on variable-length vectors requires"
6807 " target support for a vector-shift-and-insert"
6808 " operation.\n");
6809 return false;
6810 }
6811
6812 /* Check extra constraints for variable-length unchained SLP reductions. */
6813 if (STMT_SLP_TYPE (stmt_info)
6814 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6815 && !nunits_out.is_constant ())
6816 {
6817 /* We checked above that we could build the initial vector when
6818 there's a neutral element value. Check here for the case in
6819 which each SLP statement has its own initial value and in which
6820 that value needs to be repeated for every instance of the
6821 statement within the initial vector. */
6822 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6823 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6824 if (!neutral_op
6825 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6826 {
6827 if (dump_enabled_p ())
6828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6829 "unsupported form of SLP reduction for"
6830 " variable-length vectors: cannot build"
6831 " initial vector.\n");
6832 return false;
6833 }
6834 /* The epilogue code relies on the number of elements being a multiple
6835 of the group size. The duplicate-and-interleave approach to setting
6836 up the initial vector does too. */
6837 if (!multiple_p (nunits_out, group_size))
6838 {
6839 if (dump_enabled_p ())
6840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6841 "unsupported form of SLP reduction for"
6842 " variable-length vectors: the vector size"
6843 " is not a multiple of the number of results.\n");
6844 return false;
6845 }
6846 }
6847
6848 /* In case of widening multiplication by a constant, we update the type
6849 of the constant to be the type of the other operand. We check that the
6850 constant fits the type in the pattern recognition pass. */
6851 if (code == DOT_PROD_EXPR
6852 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6853 {
6854 if (TREE_CODE (ops[0]) == INTEGER_CST)
6855 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6856 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6857 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6858 else
6859 {
6860 if (dump_enabled_p ())
6861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6862 "invalid types in dot-prod\n");
6863
6864 return false;
6865 }
6866 }
6867
6868 if (reduction_type == COND_REDUCTION)
6869 {
6870 widest_int ni;
6871
6872 if (! max_loop_iterations (loop, &ni))
6873 {
6874 if (dump_enabled_p ())
6875 dump_printf_loc (MSG_NOTE, vect_location,
6876 "loop count not known, cannot create cond "
6877 "reduction.\n");
6878 return false;
6879 }
6880 /* Convert backedges to iterations. */
6881 ni += 1;
6882
6883 /* The additional index will be the same type as the condition. Check
6884 that the loop iteration count fits into this type less one (the zero
6885 value is reserved for the case where there are no matches). */
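      /* A hedged worked example: if cr_index_scalar_type ends up as an
	 8-bit unsigned type, the induced index values are 1, 2, ... per
	 iteration with 0 reserved for "no match", so the check below
	 requires the iteration count to stay below 255, the type's maximum
	 value.  */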
6886 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6887 if (wi::geu_p (ni, wi::to_widest (max_index)))
6888 {
6889 if (dump_enabled_p ())
6890 dump_printf_loc (MSG_NOTE, vect_location,
6891 "loop size is greater than data size.\n");
6892 return false;
6893 }
6894 }
6895
6896 /* In case the vectorization factor (VF) is bigger than the number
6897 of elements that we can fit in a vectype (nunits), we have to generate
6898 more than one vector stmt, i.e., we need to "unroll" the
6899 vector stmt by a factor VF/nunits. For more details see documentation
6900 in vectorizable_operation. */
6901
6902 /* If the reduction is used in an outer loop we need to generate
6903 VF intermediate results, like so (e.g. for ncopies=2):
6904 r0 = phi (init, r0)
6905 r1 = phi (init, r1)
6906 r0 = x0 + r0;
6907 r1 = x1 + r1;
6908 (i.e. we generate VF results in 2 registers).
6909 In this case we have a separate def-use cycle for each copy, and therefore
6910 for each copy we get the vector def for the reduction variable from the
6911 respective phi node created for this copy.
6912
6913 Otherwise (the reduction is unused in the loop nest), we can combine
6914 together intermediate results, like so (e.g. for ncopies=2):
6915 r = phi (init, r)
6916 r = x0 + r;
6917 r = x1 + r;
6918 (i.e. we generate VF/2 results in a single register).
6919 In this case for each copy we get the vector def for the reduction variable
6920 from the vectorized reduction operation generated in the previous iteration.
6921
6922 This only works when we see both the reduction PHI and its only consumer
6923 in vectorizable_reduction and there are no intermediate stmts
6924 participating. */
6925 stmt_vec_info use_stmt_info;
6926 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6927 if (ncopies > 1
6928 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6929 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6930 && (use_stmt_info == stmt_info
6931 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt_info))
6932 {
6933 single_defuse_cycle = true;
6934 epilog_copies = 1;
6935 }
6936 else
6937 epilog_copies = ncopies;
6938
6939 /* If the reduction stmt is one of the patterns that have a lane-reducing
6940 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6941 if ((ncopies > 1
6942 && ! single_defuse_cycle)
6943 && (code == DOT_PROD_EXPR
6944 || code == WIDEN_SUM_EXPR
6945 || code == SAD_EXPR))
6946 {
6947 if (dump_enabled_p ())
6948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6949 "multi def-use cycle not possible for lane-reducing "
6950 "reduction operation\n");
6951 return false;
6952 }
6953
6954 if (slp_node)
6955 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6956 else
6957 vec_num = 1;
6958
6959 internal_fn cond_fn = get_conditional_internal_fn (code);
6960 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6961
6962 if (!vec_stmt) /* transformation not required. */
6963 {
6964 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6965 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6966 {
6967 if (reduction_type != FOLD_LEFT_REDUCTION
6968 && (cond_fn == IFN_LAST
6969 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6970 OPTIMIZE_FOR_SPEED)))
6971 {
6972 if (dump_enabled_p ())
6973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6974 "can't use a fully-masked loop because no"
6975 " conditional operation is available.\n");
6976 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6977 }
6978 else if (reduc_index == -1)
6979 {
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 "can't use a fully-masked loop for chained"
6983 " reductions.\n");
6984 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6985 }
6986 else
6987 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6988 vectype_in);
6989 }
6990 if (dump_enabled_p ()
6991 && reduction_type == FOLD_LEFT_REDUCTION)
6992 dump_printf_loc (MSG_NOTE, vect_location,
6993 "using an in-order (fold-left) reduction.\n");
6994 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6995 return true;
6996 }
6997
6998 /* Transform. */
6999
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7002
7003 /* FORNOW: Multiple types are not supported for condition. */
7004 if (code == COND_EXPR)
7005 gcc_assert (ncopies == 1);
7006
7007 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7008
7009 if (reduction_type == FOLD_LEFT_REDUCTION)
7010 return vectorize_fold_left_reduction
7011 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7012 reduc_fn, ops, vectype_in, reduc_index, masks);
7013
7014 if (reduction_type == EXTRACT_LAST_REDUCTION)
7015 {
7016 gcc_assert (!slp_node);
7017 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7018 NULL, reduc_index, NULL, NULL);
7019 }
7020
7021 /* Create the destination vector */
7022 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7023
7024 prev_stmt_info = NULL;
7025 prev_phi_info = NULL;
7026 if (!slp_node)
7027 {
7028 vec_oprnds0.create (1);
7029 vec_oprnds1.create (1);
7030 if (op_type == ternary_op)
7031 vec_oprnds2.create (1);
7032 }
7033
7034 phis.create (vec_num);
7035 vect_defs.create (vec_num);
7036 if (!slp_node)
7037 vect_defs.quick_push (NULL_TREE);
7038
7039 if (slp_node)
7040 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7041 else
7042 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7043
7044 for (j = 0; j < ncopies; j++)
7045 {
7046 if (code == COND_EXPR)
7047 {
7048 gcc_assert (!slp_node);
7049 vectorizable_condition (stmt_info, gsi, vec_stmt,
7050 PHI_RESULT (phis[0]->stmt),
7051 reduc_index, NULL, NULL);
7052 /* Multiple types are not supported for condition. */
7053 break;
7054 }
7055
7056 /* Handle uses. */
7057 if (j == 0)
7058 {
7059 if (slp_node)
7060 {
7061 /* Get vec defs for all the operands except the reduction index,
7062 ensuring the ordering of the ops in the vector is kept. */
7063 auto_vec<tree, 3> slp_ops;
7064 auto_vec<vec<tree>, 3> vec_defs;
7065
7066 slp_ops.quick_push (ops[0]);
7067 slp_ops.quick_push (ops[1]);
7068 if (op_type == ternary_op)
7069 slp_ops.quick_push (ops[2]);
7070
7071 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7072
7073 vec_oprnds0.safe_splice (vec_defs[0]);
7074 vec_defs[0].release ();
7075 vec_oprnds1.safe_splice (vec_defs[1]);
7076 vec_defs[1].release ();
7077 if (op_type == ternary_op)
7078 {
7079 vec_oprnds2.safe_splice (vec_defs[2]);
7080 vec_defs[2].release ();
7081 }
7082 }
7083 else
7084 {
7085 vec_oprnds0.quick_push
7086 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7087 vec_oprnds1.quick_push
7088 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7089 if (op_type == ternary_op)
7090 vec_oprnds2.quick_push
7091 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7092 }
7093 }
7094 else
7095 {
7096 if (!slp_node)
7097 {
7098 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7099
7100 if (single_defuse_cycle && reduc_index == 0)
7101 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7102 else
7103 vec_oprnds0[0]
7104 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7105 vec_oprnds0[0]);
7106 if (single_defuse_cycle && reduc_index == 1)
7107 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7108 else
7109 vec_oprnds1[0]
7110 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7111 vec_oprnds1[0]);
7112 if (op_type == ternary_op)
7113 {
7114 if (single_defuse_cycle && reduc_index == 2)
7115 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7116 else
7117 vec_oprnds2[0]
7118 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7119 vec_oprnds2[0]);
7120 }
7121 }
7122 }
7123
7124 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7125 {
7126 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7127 if (masked_loop_p)
7128 {
7129 /* Make sure that the reduction accumulator is vop[0]. */
7130 if (reduc_index == 1)
7131 {
7132 gcc_assert (commutative_tree_code (code));
7133 std::swap (vop[0], vop[1]);
7134 }
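	      /* Added descriptive note: the conditional internal function
		 built below takes the loop mask, the two operands, and an
		 "else" value; passing the accumulator vop[0] as the else
		 value makes inactive lanes simply keep their previous
		 partial result.  */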
7135 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7136 vectype_in, i * ncopies + j);
7137 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7138 vop[0], vop[1],
7139 vop[0]);
7140 new_temp = make_ssa_name (vec_dest, call);
7141 gimple_call_set_lhs (call, new_temp);
7142 gimple_call_set_nothrow (call, true);
7143 new_stmt_info
7144 = vect_finish_stmt_generation (stmt_info, call, gsi);
7145 }
7146 else
7147 {
7148 if (op_type == ternary_op)
7149 vop[2] = vec_oprnds2[i];
7150
7151 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7152 vop[0], vop[1], vop[2]);
7153 new_temp = make_ssa_name (vec_dest, new_stmt);
7154 gimple_assign_set_lhs (new_stmt, new_temp);
7155 new_stmt_info
7156 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7157 }
7158
7159 if (slp_node)
7160 {
7161 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7162 vect_defs.quick_push (new_temp);
7163 }
7164 else
7165 vect_defs[0] = new_temp;
7166 }
7167
7168 if (slp_node)
7169 continue;
7170
7171 if (j == 0)
7172 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7173 else
7174 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7175
7176 prev_stmt_info = new_stmt_info;
7177 }
7178
7179 /* Finalize the reduction-phi (set its arguments) and create the
7180 epilog reduction code. */
7181 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7182 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7183
7184 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7185 epilog_copies, reduc_fn, phis,
7186 double_reduc, slp_node, slp_node_instance,
7187 cond_reduc_val, cond_reduc_op_code,
7188 neutral_op);
7189
7190 return true;
7191 }
7192
7193 /* Function vect_min_worthwhile_factor.
7194
7195 For a loop where we could vectorize the operation indicated by CODE,
7196 return the minimum vectorization factor that makes it worthwhile
7197 to use generic vectors. */
7198 static unsigned int
7199 vect_min_worthwhile_factor (enum tree_code code)
7200 {
7201 switch (code)
7202 {
7203 case PLUS_EXPR:
7204 case MINUS_EXPR:
7205 case NEGATE_EXPR:
7206 return 4;
7207
7208 case BIT_AND_EXPR:
7209 case BIT_IOR_EXPR:
7210 case BIT_XOR_EXPR:
7211 case BIT_NOT_EXPR:
7212 return 2;
7213
7214 default:
7215 return INT_MAX;
7216 }
7217 }
7218
7219 /* Return true if VINFO indicates we are doing loop vectorization and if
7220 it is worth decomposing CODE operations into scalar operations for
7221 that loop's vectorization factor. */
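   /* A hedged worked example: with the table above, a PLUS_EXPR is only
      considered worthwhile without real SIMD support when the loop's
      (constant) vectorization factor is at least 4, e.g.

	vect_worthwhile_without_simd_p (loop_vinfo, PLUS_EXPR)
	  -> true  for VF == 8
	  -> false for VF == 2

      while the cheaper bitwise codes only need VF >= 2.  */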
7222
7223 bool
7224 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7225 {
7226 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7227 unsigned HOST_WIDE_INT value;
7228 return (loop_vinfo
7229 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7230 && value >= vect_min_worthwhile_factor (code));
7231 }
7232
7233 /* Function vectorizable_induction
7234
7235 Check if STMT_INFO performs an induction computation that can be vectorized.
7236 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7237 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7238 Return true if STMT_INFO is vectorizable in this way. */
7239
7240 bool
7241 vectorizable_induction (stmt_vec_info stmt_info,
7242 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7243 stmt_vec_info *vec_stmt, slp_tree slp_node,
7244 stmt_vector_for_cost *cost_vec)
7245 {
7246 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7248 unsigned ncopies;
7249 bool nested_in_vect_loop = false;
7250 struct loop *iv_loop;
7251 tree vec_def;
7252 edge pe = loop_preheader_edge (loop);
7253 basic_block new_bb;
7254 tree new_vec, vec_init, vec_step, t;
7255 tree new_name;
7256 gimple *new_stmt;
7257 gphi *induction_phi;
7258 tree induc_def, vec_dest;
7259 tree init_expr, step_expr;
7260 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7261 unsigned i;
7262 tree expr;
7263 gimple_seq stmts;
7264 imm_use_iterator imm_iter;
7265 use_operand_p use_p;
7266 gimple *exit_phi;
7267 edge latch_e;
7268 tree loop_arg;
7269 gimple_stmt_iterator si;
7270
7271 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7272 if (!phi)
7273 return false;
7274
7275 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7276 return false;
7277
7278 /* Make sure it was recognized as induction computation. */
7279 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7280 return false;
7281
7282 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7283 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7284
7285 if (slp_node)
7286 ncopies = 1;
7287 else
7288 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7289 gcc_assert (ncopies >= 1);
7290
7291 /* FORNOW. These restrictions should be relaxed. */
7292 if (nested_in_vect_loop_p (loop, stmt_info))
7293 {
7294 imm_use_iterator imm_iter;
7295 use_operand_p use_p;
7296 gimple *exit_phi;
7297 edge latch_e;
7298 tree loop_arg;
7299
7300 if (ncopies > 1)
7301 {
7302 if (dump_enabled_p ())
7303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7304 "multiple types in nested loop.\n");
7305 return false;
7306 }
7307
7308 /* FORNOW: outer loop induction with SLP not supported. */
7309 if (STMT_SLP_TYPE (stmt_info))
7310 return false;
7311
7312 exit_phi = NULL;
7313 latch_e = loop_latch_edge (loop->inner);
7314 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7315 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7316 {
7317 gimple *use_stmt = USE_STMT (use_p);
7318 if (is_gimple_debug (use_stmt))
7319 continue;
7320
7321 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7322 {
7323 exit_phi = use_stmt;
7324 break;
7325 }
7326 }
7327 if (exit_phi)
7328 {
7329 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7330 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7331 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7332 {
7333 if (dump_enabled_p ())
7334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7335 "inner-loop induction only used outside "
7336 "of the outer vectorized loop.\n");
7337 return false;
7338 }
7339 }
7340
7341 nested_in_vect_loop = true;
7342 iv_loop = loop->inner;
7343 }
7344 else
7345 iv_loop = loop;
7346 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7347
7348 if (slp_node && !nunits.is_constant ())
7349 {
7350 /* The current SLP code creates the initial value element-by-element. */
7351 if (dump_enabled_p ())
7352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7353 "SLP induction not supported for variable-length"
7354 " vectors.\n");
7355 return false;
7356 }
7357
7358 if (!vec_stmt) /* transformation not required. */
7359 {
7360 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7361 DUMP_VECT_SCOPE ("vectorizable_induction");
7362 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7363 return true;
7364 }
7365
7366 /* Transform. */
7367
7368 /* Compute a vector variable, initialized with the first VF values of
7369 the induction variable. E.g., for an iv with IV_PHI='X' and
7370 evolution S, for a vector of 4 units, we want to compute:
7371 [X, X + S, X + 2*S, X + 3*S]. */
7372
7373 if (dump_enabled_p ())
7374 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7375
7376 latch_e = loop_latch_edge (iv_loop);
7377 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7378
7379 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7380 gcc_assert (step_expr != NULL_TREE);
7381
7382 pe = loop_preheader_edge (iv_loop);
7383 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7384 loop_preheader_edge (iv_loop));
7385
7386 stmts = NULL;
7387 if (!nested_in_vect_loop)
7388 {
7389 /* Convert the initial value to the desired type. */
7390 tree new_type = TREE_TYPE (vectype);
7391 init_expr = gimple_convert (&stmts, new_type, init_expr);
7392
7393 /* If we are using the loop mask to "peel" for alignment then we need
7394 to adjust the start value here. */
7395 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7396 if (skip_niters != NULL_TREE)
7397 {
7398 if (FLOAT_TYPE_P (vectype))
7399 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7400 skip_niters);
7401 else
7402 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7403 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7404 skip_niters, step_expr);
7405 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7406 init_expr, skip_step);
7407 }
7408 }
7409
7410 /* Convert the step to the desired type. */
7411 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7412
7413 if (stmts)
7414 {
7415 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7416 gcc_assert (!new_bb);
7417 }
7418
7419 /* Find the first insertion point in the BB. */
7420 basic_block bb = gimple_bb (phi);
7421 si = gsi_after_labels (bb);
7422
7423 /* For SLP induction we have to generate several IVs.  For example,
7424 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7425 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7426 [VF*S, VF*S, VF*S, VF*S] for all. */
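      /* A hedged worked example: for the group-size-3 case above with
	 const_nunits == 4 we build nivs = lcm (3, 4) / 4 = 3 distinct IV
	 vectors; any further copies are derived below by adding a step of
	 VF' * S with VF' = lcm (3, 4) / 3 = 4, i.e. by re-using one of the
	 three IVs advanced by four scalar steps.  */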
7427 if (slp_node)
7428 {
7429 /* Enforced above. */
7430 unsigned int const_nunits = nunits.to_constant ();
7431
7432 /* Generate [VF*S, VF*S, ... ]. */
7433 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7434 {
7435 expr = build_int_cst (integer_type_node, vf);
7436 expr = fold_convert (TREE_TYPE (step_expr), expr);
7437 }
7438 else
7439 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7440 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7441 expr, step_expr);
7442 if (! CONSTANT_CLASS_P (new_name))
7443 new_name = vect_init_vector (stmt_info, new_name,
7444 TREE_TYPE (step_expr), NULL);
7445 new_vec = build_vector_from_val (vectype, new_name);
7446 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7447
7448 /* Now generate the IVs. */
7449 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7450 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7451 unsigned elts = const_nunits * nvects;
7452 unsigned nivs = least_common_multiple (group_size,
7453 const_nunits) / const_nunits;
7454 gcc_assert (elts % group_size == 0);
7455 tree elt = init_expr;
7456 unsigned ivn;
7457 for (ivn = 0; ivn < nivs; ++ivn)
7458 {
7459 tree_vector_builder elts (vectype, const_nunits, 1);
7460 stmts = NULL;
7461 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7462 {
7463 if (ivn*const_nunits + eltn >= group_size
7464 && (ivn * const_nunits + eltn) % group_size == 0)
7465 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7466 elt, step_expr);
7467 elts.quick_push (elt);
7468 }
7469 vec_init = gimple_build_vector (&stmts, &elts);
7470 if (stmts)
7471 {
7472 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7473 gcc_assert (!new_bb);
7474 }
7475
7476 /* Create the induction-phi that defines the induction-operand. */
7477 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7478 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7479 stmt_vec_info induction_phi_info
7480 = loop_vinfo->add_stmt (induction_phi);
7481 induc_def = PHI_RESULT (induction_phi);
7482
7483 /* Create the iv update inside the loop */
7484 vec_def = make_ssa_name (vec_dest);
7485 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7486 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7487 loop_vinfo->add_stmt (new_stmt);
7488
7489 /* Set the arguments of the phi node: */
7490 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7491 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7492 UNKNOWN_LOCATION);
7493
7494 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7495 }
7496
7497 /* Re-use IVs when we can. */
7498 if (ivn < nvects)
7499 {
7500 unsigned vfp
7501 = least_common_multiple (group_size, const_nunits) / group_size;
7502 /* Generate [VF'*S, VF'*S, ... ]. */
7503 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7504 {
7505 expr = build_int_cst (integer_type_node, vfp);
7506 expr = fold_convert (TREE_TYPE (step_expr), expr);
7507 }
7508 else
7509 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7510 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7511 expr, step_expr);
7512 if (! CONSTANT_CLASS_P (new_name))
7513 new_name = vect_init_vector (stmt_info, new_name,
7514 TREE_TYPE (step_expr), NULL);
7515 new_vec = build_vector_from_val (vectype, new_name);
7516 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7517 for (; ivn < nvects; ++ivn)
7518 {
7519 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7520 tree def;
7521 if (gimple_code (iv) == GIMPLE_PHI)
7522 def = gimple_phi_result (iv);
7523 else
7524 def = gimple_assign_lhs (iv);
7525 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7526 PLUS_EXPR,
7527 def, vec_step);
7528 if (gimple_code (iv) == GIMPLE_PHI)
7529 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7530 else
7531 {
7532 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7533 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7534 }
7535 SLP_TREE_VEC_STMTS (slp_node).quick_push
7536 (loop_vinfo->add_stmt (new_stmt));
7537 }
7538 }
7539
7540 return true;
7541 }
7542
7543 /* Create the vector that holds the initial_value of the induction. */
7544 if (nested_in_vect_loop)
7545 {
7546 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7547 been created during vectorization of previous stmts. We obtain it
7548 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7549 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7550 /* If the initial value is not of proper type, convert it. */
7551 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7552 {
7553 new_stmt
7554 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7555 vect_simple_var,
7556 "vec_iv_"),
7557 VIEW_CONVERT_EXPR,
7558 build1 (VIEW_CONVERT_EXPR, vectype,
7559 vec_init));
7560 vec_init = gimple_assign_lhs (new_stmt);
7561 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7562 new_stmt);
7563 gcc_assert (!new_bb);
7564 loop_vinfo->add_stmt (new_stmt);
7565 }
7566 }
7567 else
7568 {
7569 /* iv_loop is the loop to be vectorized. Create:
7570 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7571 stmts = NULL;
7572 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7573
7574 unsigned HOST_WIDE_INT const_nunits;
7575 if (nunits.is_constant (&const_nunits))
7576 {
7577 tree_vector_builder elts (vectype, const_nunits, 1);
7578 elts.quick_push (new_name);
7579 for (i = 1; i < const_nunits; i++)
7580 {
7581 /* Create: new_name_i = new_name + step_expr */
7582 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7583 new_name, step_expr);
7584 elts.quick_push (new_name);
7585 }
7586 /* Create a vector from [new_name_0, new_name_1, ...,
7587 new_name_nunits-1] */
7588 vec_init = gimple_build_vector (&stmts, &elts);
7589 }
7590 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7591 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7592 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7593 new_name, step_expr);
7594 else
7595 {
7596 /* Build:
7597 [base, base, base, ...]
7598 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7599 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7600 gcc_assert (flag_associative_math);
7601 tree index = build_index_vector (vectype, 0, 1);
7602 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7603 new_name);
7604 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7605 step_expr);
7606 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7607 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7608 vec_init, step_vec);
7609 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7610 vec_init, base_vec);
7611 }
7612
7613 if (stmts)
7614 {
7615 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7616 gcc_assert (!new_bb);
7617 }
7618 }
7619
7620
7621 /* Create the vector that holds the step of the induction. */
7622 if (nested_in_vect_loop)
7623 /* iv_loop is nested in the loop to be vectorized. Generate:
7624 vec_step = [S, S, S, S] */
7625 new_name = step_expr;
7626 else
7627 {
7628 /* iv_loop is the loop to be vectorized. Generate:
7629 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7630 gimple_seq seq = NULL;
7631 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7632 {
7633 expr = build_int_cst (integer_type_node, vf);
7634 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7635 }
7636 else
7637 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7638 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7639 expr, step_expr);
7640 if (seq)
7641 {
7642 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7643 gcc_assert (!new_bb);
7644 }
7645 }
7646
7647 t = unshare_expr (new_name);
7648 gcc_assert (CONSTANT_CLASS_P (new_name)
7649 || TREE_CODE (new_name) == SSA_NAME);
7650 new_vec = build_vector_from_val (vectype, t);
7651 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7652
7653
7654 /* Create the following def-use cycle:
7655 loop prolog:
7656 vec_init = ...
7657 vec_step = ...
7658 loop:
7659 vec_iv = PHI <vec_init, vec_loop>
7660 ...
7661 STMT
7662 ...
7663 vec_loop = vec_iv + vec_step; */
7664
7665 /* Create the induction-phi that defines the induction-operand. */
7666 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7667 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7668 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7669 induc_def = PHI_RESULT (induction_phi);
7670
7671 /* Create the iv update inside the loop */
7672 vec_def = make_ssa_name (vec_dest);
7673 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7674 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7675 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7676
7677 /* Set the arguments of the phi node: */
7678 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7679 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7680 UNKNOWN_LOCATION);
7681
7682 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7683
7684 /* In case the vectorization factor (VF) is bigger than the number
7685 of elements that we can fit in a vectype (nunits), we have to generate
7686 more than one vector stmt, i.e., we need to "unroll" the
7687 vector stmt by a factor VF/nunits. For more details see documentation
7688 in vectorizable_operation. */
7689
7690 if (ncopies > 1)
7691 {
7692 gimple_seq seq = NULL;
7693 stmt_vec_info prev_stmt_vinfo;
7694 /* FORNOW. This restriction should be relaxed. */
7695 gcc_assert (!nested_in_vect_loop);
7696
7697 /* Create the vector that holds the step of the induction. */
7698 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7699 {
7700 expr = build_int_cst (integer_type_node, nunits);
7701 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7702 }
7703 else
7704 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7705 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7706 expr, step_expr);
7707 if (seq)
7708 {
7709 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7710 gcc_assert (!new_bb);
7711 }
7712
7713 t = unshare_expr (new_name);
7714 gcc_assert (CONSTANT_CLASS_P (new_name)
7715 || TREE_CODE (new_name) == SSA_NAME);
7716 new_vec = build_vector_from_val (vectype, t);
7717 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7718
7719 vec_def = induc_def;
7720 prev_stmt_vinfo = induction_phi_info;
7721 for (i = 1; i < ncopies; i++)
7722 {
7723 /* vec_i = vec_prev + vec_step */
7724 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7725 vec_def, vec_step);
7726 vec_def = make_ssa_name (vec_dest, new_stmt);
7727 gimple_assign_set_lhs (new_stmt, vec_def);
7728
7729 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7730 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7731 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7732 prev_stmt_vinfo = new_stmt_info;
7733 }
7734 }
7735
7736 if (nested_in_vect_loop)
7737 {
7738 /* Find the loop-closed exit-phi of the induction, and record
7739 the final vector of induction results: */
7740 exit_phi = NULL;
7741 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7742 {
7743 gimple *use_stmt = USE_STMT (use_p);
7744 if (is_gimple_debug (use_stmt))
7745 continue;
7746
7747 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7748 {
7749 exit_phi = use_stmt;
7750 break;
7751 }
7752 }
7753 if (exit_phi)
7754 {
7755 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7756 /* FORNOW. Currently not supporting the case that an inner-loop induction
7757 is not used in the outer-loop (i.e. only outside the outer-loop). */
7758 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7759 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7760
7761 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7762 if (dump_enabled_p ())
7763 {
7764 dump_printf_loc (MSG_NOTE, vect_location,
7765 "vector of inductions after inner-loop:");
7766 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7767 }
7768 }
7769 }
7770
7771
7772 if (dump_enabled_p ())
7773 {
7774 dump_printf_loc (MSG_NOTE, vect_location,
7775 "transform induction: created def-use cycle: ");
7776 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7777 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7778 SSA_NAME_DEF_STMT (vec_def), 0);
7779 }
7780
7781 return true;
7782 }
7783
7784 /* Function vectorizable_live_operation.
7785
7786 STMT_INFO computes a value that is used outside the loop. Check if
7787 it can be supported. */
7788
7789 bool
7790 vectorizable_live_operation (stmt_vec_info stmt_info,
7791 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7792 slp_tree slp_node, int slp_index,
7793 stmt_vec_info *vec_stmt,
7794 stmt_vector_for_cost *)
7795 {
7796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7797 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7798 imm_use_iterator imm_iter;
7799 tree lhs, lhs_type, bitsize, vec_bitsize;
7800 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7801 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7802 int ncopies;
7803 gimple *use_stmt;
7804 auto_vec<tree> vec_oprnds;
7805 int vec_entry = 0;
7806 poly_uint64 vec_index = 0;
7807
7808 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7809
7810 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7811 return false;
7812
7813 /* FORNOW. CHECKME. */
7814 if (nested_in_vect_loop_p (loop, stmt_info))
7815 return false;
7816
7817 /* If STMT is not relevant and it is a simple assignment and its inputs are
7818 invariant then it can remain in place, unvectorized. The original last
7819 scalar value that it computes will be used. */
7820 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7821 {
7822 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7823 if (dump_enabled_p ())
7824 dump_printf_loc (MSG_NOTE, vect_location,
7825 "statement is simple and uses invariant. Leaving in "
7826 "place.\n");
7827 return true;
7828 }
7829
7830 if (slp_node)
7831 ncopies = 1;
7832 else
7833 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7834
7835 if (slp_node)
7836 {
7837 gcc_assert (slp_index >= 0);
7838
7839 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7840 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7841
7842 /* Get the last occurrence of the scalar index from the concatenation of
7843 all the slp vectors. Calculate which slp vector it is and the index
7844 within. */
7845 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
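      /* A hedged worked example: with num_scalar == 4, num_vec == 2 and
	 nunits == 4 the concatenation of the SLP vectors holds 8 lanes, and
	 the last occurrence of scalar SLP_INDEX sits at
	 pos = 2 * 4 - 4 + slp_index, so slp_index == 1 gives pos == 5,
	 i.e. vec_entry == 1 and vec_index == 1 below.  */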
7846
7847 /* Calculate which vector contains the result, and which lane of
7848 that vector we need. */
7849 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7850 {
7851 if (dump_enabled_p ())
7852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7853 "Cannot determine which vector holds the"
7854 " final result.\n");
7855 return false;
7856 }
7857 }
7858
7859 if (!vec_stmt)
7860 {
7861 /* No transformation required. */
7862 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7863 {
7864 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7865 OPTIMIZE_FOR_SPEED))
7866 {
7867 if (dump_enabled_p ())
7868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7869 "can't use a fully-masked loop because "
7870 "the target doesn't support extract last "
7871 "reduction.\n");
7872 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7873 }
7874 else if (slp_node)
7875 {
7876 if (dump_enabled_p ())
7877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7878 "can't use a fully-masked loop because an "
7879 "SLP statement is live after the loop.\n");
7880 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7881 }
7882 else if (ncopies > 1)
7883 {
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "can't use a fully-masked loop because"
7887 " ncopies is greater than 1.\n");
7888 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7889 }
7890 else
7891 {
7892 gcc_assert (ncopies == 1 && !slp_node);
7893 vect_record_loop_mask (loop_vinfo,
7894 &LOOP_VINFO_MASKS (loop_vinfo),
7895 1, vectype);
7896 }
7897 }
7898 return true;
7899 }
7900
7901 /* If stmt has a related stmt, then use that for getting the lhs. */
7902 gimple *stmt = (is_pattern_stmt_p (stmt_info)
7903 ? STMT_VINFO_RELATED_STMT (stmt_info)->stmt
7904 : stmt_info->stmt);
7905
7906 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7907 : gimple_get_lhs (stmt);
7908 lhs_type = TREE_TYPE (lhs);
7909
7910 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7911 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7912 : TYPE_SIZE (TREE_TYPE (vectype)));
7913 vec_bitsize = TYPE_SIZE (vectype);
7914
7915 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7916 tree vec_lhs, bitstart;
7917 if (slp_node)
7918 {
7919 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7920
7921 /* Get the correct slp vectorized stmt. */
7922 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7923 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7924 vec_lhs = gimple_phi_result (phi);
7925 else
7926 vec_lhs = gimple_get_lhs (vec_stmt);
7927
7928 /* Get entry to use. */
7929 bitstart = bitsize_int (vec_index);
7930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7931 }
7932 else
7933 {
7934 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7935 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7936 gcc_checking_assert (ncopies == 1
7937 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7938
7939 /* For multiple copies, get the last copy. */
7940 for (int i = 1; i < ncopies; ++i)
7941 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7942
7943 /* Get the last lane in the vector. */
7944 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7945 }
7946
7947 gimple_seq stmts = NULL;
7948 tree new_tree;
7949 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7950 {
7951 /* Emit:
7952
7953 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7954
7955 where VEC_LHS is the vectorized live-out result and MASK is
7956 the loop mask for the final iteration. */
7957 gcc_assert (ncopies == 1 && !slp_node);
7958 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7959 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7960 1, vectype, 0);
7961 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7962 scalar_type, mask, vec_lhs);
7963
7964 /* Convert the extracted vector element to the required scalar type. */
7965 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7966 }
7967 else
7968 {
7969 tree bftype = TREE_TYPE (vectype);
7970 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7971 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7972 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7973 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7974 &stmts, true, NULL_TREE);
7975 }
7976
7977 if (stmts)
7978 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7979
7980 /* Replace use of lhs with newly computed result. If the use stmt is a
7981 single arg PHI, just replace all uses of PHI result. It's necessary
7982 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7983 use_operand_p use_p;
7984 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7985 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7986 && !is_gimple_debug (use_stmt))
7987 {
7988 if (gimple_code (use_stmt) == GIMPLE_PHI
7989 && gimple_phi_num_args (use_stmt) == 1)
7990 {
7991 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7992 }
7993 else
7994 {
7995 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7996 SET_USE (use_p, new_tree);
7997 }
7998 update_stmt (use_stmt);
7999 }
8000
8001 return true;
8002 }
8003
8004 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8005
8006 static void
8007 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8008 {
8009 ssa_op_iter op_iter;
8010 imm_use_iterator imm_iter;
8011 def_operand_p def_p;
8012 gimple *ustmt;
8013
8014 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8015 {
8016 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8017 {
8018 basic_block bb;
8019
8020 if (!is_gimple_debug (ustmt))
8021 continue;
8022
8023 bb = gimple_bb (ustmt);
8024
8025 if (!flow_bb_inside_loop_p (loop, bb))
8026 {
8027 if (gimple_debug_bind_p (ustmt))
8028 {
8029 if (dump_enabled_p ())
8030 dump_printf_loc (MSG_NOTE, vect_location,
8031 "killing debug use\n");
8032
8033 gimple_debug_bind_reset_value (ustmt);
8034 update_stmt (ustmt);
8035 }
8036 else
8037 gcc_unreachable ();
8038 }
8039 }
8040 }
8041 }
8042
8043 /* Given loop represented by LOOP_VINFO, return true if computation of
8044 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8045 otherwise. */
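/* A hedged worked example: if the loop's niters expression has a 16-bit
   unsigned type and the loop runs 0x10000 times, then NITERSM1 is 0xffff
   but NITERSM1 + 1 wraps around to 0; the checks below only return true
   when that kind of wrap-around provably cannot happen.  */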
8046
8047 static bool
8048 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8049 {
8050 /* Constant case. */
8051 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8052 {
8053 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8054 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8055
8056 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8057 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8058 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8059 return true;
8060 }
8061
8062 widest_int max;
8063 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8064 /* Check the upper bound of loop niters. */
8065 if (get_max_loop_iterations (loop, &max))
8066 {
8067 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8068 signop sgn = TYPE_SIGN (type);
8069 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8070 if (max < type_max)
8071 return true;
8072 }
8073 return false;
8074 }
8075
8076 /* Return a mask type with half the number of elements as TYPE. */
8077
8078 tree
8079 vect_halve_mask_nunits (tree type)
8080 {
8081 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8082 return build_truth_vector_type (nunits, current_vector_size);
8083 }
8084
8085 /* Return a mask type with twice as many elements as TYPE. */
8086
8087 tree
8088 vect_double_mask_nunits (tree type)
8089 {
8090 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8091 return build_truth_vector_type (nunits, current_vector_size);
8092 }
8093
8094 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8095 contain a sequence of NVECTORS masks that each control a vector of type
8096 VECTYPE. */
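/* A hedged worked example: for a loop with vectorization factor 16 where a
   statement operates on 8-element vectors and needs two copies, the caller
   passes NVECTORS = 2 and the computation below gives
   nscalars_per_iter = 2 * 8 / 16 = 1; an rgroup whose statements access two
   scalars per iteration (say NVECTORS = 4 with the same vector type)
   records max_nscalars_per_iter = 2 instead.  */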
8097
8098 void
8099 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8100 unsigned int nvectors, tree vectype)
8101 {
8102 gcc_assert (nvectors != 0);
8103 if (masks->length () < nvectors)
8104 masks->safe_grow_cleared (nvectors);
8105 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8106 /* The number of scalars per iteration and the number of vectors are
8107 both compile-time constants. */
8108 unsigned int nscalars_per_iter
8109 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8110 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8111 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8112 {
8113 rgm->max_nscalars_per_iter = nscalars_per_iter;
8114 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8115 }
8116 }
8117
8118 /* Given a complete set of masks MASKS, extract mask number INDEX
8119 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8120 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8121
8122 See the comment above vec_loop_masks for more details about the mask
8123 arrangement. */
8124
8125 tree
8126 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8127 unsigned int nvectors, tree vectype, unsigned int index)
8128 {
8129 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8130 tree mask_type = rgm->mask_type;
8131
8132 /* Populate the rgroup's mask array, if this is the first time we've
8133 used it. */
8134 if (rgm->masks.is_empty ())
8135 {
8136 rgm->masks.safe_grow_cleared (nvectors);
8137 for (unsigned int i = 0; i < nvectors; ++i)
8138 {
8139 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8140 /* Provide a dummy definition until the real one is available. */
8141 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8142 rgm->masks[i] = mask;
8143 }
8144 }
8145
8146 tree mask = rgm->masks[index];
8147 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8148 TYPE_VECTOR_SUBPARTS (vectype)))
8149 {
8150 /* A loop mask for data type X can be reused for data type Y
8151 if X has N times more elements than Y and if Y's elements
8152 are N times bigger than X's. In this case each sequence
8153 of N elements in the loop mask will be all-zero or all-one.
8154 We can then view-convert the mask so that each sequence of
8155 N elements is replaced by a single element. */
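      /* A hedged illustration: e.g. a mask created for a 16 x QI vector can
	 control an 8 x HI vector in the same loop; every pair of QI mask
	 elements is known to be all-zero or all-one, so the VIEW_CONVERT
	 below just reinterprets each such pair as one HI mask element.  */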
8156 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8157 TYPE_VECTOR_SUBPARTS (vectype)));
8158 gimple_seq seq = NULL;
8159 mask_type = build_same_sized_truth_vector_type (vectype);
8160 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8161 if (seq)
8162 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8163 }
8164 return mask;
8165 }
8166
8167 /* Scale profiling counters by estimation for LOOP which is vectorized
8168 by factor VF. */
8169
8170 static void
8171 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8172 {
8173 edge preheader = loop_preheader_edge (loop);
8174 /* Reduce loop iterations by the vectorization factor. */
8175 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8176 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8177
8178 if (freq_h.nonzero_p ())
8179 {
8180 profile_probability p;
8181
8182 /* Avoid dropping loop body profile counter to 0 because of zero count
8183 in loop's preheader. */
8184 if (!(freq_e == profile_count::zero ()))
8185 freq_e = freq_e.force_nonzero ();
8186 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8187 scale_loop_frequencies (loop, p);
8188 }
8189
8190 edge exit_e = single_exit (loop);
8191 exit_e->probability = profile_probability::always ()
8192 .apply_scale (1, new_est_niter + 1);
8193
8194 edge exit_l = single_pred_edge (loop->latch);
8195 profile_probability prob = exit_l->probability;
8196 exit_l->probability = exit_e->probability.invert ();
8197 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8198 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8199 }
8200
8201 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8202 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8203 stmt_vec_info. */
8204
8205 static void
8206 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8207 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8208 {
8209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8210 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8211
8212 if (dump_enabled_p ())
8213 {
8214 dump_printf_loc (MSG_NOTE, vect_location,
8215 "------>vectorizing statement: ");
8216 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
8217 }
8218
8219 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8220 vect_loop_kill_debug_uses (loop, stmt_info);
8221
8222 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8223 && !STMT_VINFO_LIVE_P (stmt_info))
8224 return;
8225
8226 if (STMT_VINFO_VECTYPE (stmt_info))
8227 {
8228 poly_uint64 nunits
8229 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8230 if (!STMT_SLP_TYPE (stmt_info)
8231 && maybe_ne (nunits, vf)
8232 && dump_enabled_p ())
8233 /* For SLP, VF is set according to the unrolling factor, and not
8234 to the vector size, hence this print is not valid for SLP. */
8235 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8236 }
8237
8238 /* Pure SLP statements have already been vectorized. We still need
8239 to apply loop vectorization to hybrid SLP statements. */
8240 if (PURE_SLP_STMT (stmt_info))
8241 return;
8242
8243 if (dump_enabled_p ())
8244 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8245
8246 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8247 *seen_store = stmt_info;
8248 }
8249
8250 /* Function vect_transform_loop.
8251
8252 The analysis phase has determined that the loop is vectorizable.
8253 Vectorize the loop: create vectorized stmts to replace the scalar
8254 stmts in the loop, and update the loop exit condition.
8255 Returns the scalar epilogue loop, if any.
8256
8257 struct loop *
8258 vect_transform_loop (loop_vec_info loop_vinfo)
8259 {
8260 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8261 struct loop *epilogue = NULL;
8262 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8263 int nbbs = loop->num_nodes;
8264 int i;
8265 tree niters_vector = NULL_TREE;
8266 tree step_vector = NULL_TREE;
8267 tree niters_vector_mult_vf = NULL_TREE;
8268 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8269 unsigned int lowest_vf = constant_lower_bound (vf);
8270 gimple *stmt;
8271 bool check_profitability = false;
8272 unsigned int th;
8273
8274 DUMP_VECT_SCOPE ("vec_transform_loop");
8275
8276 loop_vinfo->shared->check_datarefs ();
8277
8278 /* Use the more conservative vectorization threshold. If the number
8279 of iterations is constant, assume the cost check has been performed
8280 by our caller. If the threshold makes all loops profitable that
8281 run at least the (estimated) vectorization factor number of times,
8282 checking is pointless, too. */
8283 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8284 if (th >= vect_vf_for_cost (loop_vinfo)
8285 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8286 {
8287 if (dump_enabled_p ())
8288 dump_printf_loc (MSG_NOTE, vect_location,
8289 "Profitability threshold is %d loop iterations.\n",
8290 th);
8291 check_profitability = true;
8292 }
8293
8294 /* Make sure there exists a single-predecessor exit bb. Do this before
8295 versioning. */
8296 edge e = single_exit (loop);
8297 if (! single_pred_p (e->dest))
8298 {
8299 split_loop_exit_edge (e);
8300 if (dump_enabled_p ())
8301 dump_printf (MSG_NOTE, "split exit edge\n");
8302 }
8303
8304 /* Version the loop first, if required, so the profitability check
8305 comes first. */
8306
8307 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8308 {
8309 poly_uint64 versioning_threshold
8310 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8311 if (check_profitability
8312 && ordered_p (poly_uint64 (th), versioning_threshold))
8313 {
8314 versioning_threshold = ordered_max (poly_uint64 (th),
8315 versioning_threshold);
8316 check_profitability = false;
8317 }
8318 vect_loop_versioning (loop_vinfo, th, check_profitability,
8319 versioning_threshold);
8320 check_profitability = false;
8321 }
8322
8323 /* Make sure there exists a single-predecessor exit bb also on the
8324 scalar loop copy. Do this after versioning but before peeling
8325 so the CFG structure is fine for both the scalar and the if-converted
8326 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8327 loop-closed PHI nodes on the exit. */
8328 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8329 {
8330 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8331 if (! single_pred_p (e->dest))
8332 {
8333 split_loop_exit_edge (e);
8334 if (dump_enabled_p ())
8335 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8336 }
8337 }
8338
8339 tree niters = vect_build_loop_niters (loop_vinfo);
8340 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8341 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8342 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8343 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8344 &step_vector, &niters_vector_mult_vf, th,
8345 check_profitability, niters_no_overflow);
8346
8347 if (niters_vector == NULL_TREE)
8348 {
8349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8350 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8351 && known_eq (lowest_vf, vf))
8352 {
8353 niters_vector
8354 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8355 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8356 step_vector = build_one_cst (TREE_TYPE (niters));
8357 }
8358 else
8359 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8360 &step_vector, niters_no_overflow);
8361 }
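/* As an illustration (and assuming no prologue peeling for alignment):
with a compile-time NITERS of 17 and VF == 4, the constant path above gives
NITERS_VECTOR == 17 / 4 == 4 with STEP_VECTOR == 1; the remaining scalar
iteration is left for the epilogue loop. */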
8362
8363 /* 1) Make sure the loop header has exactly two entries
8364 2) Make sure we have a preheader basic block. */
8365
8366 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8367
8368 split_edge (loop_preheader_edge (loop));
8369
8370 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8371 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8372 /* This will deal with any possible peeling. */
8373 vect_prepare_for_masked_peels (loop_vinfo);
8374
8375 /* Schedule the SLP instances first, then handle loop vectorization
8376 below. */
8377 if (!loop_vinfo->slp_instances.is_empty ())
8378 {
8379 DUMP_VECT_SCOPE ("scheduling SLP instances");
8380 vect_schedule_slp (loop_vinfo);
8381 }
8382
8383 /* FORNOW: the vectorizer supports only loops whose body consists
8384 of one basic block (header + empty latch). When the vectorizer
8385 supports more involved loop forms, the order in which the BBs are
8386 traversed needs to be reconsidered. */
8387
8388 for (i = 0; i < nbbs; i++)
8389 {
8390 basic_block bb = bbs[i];
8391 stmt_vec_info stmt_info;
8392
8393 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8394 gsi_next (&si))
8395 {
8396 gphi *phi = si.phi ();
8397 if (dump_enabled_p ())
8398 {
8399 dump_printf_loc (MSG_NOTE, vect_location,
8400 "------>vectorizing phi: ");
8401 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8402 }
8403 stmt_info = loop_vinfo->lookup_stmt (phi);
8404 if (!stmt_info)
8405 continue;
8406
8407 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8408 vect_loop_kill_debug_uses (loop, stmt_info);
8409
8410 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8411 && !STMT_VINFO_LIVE_P (stmt_info))
8412 continue;
8413
8414 if (STMT_VINFO_VECTYPE (stmt_info)
8415 && (maybe_ne
8416 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8417 && dump_enabled_p ())
8418 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8419
8420 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8421 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8422 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8423 && ! PURE_SLP_STMT (stmt_info))
8424 {
8425 if (dump_enabled_p ())
8426 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8427 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8428 }
8429 }
8430
8431 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8432 !gsi_end_p (si);)
8433 {
8434 stmt = gsi_stmt (si);
8435 /* During vectorization remove existing clobber stmts. */
8436 if (gimple_clobber_p (stmt))
8437 {
8438 unlink_stmt_vdef (stmt);
8439 gsi_remove (&si, true);
8440 release_defs (stmt);
8441 }
8442 else
8443 {
8444 stmt_info = loop_vinfo->lookup_stmt (stmt);
8445
8446 /* Vector stmts created in the outer-loop during vectorization of
8447 stmts in an inner-loop may not have a stmt_info and do not
8448 need to be vectorized. */
8449 stmt_vec_info seen_store = NULL;
8450 if (stmt_info)
8451 {
8452 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8453 {
8454 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8455 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8456 !gsi_end_p (subsi); gsi_next (&subsi))
8457 {
8458 stmt_vec_info pat_stmt_info
8459 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8460 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8461 &si, &seen_store);
8462 }
8463 stmt_vec_info pat_stmt_info
8464 = STMT_VINFO_RELATED_STMT (stmt_info);
8465 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8466 &seen_store);
8467 }
8468 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8469 &seen_store);
8470 }
8471 gsi_next (&si);
8472 if (seen_store)
8473 {
8474 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8475 /* Interleaving. The vectorization of the
8476 interleaving chain has been completed - free
8477 all the stores in the chain. */
8478 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8479 else
8480 /* Free the attached stmt_vec_info and remove the stmt. */
8481 loop_vinfo->remove_stmt (stmt_info);
8482 }
8483 }
8484 }
8485
8486 /* Stub out scalar statements that must not survive vectorization.
8487 Doing this here helps with grouped statements, or statements that
8488 are involved in patterns. */
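/* For instance (names purely illustrative), a scalar internal-function
call produced by if-conversion such as
_1 = .MASK_LOAD (ptr_2, 4B, mask_3);
has a non-vector LHS, so it is replaced below by the plain assignment
_1 = 0; */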
8489 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8490 !gsi_end_p (gsi); gsi_next (&gsi))
8491 {
8492 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8493 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8494 {
8495 tree lhs = gimple_get_lhs (call);
8496 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8497 {
8498 tree zero = build_zero_cst (TREE_TYPE (lhs));
8499 gimple *new_stmt = gimple_build_assign (lhs, zero);
8500 gsi_replace (&gsi, new_stmt, true);
8501 }
8502 }
8503 }
8504 } /* BBs in loop */
8505
8506 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8507 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8508 if (integer_onep (step_vector))
8509 niters_no_overflow = true;
8510 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8511 niters_vector_mult_vf, !niters_no_overflow);
8512
8513 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8514 scale_profile_for_vect_loop (loop, assumed_vf);
8515
8516 /* True if the final iteration might not handle a full vector's
8517 worth of scalar iterations. */
8518 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8519 /* The minimum number of iterations performed by the epilogue. This
8520 is 1 when peeling for gaps because we always need a final scalar
8521 iteration. */
8522 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8523 /* +1 to convert latch counts to loop iteration counts,
8524 -min_epilogue_iters to remove iterations that cannot be performed
8525 by the vector code. */
8526 int bias_for_lowest = 1 - min_epilogue_iters;
8527 int bias_for_assumed = bias_for_lowest;
8528 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8529 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8530 {
8531 /* When the amount of peeling is known at compile time, the first
8532 iteration will have exactly alignment_npeels active elements.
8533 In the worst case it will have at least one. */
8534 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8535 bias_for_lowest += lowest_vf - min_first_active;
8536 bias_for_assumed += assumed_vf - min_first_active;
8537 }
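/* E.g. (illustrative numbers): with compile-time peeling of 3 iterations
for alignment and lowest_vf == 8, the first masked iteration is known to
process exactly 3 elements, so bias_for_lowest grows by 8 - 3 == 5. */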
8538 /* In these calculations the "- 1" converts loop iteration counts
8539 back to latch counts. */
8540 if (loop->any_upper_bound)
8541 loop->nb_iterations_upper_bound
8542 = (final_iter_may_be_partial
8543 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8544 lowest_vf) - 1
8545 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8546 lowest_vf) - 1);
8547 if (loop->any_likely_upper_bound)
8548 loop->nb_iterations_likely_upper_bound
8549 = (final_iter_may_be_partial
8550 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8551 + bias_for_lowest, lowest_vf) - 1
8552 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8553 + bias_for_lowest, lowest_vf) - 1);
8554 if (loop->any_estimate)
8555 loop->nb_iterations_estimate
8556 = (final_iter_may_be_partial
8557 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8558 assumed_vf) - 1
8559 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8560 assumed_vf) - 1);
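/* Worked example with illustrative numbers: for a latch-count upper bound
of 9 (at most 10 scalar iterations), lowest_vf == 4, no peeling for gaps and
no full masking, bias_for_lowest == 1 and the new bound is
floor ((9 + 1) / 4) - 1 == 1, i.e. at most two vector iterations. */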
8561
8562 if (dump_enabled_p ())
8563 {
8564 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8565 {
8566 dump_printf_loc (MSG_NOTE, vect_location,
8567 "LOOP VECTORIZED\n");
8568 if (loop->inner)
8569 dump_printf_loc (MSG_NOTE, vect_location,
8570 "OUTER LOOP VECTORIZED\n");
8571 dump_printf (MSG_NOTE, "\n");
8572 }
8573 else
8574 {
8575 dump_printf_loc (MSG_NOTE, vect_location,
8576 "LOOP EPILOGUE VECTORIZED (VS=");
8577 dump_dec (MSG_NOTE, current_vector_size);
8578 dump_printf (MSG_NOTE, ")\n");
8579 }
8580 }
8581
8582 /* Free SLP instances here because otherwise stmt reference counting
8583 won't work. */
8584 slp_instance instance;
8585 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8586 vect_free_slp_instance (instance, true);
8587 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8588 /* Clear the safelen field since its value is invalid after vectorization:
8589 the vectorized loop can have loop-carried dependencies. */
8590 loop->safelen = 0;
8591
8592 /* Don't vectorize epilogue for epilogue. */
8593 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8594 epilogue = NULL;
8595
8596 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8597 epilogue = NULL;
8598
8599 if (epilogue)
8600 {
8601 auto_vector_sizes vector_sizes;
8602 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8603 unsigned int next_size = 0;
8604
8605 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8606 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8607 && known_eq (vf, lowest_vf))
8608 {
8609 unsigned int eiters
8610 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8611 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8612 eiters = eiters % lowest_vf;
8613 epilogue->nb_iterations_upper_bound = eiters - 1;
8614
8615 unsigned int ratio;
8616 while (next_size < vector_sizes.length ()
8617 && !(constant_multiple_p (current_vector_size,
8618 vector_sizes[next_size], &ratio)
8619 && eiters >= lowest_vf / ratio))
8620 next_size += 1;
8621 }
8622 else
8623 while (next_size < vector_sizes.length ()
8624 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8625 next_size += 1;
8626
8627 if (next_size == vector_sizes.length ())
8628 epilogue = NULL;
8629 }
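/* For instance, on a target whose autovectorize_vector_sizes hook offers
64-, 32- and 16-byte vectors (an assumption for illustration), a main loop
that used 32-byte vectors with lowest_vf == 8 and eiters == 5 skips the 64-
and 32-byte candidates and settles on the 16-byte size, whose vectorization
factor of 8 / 2 == 4 still fits within the 5 remaining iterations. */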
8630
8631 if (epilogue)
8632 {
8633 epilogue->force_vectorize = loop->force_vectorize;
8634 epilogue->safelen = loop->safelen;
8635 epilogue->dont_vectorize = false;
8636
8637 /* We may need to if-convert the epilogue in order to vectorize it. */
8638 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8639 tree_if_conversion (epilogue);
8640 }
8641
8642 return epilogue;
8643 }
8644
8645 /* The code below performs a simple optimization: it reverts
8646 if-conversion for masked stores, i.e. if the mask of a store is zero,
8647 the store and, where possible, the producers of the stored values are skipped.
8648 For example,
8649 for (i=0; i<n; i++)
8650 if (c[i])
8651 {
8652 p1[i] += 1;
8653 p2[i] = p3[i] +2;
8654 }
8655 this transformation will produce the following semi-hammock:
8656
8657 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8658 {
8659 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8660 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8661 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8662 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8663 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8664 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8665 }
8666 */
8667
8668 void
8669 optimize_mask_stores (struct loop *loop)
8670 {
8671 basic_block *bbs = get_loop_body (loop);
8672 unsigned nbbs = loop->num_nodes;
8673 unsigned i;
8674 basic_block bb;
8675 struct loop *bb_loop;
8676 gimple_stmt_iterator gsi;
8677 gimple *stmt;
8678 auto_vec<gimple *> worklist;
8679
8680 vect_location = find_loop_location (loop);
8681 /* Pick up all masked stores in the loop, if any. */
8682 for (i = 0; i < nbbs; i++)
8683 {
8684 bb = bbs[i];
8685 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8686 gsi_next (&gsi))
8687 {
8688 stmt = gsi_stmt (gsi);
8689 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8690 worklist.safe_push (stmt);
8691 }
8692 }
8693
8694 free (bbs);
8695 if (worklist.is_empty ())
8696 return;
8697
8698 /* Loop has masked stores. */
8699 while (!worklist.is_empty ())
8700 {
8701 gimple *last, *last_store;
8702 edge e, efalse;
8703 tree mask;
8704 basic_block store_bb, join_bb;
8705 gimple_stmt_iterator gsi_to;
8706 tree vdef, new_vdef;
8707 gphi *phi;
8708 tree vectype;
8709 tree zero;
8710
8711 last = worklist.pop ();
8712 mask = gimple_call_arg (last, 2);
8713 bb = gimple_bb (last);
8714 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8715 to the same loop as if_bb. That loop can differ from LOOP when a
8716 two-level loop nest is vectorized and the mask_store belongs to the
8717 inner one. */
8718 e = split_block (bb, last);
8719 bb_loop = bb->loop_father;
8720 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8721 join_bb = e->dest;
8722 store_bb = create_empty_bb (bb);
8723 add_bb_to_loop (store_bb, bb_loop);
8724 e->flags = EDGE_TRUE_VALUE;
8725 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8726 /* Put STORE_BB in the likely part. */
8727 efalse->probability = profile_probability::unlikely ();
8728 store_bb->count = efalse->count ();
8729 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8730 if (dom_info_available_p (CDI_DOMINATORS))
8731 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8732 if (dump_enabled_p ())
8733 dump_printf_loc (MSG_NOTE, vect_location,
8734 "Create new block %d to sink mask stores.",
8735 store_bb->index);
8736 /* Create vector comparison with boolean result. */
8737 vectype = TREE_TYPE (mask);
8738 zero = build_zero_cst (vectype);
8739 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8740 gsi = gsi_last_bb (bb);
8741 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
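/* At this point the CFG has been reshaped as follows (a sketch of the
result): BB ends in "if (mask == { 0, ... })"; its TRUE edge goes straight
to JOIN_BB, its FALSE edge goes to the new STORE_BB, and STORE_BB falls
through to JOIN_BB. The masked stores are sunk into STORE_BB below. */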
8742 /* Create new PHI node for vdef of the last masked store:
8743 .MEM_2 = VDEF <.MEM_1>
8744 will be converted to
8745 .MEM.3 = VDEF <.MEM_1>
8746 and new PHI node will be created in join bb
8747 .MEM_2 = PHI <.MEM_1, .MEM_3>
8748 */
8749 vdef = gimple_vdef (last);
8750 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8751 gimple_set_vdef (last, new_vdef);
8752 phi = create_phi_node (vdef, join_bb);
8753 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8754
8755 /* Put all masked stores with the same mask into STORE_BB if possible. */
8756 while (true)
8757 {
8758 gimple_stmt_iterator gsi_from;
8759 gimple *stmt1 = NULL;
8760
8761 /* Move masked store to STORE_BB. */
8762 last_store = last;
8763 gsi = gsi_for_stmt (last);
8764 gsi_from = gsi;
8765 /* Shift GSI to the previous stmt for further traversal. */
8766 gsi_prev (&gsi);
8767 gsi_to = gsi_start_bb (store_bb);
8768 gsi_move_before (&gsi_from, &gsi_to);
8769 /* Set GSI_TO to the start of the now non-empty block. */
8770 gsi_to = gsi_start_bb (store_bb);
8771 if (dump_enabled_p ())
8772 {
8773 dump_printf_loc (MSG_NOTE, vect_location,
8774 "Move stmt to created bb\n");
8775 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8776 }
8777 /* Move all stored value producers if possible. */
8778 while (!gsi_end_p (gsi))
8779 {
8780 tree lhs;
8781 imm_use_iterator imm_iter;
8782 use_operand_p use_p;
8783 bool res;
8784
8785 /* Skip debug statements. */
8786 if (is_gimple_debug (gsi_stmt (gsi)))
8787 {
8788 gsi_prev (&gsi);
8789 continue;
8790 }
8791 stmt1 = gsi_stmt (gsi);
8792 /* Do not consider statements writing to memory or having a
8793 volatile operand. */
8794 if (gimple_vdef (stmt1)
8795 || gimple_has_volatile_ops (stmt1))
8796 break;
8797 gsi_from = gsi;
8798 gsi_prev (&gsi);
8799 lhs = gimple_get_lhs (stmt1);
8800 if (!lhs)
8801 break;
8802
8803 /* LHS of vectorized stmt must be SSA_NAME. */
8804 if (TREE_CODE (lhs) != SSA_NAME)
8805 break;
8806
8807 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8808 {
8809 /* Remove dead scalar statement. */
8810 if (has_zero_uses (lhs))
8811 {
8812 gsi_remove (&gsi_from, true);
8813 continue;
8814 }
8815 }
8816
8817 /* Check that LHS does not have uses outside of STORE_BB. */
8818 res = true;
8819 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8820 {
8821 gimple *use_stmt;
8822 use_stmt = USE_STMT (use_p);
8823 if (is_gimple_debug (use_stmt))
8824 continue;
8825 if (gimple_bb (use_stmt) != store_bb)
8826 {
8827 res = false;
8828 break;
8829 }
8830 }
8831 if (!res)
8832 break;
8833
8834 if (gimple_vuse (stmt1)
8835 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8836 break;
8837
8838 /* Can move STMT1 to STORE_BB. */
8839 if (dump_enabled_p ())
8840 {
8841 dump_printf_loc (MSG_NOTE, vect_location,
8842 "Move stmt to created bb\n");
8843 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8844 }
8845 gsi_move_before (&gsi_from, &gsi_to);
8846 /* Shift GSI_TO for further insertion. */
8847 gsi_prev (&gsi_to);
8848 }
8849 /* Put other masked stores with the same mask into STORE_BB. */
8850 if (worklist.is_empty ()
8851 || gimple_call_arg (worklist.last (), 2) != mask
8852 || worklist.last () != stmt1)
8853 break;
8854 last = worklist.pop ();
8855 }
8856 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8857 }
8858 }