1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
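
/* Illustrative sketch (not part of the original sources): the optab query
   described above, written out explicitly.  The operation and mode are
   assumed for the sake of the example.

     optab op = add_optab;
     machine_mode vmode = V8HImode;
     if (optab_handler (op, vmode) == CODE_FOR_nothing)
       return false;      (no target support -> the stmt is not vectorized)
*/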
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
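
/* Illustrative addition (not in the original comment): when N is not a
   multiple of VF, the strip-mined loop above is followed by a scalar
   epilogue that handles the remaining N % VF iterations, conceptually:

     for (i = 0; i + VF <= N; i += VF)
       a[i:VF] = b[i:VF] + c[i:VF];
     for (; i < N; i++)
       a[i] = b[i] + c[i];
*/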
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
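
/* Illustrative example (an assumption of this note, not taken from the
   sources): for a loop counter incremented by 4 each iteration, scev
   reports the access function {init, +, 4}_loop, so *INIT is init and
   *STEP is 4.  A second-degree chrec such as {0, +, {1, +, 1}_1}_1 has a
   chrec as its evolution part and is rejected by the tree_is_chrec check
   below.  */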
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
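
/* Source-level shape of such a double reduction (illustrative sketch,
   not part of the original comment):

     for (i = 0; i < n; i++)        outer loop (outer1/outer2 above)
       for (j = 0; j < m; j++)      inner loop
         x += a[i][j];

   Here x_1 is the outer-loop PHI and x_2 is the inner-loop PHI that this
   function detects.  */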
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
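
/* Example3 (illustrative addition, not in the original comment): a nested
   cycle, which only appears when the outer loop of a nest is analyzed:

     loop3:
     for (i=0; i<N; i++)
       {
         s = a[i];
         for (j=0; j<M; j++)
           s += b[j];
         c[i] = s;
       }

   When vectorizing the outer loop, the inner-loop PHI for s is classified
   as vect_nested_cycle by vect_analyze_scalar_cycles_1.  */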
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
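
/* Worked example (illustrative, not from the sources): for a loop whose
   body runs N times, N > 0, the latch executes N - 1 times, so
   NUMBER_OF_ITERATIONSM1 is N - 1 and NUMBER_OF_ITERATIONS is N; the
   latter is obtained below by adding 1 to the latch count.  */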
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
739
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
743
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
746
747 if (may_be_zero)
748 {
749 if (COMPARISON_CLASS_P (may_be_zero))
750 {
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
763
764 may_be_zero = NULL_TREE;
765 }
766 else if (integer_nonzerop (may_be_zero))
767 {
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
771 }
772 else
773 return cond;
774 }
775
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
778
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
787
788 return cond;
789 }
790
791 /* Function bb_in_loop_p
792
793 Used as predicate for dfs order traversal of the loop bbs. */
794
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
802 }
803
804
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
807
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 }
866 }
867 }
868
869 /* Free all levels of MASKS. */
870
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
873 {
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
879 }
880
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
883
884 _loop_vec_info::~_loop_vec_info ()
885 {
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
889
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
892 {
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
895 {
896 gimple *stmt = gsi_stmt (si);
897
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
901 {
902 enum tree_code code = gimple_assign_rhs_code (stmt);
903
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
913 {
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
916
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
918 {
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
924 {
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
929 }
930 }
931 }
932 }
933 gsi_next (&si);
934 }
935 }
936
937 free (bbs);
938
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
941
942 loop->aux = NULL;
943 }
944
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
947
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
950 {
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
954
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
959 {
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
964 {
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
967 }
968 }
969 return cached;
970 }
971
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
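
/* Illustrative semantics (assumed here for exposition): the WHILE_ULT
   internal function produces a mask whose lane K is set iff START + K < END,
   so e.g. a start of 6 and an end of 8 on a 4-lane mask give {1, 1, 0, 0}.
   The check below only asks whether the target supports this for CMP_TYPE
   and each rgroup's mask type.  */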
974
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
977 {
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
987 }
988
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
991
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
994 {
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1001 }
1002
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
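
/* Worked example (numbers assumed for illustration): with at most 1000
   iterations and an rgroup needing 2 mask bits per scalar iteration,
   max_ni becomes 2000, which needs 11 bits.  The loop below then searches
   the integer modes of at least 11 bits for a type that can produce all
   required masks via WHILE_ULT, continuing up to Pmode when possible since
   wider comparisons are more reusable in address calculations.  */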
1006
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1009 {
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1012
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1018
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1023
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1031
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1034
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1039 {
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1043 {
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1047 {
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1055 }
1056 }
1057 }
1058
1059 if (!cmp_type)
1060 return false;
1061
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1064 }
1065
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1069 {
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1074
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1076
1077 /* Gather costs for statements in the scalar loop. */
1078
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1083
1084 for (i = 0; i < nbbs; i++)
1085 {
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1088
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1093
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 {
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1098
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1101
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 if (stmt_info
1104 && !STMT_VINFO_RELEVANT_P (stmt_info)
1105 && (!STMT_VINFO_LIVE_P (stmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1108 continue;
1109
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1112 {
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1117 }
1118 else
1119 kind = scalar_stmt;
1120
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1123 }
1124 }
1125
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1139 }
1140
1141
1142 /* Function vect_analyze_loop_form_1.
1143
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions. */
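
/* Examples (illustrative, not from the original comment): a counted loop
   such as  for (i = 0; i < n; i++)  is countable; a pointer-chasing loop
   such as  while (p) p = p->next;  has no computable iteration count and
   fails the last restriction.  */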
1150
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1155 {
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1161
1162 if (!loop->inner)
1163 {
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1167
1168 (pre-header)
1169 |
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1173 |
1174 (exit-bb) */
1175
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1180
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1184 }
1185 else
1186 {
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1189
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1193
1194 (pre-header)
1195 |
1196 header <---+
1197 | |
1198 inner-loop |
1199 | |
1200 tail ------+
1201 |
1202 (exit-bb)
1203
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1206
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1211
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1216
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1224
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1237 }
1238
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1244
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1249
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1253 }
1254
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1262
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1271
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1278
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1285
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1292
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1297
1298 return opt_result::success ();
1299 }
1300
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1302
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1305 {
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1308
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1315
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1321 {
1322 /* We consider to vectorize this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1332 }
1333
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1335 {
1336 if (dump_enabled_p ())
1337 {
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1342 }
1343 }
1344
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1348 {
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1352 }
1353
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1357 }
1358
1359
1360
1361 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1362 statements update the vectorization factor. */
1363
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1366 {
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1372
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1374
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1377
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1379 vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say, that we
1381 perform pure SLP on loop - cross iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1385 {
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1389 {
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1397 }
1398 }
1399
1400 if (only_slp_in_loop)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 }
1407 else
1408 {
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1418 }
1419
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
1427 }
1428 }
1429
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1433
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1436 ...
1437
1438 inner:
1439 x_2 = ...;
1440 ...
1441
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1444
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1446
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1449 {
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1452
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 }
1455
1456 /* Function vect_analyze_loop_operations.
1457
1458 Scan the loop stmts and make sure they are all vectorizable. */
1459
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1462 {
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1470
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1472
1473 stmt_vector_for_cost cost_vec;
1474 cost_vec.create (2);
1475
1476 for (i = 0; i < nbbs; i++)
1477 {
1478 basic_block bb = bbs[i];
1479
1480 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1481 gsi_next (&si))
1482 {
1483 gphi *phi = si.phi ();
1484 ok = true;
1485
1486 stmt_info = loop_vinfo->lookup_stmt (phi);
1487 if (dump_enabled_p ())
1488 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1489 if (virtual_operand_p (gimple_phi_result (phi)))
1490 continue;
1491
1492 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1493 (i.e., a phi in the tail of the outer-loop). */
1494 if (! is_loop_header_bb_p (bb))
1495 {
1496 /* FORNOW: we currently don't support the case that these phis
1497 are not used in the outerloop (unless it is double reduction,
1498 i.e., this phi is vect_reduction_def), cause this case
1499 requires to actually do something here. */
1500 if (STMT_VINFO_LIVE_P (stmt_info)
1501 && !vect_active_double_reduction_p (stmt_info))
1502 return opt_result::failure_at (phi,
1503 "Unsupported loop-closed phi"
1504 " in outer-loop.\n");
1505
1506 /* If PHI is used in the outer loop, we check that its operand
1507 is defined in the inner loop. */
1508 if (STMT_VINFO_RELEVANT_P (stmt_info))
1509 {
1510 tree phi_op;
1511
1512 if (gimple_phi_num_args (phi) != 1)
1513 return opt_result::failure_at (phi, "unsupported phi");
1514
1515 phi_op = PHI_ARG_DEF (phi, 0);
1516 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1517 if (!op_def_info)
1518 return opt_result::failure_at (phi, "unsupported phi");
1519
1520 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1521 && (STMT_VINFO_RELEVANT (op_def_info)
1522 != vect_used_in_outer_by_reduction))
1523 return opt_result::failure_at (phi, "unsupported phi");
1524 }
1525
1526 continue;
1527 }
1528
1529 gcc_assert (stmt_info);
1530
1531 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1532 || STMT_VINFO_LIVE_P (stmt_info))
1533 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1534 /* A scalar-dependence cycle that we don't support. */
1535 return opt_result::failure_at (phi,
1536 "not vectorized:"
1537 " scalar dependence cycle.\n");
1538
1539 if (STMT_VINFO_RELEVANT_P (stmt_info))
1540 {
1541 need_to_vectorize = true;
1542 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1543 && ! PURE_SLP_STMT (stmt_info))
1544 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1545 &cost_vec);
1546 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1547 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1548 && ! PURE_SLP_STMT (stmt_info))
1549 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1550 &cost_vec);
1551 }
1552
1553 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1554 if (ok
1555 && STMT_VINFO_LIVE_P (stmt_info)
1556 && !PURE_SLP_STMT (stmt_info))
1557 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1558 &cost_vec);
1559
1560 if (!ok)
1561 return opt_result::failure_at (phi,
1562 "not vectorized: relevant phi not "
1563 "supported: %G",
1564 static_cast <gimple *> (phi));
1565 }
1566
1567 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1568 gsi_next (&si))
1569 {
1570 gimple *stmt = gsi_stmt (si);
1571 if (!gimple_clobber_p (stmt))
1572 {
1573 opt_result res
1574 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1575 &need_to_vectorize,
1576 NULL, NULL, &cost_vec);
1577 if (!res)
1578 return res;
1579 }
1580 }
1581 } /* bbs */
1582
1583 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1584 cost_vec.release ();
1585
1586 /* All operations in the loop are either irrelevant (deal with loop
1587 control, or dead), or only used outside the loop and can be moved
1588 out of the loop (e.g. invariants, inductions). The loop can be
1589 optimized away by scalar optimizations. We're better off not
1590 touching this loop. */
1591 if (!need_to_vectorize)
1592 {
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_NOTE, vect_location,
1595 "All the computation can be taken out of the loop.\n");
1596 return opt_result::failure_at
1597 (vect_location,
1598 "not vectorized: redundant loop. no profit to vectorize.\n");
1599 }
1600
1601 return opt_result::success ();
1602 }
1603
1604 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1605 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1606 definitely no, or -1 if it's worth retrying. */
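
/* Worked example (all numbers assumed): if the cost model reports
   min_profitable_iters = 12 and PARAM_MIN_VECT_LOOP_BOUND is 0 with an
   assumed VF of 4, the threshold below becomes MAX (0 * 4, 12) = 12, so a
   loop known to iterate fewer than 12 times is not vectorized.  */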
1607
1608 static int
1609 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1610 {
1611 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1612 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1613
1614 /* Only fully-masked loops can have iteration counts less than the
1615 vectorization factor. */
1616 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1617 {
1618 HOST_WIDE_INT max_niter;
1619
1620 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1621 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1622 else
1623 max_niter = max_stmt_executions_int (loop);
1624
1625 if (max_niter != -1
1626 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1627 {
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "not vectorized: iteration count smaller than "
1631 "vectorization factor.\n");
1632 return 0;
1633 }
1634 }
1635
1636 int min_profitable_iters, min_profitable_estimate;
1637 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1638 &min_profitable_estimate);
1639
1640 if (min_profitable_iters < 0)
1641 {
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: vectorization not profitable.\n");
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "not vectorized: vector version will never be "
1648 "profitable.\n");
1649 return -1;
1650 }
1651
1652 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1653 * assumed_vf);
1654
1655 /* Use the cost model only if it is more conservative than user specified
1656 threshold. */
1657 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1658 min_profitable_iters);
1659
1660 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1661
1662 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1663 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1664 {
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "not vectorized: vectorization not profitable.\n");
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "not vectorized: iteration count smaller than user "
1671 "specified loop bound parameter or minimum profitable "
1672 "iterations (whichever is more conservative).\n");
1673 return 0;
1674 }
1675
1676 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1677 if (estimated_niter == -1)
1678 estimated_niter = likely_max_stmt_executions_int (loop);
1679 if (estimated_niter != -1
1680 && ((unsigned HOST_WIDE_INT) estimated_niter
1681 < MAX (th, (unsigned) min_profitable_estimate)))
1682 {
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685 "not vectorized: estimated iteration count too "
1686 "small.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "not vectorized: estimated iteration count smaller "
1690 "than specified loop bound parameter or minimum "
1691 "profitable iterations (whichever is more "
1692 "conservative).\n");
1693 return -1;
1694 }
1695
1696 return 1;
1697 }
1698
1699 static opt_result
1700 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1701 vec<data_reference_p> *datarefs,
1702 unsigned int *n_stmts)
1703 {
1704 *n_stmts = 0;
1705 for (unsigned i = 0; i < loop->num_nodes; i++)
1706 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1707 !gsi_end_p (gsi); gsi_next (&gsi))
1708 {
1709 gimple *stmt = gsi_stmt (gsi);
1710 if (is_gimple_debug (stmt))
1711 continue;
1712 ++(*n_stmts);
1713 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1714 if (!res)
1715 {
1716 if (is_gimple_call (stmt) && loop->safelen)
1717 {
1718 tree fndecl = gimple_call_fndecl (stmt), op;
1719 if (fndecl != NULL_TREE)
1720 {
1721 cgraph_node *node = cgraph_node::get (fndecl);
1722 if (node != NULL && node->simd_clones != NULL)
1723 {
1724 unsigned int j, n = gimple_call_num_args (stmt);
1725 for (j = 0; j < n; j++)
1726 {
1727 op = gimple_call_arg (stmt, j);
1728 if (DECL_P (op)
1729 || (REFERENCE_CLASS_P (op)
1730 && get_base_address (op)))
1731 break;
1732 }
1733 op = gimple_call_lhs (stmt);
1734 /* Ignore #pragma omp declare simd functions
1735 if they don't have data references in the
1736 call stmt itself. */
1737 if (j == n
1738 && !(op
1739 && (DECL_P (op)
1740 || (REFERENCE_CLASS_P (op)
1741 && get_base_address (op)))))
1742 continue;
1743 }
1744 }
1745 }
1746 return res;
1747 }
1748 /* If dependence analysis will give up due to the limit on the
1749 number of datarefs, stop here and fail fatally. */
1750 if (datarefs->length ()
1751 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1752 return opt_result::failure_at (stmt, "exceeded param "
1753 "loop-max-datarefs-for-datadeps\n");
1754 }
1755 return opt_result::success ();
1756 }
1757
1758 /* Function vect_analyze_loop_2.
1759
1760 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1761 for it. The different analyses will record information in the
1762 loop_vec_info struct. */
1763 static opt_result
1764 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1765 {
1766 opt_result ok = opt_result::success ();
1767 int res;
1768 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1769 poly_uint64 min_vf = 2;
1770
1771 /* The first group of checks is independent of the vector size. */
1772 fatal = true;
1773
1774 /* Find all data references in the loop (which correspond to vdefs/vuses)
1775 and analyze their evolution in the loop. */
1776
1777 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1778
1779 /* Gather the data references and count stmts in the loop. */
1780 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1781 {
1782 opt_result res
1783 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1784 &LOOP_VINFO_DATAREFS (loop_vinfo),
1785 n_stmts);
1786 if (!res)
1787 {
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790 "not vectorized: loop contains function "
1791 "calls or data references that cannot "
1792 "be analyzed\n");
1793 return res;
1794 }
1795 loop_vinfo->shared->save_datarefs ();
1796 }
1797 else
1798 loop_vinfo->shared->check_datarefs ();
1799
1800 /* Analyze the data references and also adjust the minimal
1801 vectorization factor according to the loads and stores. */
1802
1803 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1804 if (!ok)
1805 {
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 "bad data references.\n");
1809 return ok;
1810 }
1811
1812 /* Classify all cross-iteration scalar data-flow cycles.
1813 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1814 vect_analyze_scalar_cycles (loop_vinfo);
1815
1816 vect_pattern_recog (loop_vinfo);
1817
1818 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1819
1820 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1821 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1822
1823 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1824 if (!ok)
1825 {
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "bad data access.\n");
1829 return ok;
1830 }
1831
1832 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1833
1834 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1835 if (!ok)
1836 {
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "unexpected pattern.\n");
1840 return ok;
1841 }
1842
1843 /* In contrast, the rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
1844 fatal = false;
1845
1846 /* Analyze data dependences between the data-refs in the loop
1847 and adjust the maximum vectorization factor according to
1848 the dependences.
1849 FORNOW: fail at the first data dependence that we encounter. */
1850
1851 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1852 if (!ok)
1853 {
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "bad data dependence.\n");
1857 return ok;
1858 }
1859 if (max_vf != MAX_VECTORIZATION_FACTOR
1860 && maybe_lt (max_vf, min_vf))
1861 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1862 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1863
1864 ok = vect_determine_vectorization_factor (loop_vinfo);
1865 if (!ok)
1866 {
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869 "can't determine vectorization factor.\n");
1870 return ok;
1871 }
1872 if (max_vf != MAX_VECTORIZATION_FACTOR
1873 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1874 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1875
1876 /* Compute the scalar iteration cost. */
1877 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1878
1879 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1880 unsigned th;
1881
1882 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1883 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1884 if (!ok)
1885 return ok;
1886
1887 /* If there are any SLP instances mark them as pure_slp. */
1888 bool slp = vect_make_slp_decision (loop_vinfo);
1889 if (slp)
1890 {
1891 /* Find stmts that need to be both vectorized and SLPed. */
1892 vect_detect_hybrid_slp (loop_vinfo);
1893
1894 /* Update the vectorization factor based on the SLP decision. */
1895 vect_update_vf_for_slp (loop_vinfo);
1896 }
1897
1898 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1899
1900 /* We don't expect to have to roll back to anything other than an empty
1901 set of rgroups. */
1902 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1903
1904 /* This is the point where we can re-start analysis with SLP forced off. */
1905 start_over:
1906
1907 /* Now the vectorization factor is final. */
1908 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1909 gcc_assert (known_ne (vectorization_factor, 0U));
1910
1911 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1912 {
1913 dump_printf_loc (MSG_NOTE, vect_location,
1914 "vectorization_factor = ");
1915 dump_dec (MSG_NOTE, vectorization_factor);
1916 dump_printf (MSG_NOTE, ", niters = %wd\n",
1917 LOOP_VINFO_INT_NITERS (loop_vinfo));
1918 }
1919
1920 HOST_WIDE_INT max_niter
1921 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1922
1923 /* Analyze the alignment of the data-refs in the loop.
1924 Fail if a data reference is found that cannot be vectorized. */
1925
1926 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1927 if (!ok)
1928 {
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "bad data alignment.\n");
1932 return ok;
1933 }
1934
1935 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1936 It is important to call pruning after vect_analyze_data_ref_accesses,
1937 since we use grouping information gathered by interleaving analysis. */
1938 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1939 if (!ok)
1940 return ok;
1941
1942 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1943 vectorization, since we do not want to add extra peeling or
1944 add versioning for alignment. */
1945 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946 /* This pass will decide on using loop versioning and/or loop peeling in
1947 order to enhance the alignment of data references in the loop. */
1948 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1949 else
1950 ok = vect_verify_datarefs_alignment (loop_vinfo);
1951 if (!ok)
1952 return ok;
1953
1954 if (slp)
1955 {
1956 /* Analyze operations in the SLP instances. Note this may
1957 remove unsupported SLP instances which makes the above
1958 SLP kind detection invalid. */
1959 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1960 vect_slp_analyze_operations (loop_vinfo);
1961 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1962 {
1963 ok = opt_result::failure_at (vect_location,
1964 "unsupported SLP instances\n");
1965 goto again;
1966 }
1967 }
1968
1969 /* Scan all the remaining operations in the loop that are not subject
1970 to SLP and make sure they are vectorizable. */
1971 ok = vect_analyze_loop_operations (loop_vinfo);
1972 if (!ok)
1973 {
1974 if (dump_enabled_p ())
1975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1976 "bad operation or unsupported loop bound.\n");
1977 return ok;
1978 }
1979
1980 /* Decide whether to use a fully-masked loop for this vectorization
1981 factor. */
1982 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1983 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1984 && vect_verify_full_masking (loop_vinfo));
1985 if (dump_enabled_p ())
1986 {
1987 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988 dump_printf_loc (MSG_NOTE, vect_location,
1989 "using a fully-masked loop.\n");
1990 else
1991 dump_printf_loc (MSG_NOTE, vect_location,
1992 "not using a fully-masked loop.\n");
1993 }
1994
1995 /* If epilog loop is required because of data accesses with gaps,
1996 one additional iteration needs to be peeled. Check if there are
1997 enough iterations for vectorization. */
1998 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1999 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2000 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2001 {
2002 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2003 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2004
2005 if (known_lt (wi::to_widest (scalar_niters), vf))
2006 return opt_result::failure_at (vect_location,
2007 "loop has no enough iterations to"
2008 " support peeling for gaps.\n");
2009 }
2010
2011 /* Check the costings of the loop make vectorizing worthwhile. */
2012 res = vect_analyze_loop_costing (loop_vinfo);
2013 if (res < 0)
2014 {
2015 ok = opt_result::failure_at (vect_location,
2016 "Loop costings may not be worthwhile.\n");
2017 goto again;
2018 }
2019 if (!res)
2020 return opt_result::failure_at (vect_location,
2021 "Loop costings not worthwhile.\n");
2022
2023 /* Decide whether we need to create an epilogue loop to handle
2024 remaining scalar iterations. */
2025 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2026
2027 unsigned HOST_WIDE_INT const_vf;
2028 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2029 /* The main loop handles all iterations. */
2030 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2031 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2033 {
2034 /* Work out the (constant) number of iterations that need to be
2035 peeled for reasons other than niters. */
2036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2038 peel_niter += 1;
2039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2041 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2042 }
2043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2044 /* ??? When peeling for gaps but not alignment, we could
2045 try to check whether the (variable) niters is known to be
2046 VF * N + 1. That's something of a niche case though. */
2047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2050 < (unsigned) exact_log2 (const_vf))
2051 /* In case of versioning, check if the maximum number of
2052 iterations is greater than th. If they are identical,
2053 the epilogue is unnecessary. */
2054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2055 || ((unsigned HOST_WIDE_INT) max_niter
2056 > (th / const_vf) * const_vf))))
2057 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2058
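/* Illustrative example (assumed numbers): with a constant VF of 4, known
   NITERS of 10, no full masking and no peeling for alignment or gaps,
   peel_niter is 0 and 10 is not a multiple of 4, so PEELING_FOR_NITER is set
   and an epilogue loop will handle the remaining 2 scalar iterations.  */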
2059 /* If an epilogue loop is required make sure we can create one. */
2060 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2061 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2062 {
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2065 if (!vect_can_advance_ivs_p (loop_vinfo)
2066 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2067 single_exit (LOOP_VINFO_LOOP
2068 (loop_vinfo))))
2069 {
2070 ok = opt_result::failure_at (vect_location,
2071 "not vectorized: can't create required "
2072 "epilog loop\n");
2073 goto again;
2074 }
2075 }
2076
2077 /* During peeling, we need to check if number of loop iterations is
2078 enough for both peeled prolog loop and vector loop. This check
2079 can be merged along with threshold check of loop versioning, so
2080 increase threshold for this case if necessary. */
2081 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2082 {
2083 poly_uint64 niters_th = 0;
2084
2085 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2086 {
2087 /* Niters for peeled prolog loop. */
2088 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2089 {
2090 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2091 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2092 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2093 }
2094 else
2095 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2096 }
2097
2098 /* Niters for at least one iteration of vectorized loop. */
2099 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2100 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2101 /* One additional iteration because of peeling for gap. */
2102 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2103 niters_th += 1;
2104 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
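/* Illustrative example (assumed numbers): with VF = 4, 4-element vectors,
   no full masking, an unknown prologue peel amount (counted as
   TYPE_VECTOR_SUBPARTS - 1 = 3) and peeling for gaps, the versioning
   threshold is 3 + 4 + 1 = 8 iterations.  */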
2105 }
2106
2107 gcc_assert (known_eq (vectorization_factor,
2108 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2109
2110 /* Ok to vectorize! */
2111 return opt_result::success ();
2112
2113 again:
2114 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2115 gcc_assert (!ok);
2116
2117 /* Try again with SLP forced off but if we didn't do any SLP there is
2118 no point in re-trying. */
2119 if (!slp)
2120 return ok;
2121
2122 /* If there are reduction chains re-trying will fail anyway. */
2123 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2124 return ok;
2125
2126 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2127 via interleaving or lane instructions. */
2128 slp_instance instance;
2129 slp_tree node;
2130 unsigned i, j;
2131 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2132 {
2133 stmt_vec_info vinfo;
2134 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2135 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2136 continue;
2137 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2138 unsigned int size = DR_GROUP_SIZE (vinfo);
2139 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2140 if (! vect_store_lanes_supported (vectype, size, false)
2141 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2142 && ! vect_grouped_store_supported (vectype, size))
2143 return opt_result::failure_at (vinfo->stmt,
2144 "unsupported grouped store\n");
2145 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2146 {
2147 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2148 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2149 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2150 size = DR_GROUP_SIZE (vinfo);
2151 vectype = STMT_VINFO_VECTYPE (vinfo);
2152 if (! vect_load_lanes_supported (vectype, size, false)
2153 && ! vect_grouped_load_supported (vectype, single_element_p,
2154 size))
2155 return opt_result::failure_at (vinfo->stmt,
2156 "unsupported grouped load\n");
2157 }
2158 }
2159
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_NOTE, vect_location,
2162 "re-trying with SLP disabled\n");
2163
2164 /* Roll back state appropriately. No SLP this time. */
2165 slp = false;
2166 /* Restore the vectorization factor as it was without SLP. */
2167 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2168 /* Free the SLP instances. */
2169 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2170 vect_free_slp_instance (instance, false);
2171 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2172 /* Reset SLP type to loop_vect on all stmts. */
2173 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2174 {
2175 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2176 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2177 !gsi_end_p (si); gsi_next (&si))
2178 {
2179 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2180 STMT_SLP_TYPE (stmt_info) = loop_vect;
2181 }
2182 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2183 !gsi_end_p (si); gsi_next (&si))
2184 {
2185 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2186 STMT_SLP_TYPE (stmt_info) = loop_vect;
2187 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2188 {
2189 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2190 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2191 STMT_SLP_TYPE (stmt_info) = loop_vect;
2192 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2193 !gsi_end_p (pi); gsi_next (&pi))
2194 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2195 = loop_vect;
2196 }
2197 }
2198 }
2199 /* Free optimized alias test DDRS. */
2200 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2201 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2202 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2203 /* Reset target cost data. */
2204 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2205 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2206 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2207 /* Reset accumulated rgroup information. */
2208 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2209 /* Reset assorted flags. */
2210 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2211 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2212 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2213 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2214 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2215
2216 goto start_over;
2217 }
2218
2219 /* Function vect_analyze_loop.
2220
2221 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2222 for it. The different analyses will record information in the
2223 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2224 epilogue of an already vectorized loop and must itself be vectorized. */
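/* As an illustration (target-dependent, not guaranteed): on x86_64 with
   AVX-512 enabled, targetm.vectorize.autovectorize_vector_sizes typically
   offers 64-, 32- and 16-byte candidates; if analysis with the autodetected
   size fails non-fatally, the loop below retries with the next candidate.  */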
2225 opt_loop_vec_info
2226 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2227 vec_info_shared *shared)
2228 {
2229 auto_vector_sizes vector_sizes;
2230
2231 /* Autodetect first vector size we try. */
2232 current_vector_size = 0;
2233 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2234 unsigned int next_size = 0;
2235
2236 DUMP_VECT_SCOPE ("analyze_loop_nest");
2237
2238 if (loop_outer (loop)
2239 && loop_vec_info_for_loop (loop_outer (loop))
2240 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2241 return opt_loop_vec_info::failure_at (vect_location,
2242 "outer-loop already vectorized.\n");
2243
2244 if (!find_loop_nest (loop, &shared->loop_nest))
2245 return opt_loop_vec_info::failure_at
2246 (vect_location,
2247 "not vectorized: loop nest containing two or more consecutive inner"
2248 " loops cannot be vectorized\n");
2249
2250 unsigned n_stmts = 0;
2251 poly_uint64 autodetected_vector_size = 0;
2252 while (1)
2253 {
2254 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2255 opt_loop_vec_info loop_vinfo
2256 = vect_analyze_loop_form (loop, shared);
2257 if (!loop_vinfo)
2258 {
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261 "bad loop form.\n");
2262 return loop_vinfo;
2263 }
2264
2265 bool fatal = false;
2266
2267 if (orig_loop_vinfo)
2268 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2269
2270 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2271 if (res)
2272 {
2273 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2274
2275 return loop_vinfo;
2276 }
2277
2278 delete loop_vinfo;
2279
2280 if (next_size == 0)
2281 autodetected_vector_size = current_vector_size;
2282
2283 if (next_size < vector_sizes.length ()
2284 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2285 next_size += 1;
2286
2287 if (fatal
2288 || next_size == vector_sizes.length ()
2289 || known_eq (current_vector_size, 0U))
2290 return opt_loop_vec_info::propagate_failure (res);
2291
2292 /* Try the next biggest vector size. */
2293 current_vector_size = vector_sizes[next_size++];
2294 if (dump_enabled_p ())
2295 {
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "***** Re-trying analysis with "
2298 "vector size ");
2299 dump_dec (MSG_NOTE, current_vector_size);
2300 dump_printf (MSG_NOTE, "\n");
2301 }
2302 }
2303 }
2304
2305 /* Return true if there is an in-order reduction function for CODE, storing
2306 it in *REDUC_FN if so. */
2307
2308 static bool
2309 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2310 {
2311 switch (code)
2312 {
2313 case PLUS_EXPR:
2314 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2315 return true;
2316
2317 default:
2318 return false;
2319 }
2320 }
2321
2322 /* Function reduction_fn_for_scalar_code
2323
2324 Input:
2325 CODE - tree_code of the reduction operation.
2326
2327 Output:
2328 REDUC_FN - the corresponding internal function to be used to reduce the
2329 vector of partial results into a single scalar result, or IFN_LAST
2330 if the operation is a supported reduction operation, but does not have
2331 such an internal function.
2332
2333 Return FALSE if CODE currently cannot be vectorized as reduction. */
2334
2335 static bool
2336 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2337 {
2338 switch (code)
2339 {
2340 case MAX_EXPR:
2341 *reduc_fn = IFN_REDUC_MAX;
2342 return true;
2343
2344 case MIN_EXPR:
2345 *reduc_fn = IFN_REDUC_MIN;
2346 return true;
2347
2348 case PLUS_EXPR:
2349 *reduc_fn = IFN_REDUC_PLUS;
2350 return true;
2351
2352 case BIT_AND_EXPR:
2353 *reduc_fn = IFN_REDUC_AND;
2354 return true;
2355
2356 case BIT_IOR_EXPR:
2357 *reduc_fn = IFN_REDUC_IOR;
2358 return true;
2359
2360 case BIT_XOR_EXPR:
2361 *reduc_fn = IFN_REDUC_XOR;
2362 return true;
2363
2364 case MULT_EXPR:
2365 case MINUS_EXPR:
2366 *reduc_fn = IFN_LAST;
2367 return true;
2368
2369 default:
2370 return false;
2371 }
2372 }
2373
2374 /* If there is a neutral value X such that SLP reduction NODE would not
2375 be affected by the introduction of additional X elements, return that X,
2376 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2377 is true if the SLP statements perform a single reduction, false if each
2378 statement performs an independent reduction. */
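/* Illustrative examples: for a PLUS_EXPR reduction over integers the neutral
   value is 0 (padding the vectors with zeros leaves the sum unchanged), for
   MULT_EXPR it is 1 and for BIT_AND_EXPR it is all-ones; MIN_EXPR/MAX_EXPR
   have no universal neutral value, but a reduction chain can reuse its single
   initial value.  */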
2379
2380 static tree
2381 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2382 bool reduc_chain)
2383 {
2384 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2385 stmt_vec_info stmt_vinfo = stmts[0];
2386 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2387 tree scalar_type = TREE_TYPE (vector_type);
2388 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2389 gcc_assert (loop);
2390
2391 switch (code)
2392 {
2393 case WIDEN_SUM_EXPR:
2394 case DOT_PROD_EXPR:
2395 case SAD_EXPR:
2396 case PLUS_EXPR:
2397 case MINUS_EXPR:
2398 case BIT_IOR_EXPR:
2399 case BIT_XOR_EXPR:
2400 return build_zero_cst (scalar_type);
2401
2402 case MULT_EXPR:
2403 return build_one_cst (scalar_type);
2404
2405 case BIT_AND_EXPR:
2406 return build_all_ones_cst (scalar_type);
2407
2408 case MAX_EXPR:
2409 case MIN_EXPR:
2410 /* For MIN/MAX the initial values are neutral. A reduction chain
2411 has only a single initial value, so that value is neutral for
2412 all statements. */
2413 if (reduc_chain)
2414 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2415 loop_preheader_edge (loop));
2416 return NULL_TREE;
2417
2418 default:
2419 return NULL_TREE;
2420 }
2421 }
2422
2423 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2424 STMT is printed with a message MSG. */
2425
2426 static void
2427 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2428 {
2429 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2430 }
2431
2432 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2433 operation. Return true if the results of DEF_STMT_INFO are something
2434 that can be accumulated by such a reduction. */
2435
2436 static bool
2437 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2438 {
2439 return (is_gimple_assign (def_stmt_info->stmt)
2440 || is_gimple_call (def_stmt_info->stmt)
2441 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2442 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2443 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2444 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2445 }
2446
2447 /* Detect SLP reduction of the form:
2448
2449 #a1 = phi <a5, a0>
2450 a2 = operation (a1)
2451 a3 = operation (a2)
2452 a4 = operation (a3)
2453 a5 = operation (a4)
2454
2455 #a = phi <a5>
2456
2457 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2458 FIRST_STMT is the first reduction stmt in the chain
2459 (a2 = operation (a1)).
2460
2461 Return TRUE if a reduction chain was detected. */
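/* For example, such a chain might come from manually unrolled source like
   the following (illustrative only):

     int a[N]; int s = 0;
     for (int i = 0; i < N; i += 4)
       {
         s = s + a[i];
         s = s + a[i + 1];
         s = s + a[i + 2];
         s = s + a[i + 3];
       }

   where each statement feeds the next through s, matching the
   a1 -> a2 -> a3 -> a4 -> a5 cycle above.  */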
2462
2463 static bool
2464 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2465 gimple *first_stmt)
2466 {
2467 struct loop *loop = (gimple_bb (phi))->loop_father;
2468 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2469 enum tree_code code;
2470 gimple *loop_use_stmt = NULL;
2471 stmt_vec_info use_stmt_info;
2472 tree lhs;
2473 imm_use_iterator imm_iter;
2474 use_operand_p use_p;
2475 int nloop_uses, size = 0, n_out_of_loop_uses;
2476 bool found = false;
2477
2478 if (loop != vect_loop)
2479 return false;
2480
2481 auto_vec<stmt_vec_info, 8> reduc_chain;
2482 lhs = PHI_RESULT (phi);
2483 code = gimple_assign_rhs_code (first_stmt);
2484 while (1)
2485 {
2486 nloop_uses = 0;
2487 n_out_of_loop_uses = 0;
2488 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2489 {
2490 gimple *use_stmt = USE_STMT (use_p);
2491 if (is_gimple_debug (use_stmt))
2492 continue;
2493
2494 /* Check if we got back to the reduction phi. */
2495 if (use_stmt == phi)
2496 {
2497 loop_use_stmt = use_stmt;
2498 found = true;
2499 break;
2500 }
2501
2502 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2503 {
2504 loop_use_stmt = use_stmt;
2505 nloop_uses++;
2506 }
2507 else
2508 n_out_of_loop_uses++;
2509
2510 /* There can be either a single use in the loop or two uses in
2511 phi nodes. */
2512 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2513 return false;
2514 }
2515
2516 if (found)
2517 break;
2518
2519 /* We reached a statement with no loop uses. */
2520 if (nloop_uses == 0)
2521 return false;
2522
2523 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2524 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2525 return false;
2526
2527 if (!is_gimple_assign (loop_use_stmt)
2528 || code != gimple_assign_rhs_code (loop_use_stmt)
2529 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2530 return false;
2531
2532 /* Insert USE_STMT into reduction chain. */
2533 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2534 reduc_chain.safe_push (use_stmt_info);
2535
2536 lhs = gimple_assign_lhs (loop_use_stmt);
2537 size++;
2538 }
2539
2540 if (!found || loop_use_stmt != phi || size < 2)
2541 return false;
2542
2543 /* Swap the operands, if needed, to make the reduction operand be the second
2544 operand. */
2545 lhs = PHI_RESULT (phi);
2546 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2547 {
2548 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2549 if (gimple_assign_rhs2 (next_stmt) == lhs)
2550 {
2551 tree op = gimple_assign_rhs1 (next_stmt);
2552 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2553
2554 /* Check that the other def is either defined in the loop
2555 ("vect_internal_def"), or it's an induction (defined by a
2556 loop-header phi-node). */
2557 if (def_stmt_info
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2559 && vect_valid_reduction_input_p (def_stmt_info))
2560 {
2561 lhs = gimple_assign_lhs (next_stmt);
2562 continue;
2563 }
2564
2565 return false;
2566 }
2567 else
2568 {
2569 tree op = gimple_assign_rhs2 (next_stmt);
2570 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2571
2572 /* Check that the other def is either defined in the loop
2573 ("vect_internal_def"), or it's an induction (defined by a
2574 loop-header phi-node). */
2575 if (def_stmt_info
2576 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2577 && vect_valid_reduction_input_p (def_stmt_info))
2578 {
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2581 next_stmt);
2582
2583 swap_ssa_operands (next_stmt,
2584 gimple_assign_rhs1_ptr (next_stmt),
2585 gimple_assign_rhs2_ptr (next_stmt));
2586 update_stmt (next_stmt);
2587
2588 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2589 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2590 }
2591 else
2592 return false;
2593 }
2594
2595 lhs = gimple_assign_lhs (next_stmt);
2596 }
2597
2598 /* Build up the actual chain. */
2599 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2600 {
2601 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2602 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2603 }
2604 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2605 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2606
2607 /* Save the chain for further analysis in SLP detection. */
2608 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2609 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2610
2611 return true;
2612 }
2613
2614 /* Return true if we need an in-order reduction for operation CODE
2615 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2616 overflow must wrap. */
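/* For example (illustrative): a floating-point accumulation

     double s = 0.0;
     for (int i = 0; i < N; i++)
       s += a[i];

   compiled without -fassociative-math must preserve the original
   left-to-right order of the additions, so it needs a fold-left (in-order)
   reduction rather than a tree reduction of partial sums.  */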
2617
2618 static bool
2619 needs_fold_left_reduction_p (tree type, tree_code code,
2620 bool need_wrapping_integral_overflow)
2621 {
2622 /* CHECKME: check for !flag_finite_math_only too? */
2623 if (SCALAR_FLOAT_TYPE_P (type))
2624 switch (code)
2625 {
2626 case MIN_EXPR:
2627 case MAX_EXPR:
2628 return false;
2629
2630 default:
2631 return !flag_associative_math;
2632 }
2633
2634 if (INTEGRAL_TYPE_P (type))
2635 {
2636 if (!operation_no_trapping_overflow (type, code))
2637 return true;
2638 if (need_wrapping_integral_overflow
2639 && !TYPE_OVERFLOW_WRAPS (type)
2640 && operation_can_overflow (code))
2641 return true;
2642 return false;
2643 }
2644
2645 if (SAT_FIXED_POINT_TYPE_P (type))
2646 return true;
2647
2648 return false;
2649 }
2650
2651 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2652 reduction operation CODE has a handled computation expression. */
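/* As an illustration (not taken from the sources): for
   "res = (res + a[i]) + b[i]" the latch value is computed through an
   intermediate single-use temporary, giving a path of two PLUS_EXPR
   statements from the PHI result back to the latch argument, which is
   accepted.  Paths whose statements use a different code (apart from a
   PLUS_EXPR reduction stepping through MINUS_EXPR, which is tracked as a
   negation) or whose intermediate values have more than one use are
   rejected.  */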
2653
2654 bool
2655 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2656 tree loop_arg, enum tree_code code)
2657 {
2658 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2659 auto_bitmap visited;
2660 tree lookfor = PHI_RESULT (phi);
2661 ssa_op_iter curri;
2662 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2663 while (USE_FROM_PTR (curr) != loop_arg)
2664 curr = op_iter_next_use (&curri);
2665 curri.i = curri.numops;
2666 do
2667 {
2668 path.safe_push (std::make_pair (curri, curr));
2669 tree use = USE_FROM_PTR (curr);
2670 if (use == lookfor)
2671 break;
2672 gimple *def = SSA_NAME_DEF_STMT (use);
2673 if (gimple_nop_p (def)
2674 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2675 {
2676 pop:
2677 do
2678 {
2679 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2680 curri = x.first;
2681 curr = x.second;
2682 do
2683 curr = op_iter_next_use (&curri);
2684 /* Skip already visited or non-SSA operands (from iterating
2685 over PHI args). */
2686 while (curr != NULL_USE_OPERAND_P
2687 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2688 || ! bitmap_set_bit (visited,
2689 SSA_NAME_VERSION
2690 (USE_FROM_PTR (curr)))));
2691 }
2692 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2693 if (curr == NULL_USE_OPERAND_P)
2694 break;
2695 }
2696 else
2697 {
2698 if (gimple_code (def) == GIMPLE_PHI)
2699 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2700 else
2701 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2702 while (curr != NULL_USE_OPERAND_P
2703 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2704 || ! bitmap_set_bit (visited,
2705 SSA_NAME_VERSION
2706 (USE_FROM_PTR (curr)))))
2707 curr = op_iter_next_use (&curri);
2708 if (curr == NULL_USE_OPERAND_P)
2709 goto pop;
2710 }
2711 }
2712 while (1);
2713 if (dump_file && (dump_flags & TDF_DETAILS))
2714 {
2715 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2716 unsigned i;
2717 std::pair<ssa_op_iter, use_operand_p> *x;
2718 FOR_EACH_VEC_ELT (path, i, x)
2719 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2720 dump_printf (MSG_NOTE, "\n");
2721 }
2722
2723 /* Check whether the reduction path detected is valid. */
2724 bool fail = path.length () == 0;
2725 bool neg = false;
2726 for (unsigned i = 1; i < path.length (); ++i)
2727 {
2728 gimple *use_stmt = USE_STMT (path[i].second);
2729 tree op = USE_FROM_PTR (path[i].second);
2730 if (! has_single_use (op)
2731 || ! is_gimple_assign (use_stmt))
2732 {
2733 fail = true;
2734 break;
2735 }
2736 if (gimple_assign_rhs_code (use_stmt) != code)
2737 {
2738 if (code == PLUS_EXPR
2739 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2740 {
2741 /* Track whether we negate the reduction value each iteration. */
2742 if (gimple_assign_rhs2 (use_stmt) == op)
2743 neg = ! neg;
2744 }
2745 else
2746 {
2747 fail = true;
2748 break;
2749 }
2750 }
2751 }
2752 return ! fail && ! neg;
2753 }
2754
2755
2756 /* Function vect_is_simple_reduction
2757
2758 (1) Detect a cross-iteration def-use cycle that represents a simple
2759 reduction computation. We look for the following pattern:
2760
2761 loop_header:
2762 a1 = phi < a0, a2 >
2763 a3 = ...
2764 a2 = operation (a3, a1)
2765
2766 or
2767
2768 a3 = ...
2769 loop_header:
2770 a1 = phi < a0, a2 >
2771 a2 = operation (a3, a1)
2772
2773 such that:
2774 1. operation is commutative and associative and it is safe to
2775 change the order of the computation
2776 2. no uses for a2 in the loop (a2 is used out of the loop)
2777 3. no uses of a1 in the loop besides the reduction operation
2778 4. no uses of a1 outside the loop.
2779
2780 Conditions 1,4 are tested here.
2781 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2782
2783 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2784 nested cycles.
2785
2786 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2787 reductions:
2788
2789 a1 = phi < a0, a2 >
2790 inner loop (def of a3)
2791 a2 = phi < a3 >
2792
2793 (4) Detect condition expressions, i.e.:
2794 for (int i = 0; i < N; i++)
2795 if (a[i] < val)
2796 ret_val = a[i];
2797
2798 */
2799
2800 static stmt_vec_info
2801 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2802 bool *double_reduc,
2803 bool need_wrapping_integral_overflow,
2804 enum vect_reduction_type *v_reduc_type)
2805 {
2806 gphi *phi = as_a <gphi *> (phi_info->stmt);
2807 struct loop *loop = (gimple_bb (phi))->loop_father;
2808 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2809 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2810 gimple *phi_use_stmt = NULL;
2811 enum tree_code orig_code, code;
2812 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2813 tree type;
2814 tree name;
2815 imm_use_iterator imm_iter;
2816 use_operand_p use_p;
2817 bool phi_def;
2818
2819 *double_reduc = false;
2820 *v_reduc_type = TREE_CODE_REDUCTION;
2821
2822 tree phi_name = PHI_RESULT (phi);
2823 /* ??? If there are no uses of the PHI result the inner loop reduction
2824 won't be detected as possibly double-reduction by vectorizable_reduction
2825 because that tries to walk the PHI arg from the preheader edge which
2826 can be constant. See PR60382. */
2827 if (has_zero_uses (phi_name))
2828 return NULL;
2829 unsigned nphi_def_loop_uses = 0;
2830 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2831 {
2832 gimple *use_stmt = USE_STMT (use_p);
2833 if (is_gimple_debug (use_stmt))
2834 continue;
2835
2836 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2837 {
2838 if (dump_enabled_p ())
2839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2840 "intermediate value used outside loop.\n");
2841
2842 return NULL;
2843 }
2844
2845 nphi_def_loop_uses++;
2846 phi_use_stmt = use_stmt;
2847 }
2848
2849 edge latch_e = loop_latch_edge (loop);
2850 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2851 if (TREE_CODE (loop_arg) != SSA_NAME)
2852 {
2853 if (dump_enabled_p ())
2854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2855 "reduction: not ssa_name: %T\n", loop_arg);
2856 return NULL;
2857 }
2858
2859 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2860 if (!def_stmt_info
2861 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2862 return NULL;
2863
2864 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2865 {
2866 name = gimple_assign_lhs (def_stmt);
2867 phi_def = false;
2868 }
2869 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2870 {
2871 name = PHI_RESULT (def_stmt);
2872 phi_def = true;
2873 }
2874 else
2875 {
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "reduction: unhandled reduction operation: %G",
2879 def_stmt_info->stmt);
2880 return NULL;
2881 }
2882
2883 unsigned nlatch_def_loop_uses = 0;
2884 auto_vec<gphi *, 3> lcphis;
2885 bool inner_loop_of_double_reduc = false;
2886 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2887 {
2888 gimple *use_stmt = USE_STMT (use_p);
2889 if (is_gimple_debug (use_stmt))
2890 continue;
2891 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2892 nlatch_def_loop_uses++;
2893 else
2894 {
2895 /* We can have more than one loop-closed PHI. */
2896 lcphis.safe_push (as_a <gphi *> (use_stmt));
2897 if (nested_in_vect_loop
2898 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2899 == vect_double_reduction_def))
2900 inner_loop_of_double_reduc = true;
2901 }
2902 }
2903
2904 /* If this isn't a nested cycle or if the nested cycle reduction value
2905 is used outside of the inner loop, we cannot handle uses of the reduction
2906 value. */
2907 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2908 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2909 {
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "reduction used in loop.\n");
2913 return NULL;
2914 }
2915
2916 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2917 defined in the inner loop. */
2918 if (phi_def)
2919 {
2920 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2921 op1 = PHI_ARG_DEF (def_stmt, 0);
2922
2923 if (gimple_phi_num_args (def_stmt) != 1
2924 || TREE_CODE (op1) != SSA_NAME)
2925 {
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2928 "unsupported phi node definition.\n");
2929
2930 return NULL;
2931 }
2932
2933 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2934 if (gimple_bb (def1)
2935 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2936 && loop->inner
2937 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2938 && is_gimple_assign (def1)
2939 && is_a <gphi *> (phi_use_stmt)
2940 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2941 {
2942 if (dump_enabled_p ())
2943 report_vect_op (MSG_NOTE, def_stmt,
2944 "detected double reduction: ");
2945
2946 *double_reduc = true;
2947 return def_stmt_info;
2948 }
2949
2950 return NULL;
2951 }
2952
2953 /* If we are vectorizing an inner reduction, it is executed in the
2954 original order only when we are not dealing with a
2955 double reduction. */
2956 bool check_reduction = true;
2957 if (flow_loop_nested_p (vect_loop, loop))
2958 {
2959 gphi *lcphi;
2960 unsigned i;
2961 check_reduction = false;
2962 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2963 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2964 {
2965 gimple *use_stmt = USE_STMT (use_p);
2966 if (is_gimple_debug (use_stmt))
2967 continue;
2968 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2969 check_reduction = true;
2970 }
2971 }
2972
2973 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2974 code = orig_code = gimple_assign_rhs_code (def_stmt);
2975
2976 if (nested_in_vect_loop && !check_reduction)
2977 {
2978 /* FIXME: Even for non-reductions code generation is funneled
2979 through vectorizable_reduction for the stmt defining the
2980 PHI latch value. So we have to artificially restrict ourselves
2981 for the supported operations. */
2982 switch (get_gimple_rhs_class (code))
2983 {
2984 case GIMPLE_BINARY_RHS:
2985 case GIMPLE_TERNARY_RHS:
2986 break;
2987 default:
2988 /* Not supported by vectorizable_reduction. */
2989 if (dump_enabled_p ())
2990 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2991 "nested cycle: not handled operation: ");
2992 return NULL;
2993 }
2994 if (dump_enabled_p ())
2995 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2996 return def_stmt_info;
2997 }
2998
2999 /* We can handle "res -= x[i]", which is non-associative by
3000 simply rewriting this into "res += -x[i]". Avoid changing
3001 gimple instruction for the first simple tests and only do this
3002 if we're allowed to change code at all. */
3003 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3004 code = PLUS_EXPR;
3005
3006 if (code == COND_EXPR)
3007 {
3008 if (! nested_in_vect_loop)
3009 *v_reduc_type = COND_REDUCTION;
3010
3011 op3 = gimple_assign_rhs1 (def_stmt);
3012 if (COMPARISON_CLASS_P (op3))
3013 {
3014 op4 = TREE_OPERAND (op3, 1);
3015 op3 = TREE_OPERAND (op3, 0);
3016 }
3017 if (op3 == phi_name || op4 == phi_name)
3018 {
3019 if (dump_enabled_p ())
3020 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3021 "reduction: condition depends on previous"
3022 " iteration: ");
3023 return NULL;
3024 }
3025
3026 op1 = gimple_assign_rhs2 (def_stmt);
3027 op2 = gimple_assign_rhs3 (def_stmt);
3028 }
3029 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3030 {
3031 if (dump_enabled_p ())
3032 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3033 "reduction: not commutative/associative: ");
3034 return NULL;
3035 }
3036 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3037 {
3038 op1 = gimple_assign_rhs1 (def_stmt);
3039 op2 = gimple_assign_rhs2 (def_stmt);
3040 }
3041 else
3042 {
3043 if (dump_enabled_p ())
3044 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3045 "reduction: not handled operation: ");
3046 return NULL;
3047 }
3048
3049 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3050 {
3051 if (dump_enabled_p ())
3052 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3053 "reduction: both uses not ssa_names: ");
3054
3055 return NULL;
3056 }
3057
3058 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3059 if ((TREE_CODE (op1) == SSA_NAME
3060 && !types_compatible_p (type,TREE_TYPE (op1)))
3061 || (TREE_CODE (op2) == SSA_NAME
3062 && !types_compatible_p (type, TREE_TYPE (op2)))
3063 || (op3 && TREE_CODE (op3) == SSA_NAME
3064 && !types_compatible_p (type, TREE_TYPE (op3)))
3065 || (op4 && TREE_CODE (op4) == SSA_NAME
3066 && !types_compatible_p (type, TREE_TYPE (op4))))
3067 {
3068 if (dump_enabled_p ())
3069 {
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 "reduction: multiple types: operation type: "
3072 "%T, operands types: %T,%T",
3073 type, TREE_TYPE (op1), TREE_TYPE (op2));
3074 if (op3)
3075 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3076
3077 if (op4)
3078 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3079 dump_printf (MSG_NOTE, "\n");
3080 }
3081
3082 return NULL;
3083 }
3084
3085 /* Check whether it's ok to change the order of the computation.
3086 Generally, when vectorizing a reduction we change the order of the
3087 computation. This may change the behavior of the program in some
3088 cases, so we need to check that this is ok. One exception is when
3089 vectorizing an outer-loop: the inner-loop is executed sequentially,
3090 and therefore vectorizing reductions in the inner-loop during
3091 outer-loop vectorization is safe. */
3092 if (check_reduction
3093 && *v_reduc_type == TREE_CODE_REDUCTION
3094 && needs_fold_left_reduction_p (type, code,
3095 need_wrapping_integral_overflow))
3096 *v_reduc_type = FOLD_LEFT_REDUCTION;
3097
3098 /* Reduction is safe. We're dealing with one of the following:
3099 1) integer arithmetic and no trapv
3100 2) floating point arithmetic, and special flags permit this optimization
3101 3) nested cycle (i.e., outer loop vectorization). */
3102 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3103 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3104 if (code != COND_EXPR && !def1_info && !def2_info)
3105 {
3106 if (dump_enabled_p ())
3107 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3108 return NULL;
3109 }
3110
3111 /* Check that one def is the reduction def, defined by PHI,
3112 the other def is either defined in the loop ("vect_internal_def"),
3113 or it's an induction (defined by a loop-header phi-node). */
3114
3115 if (def2_info
3116 && def2_info->stmt == phi
3117 && (code == COND_EXPR
3118 || !def1_info
3119 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3120 || vect_valid_reduction_input_p (def1_info)))
3121 {
3122 if (dump_enabled_p ())
3123 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3124 return def_stmt_info;
3125 }
3126
3127 if (def1_info
3128 && def1_info->stmt == phi
3129 && (code == COND_EXPR
3130 || !def2_info
3131 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3132 || vect_valid_reduction_input_p (def2_info)))
3133 {
3134 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3135 {
3136 /* Check if we can swap operands (just for simplicity - so that
3137 the rest of the code can assume that the reduction variable
3138 is always the last (second) argument). */
3139 if (code == COND_EXPR)
3140 {
3141 /* Swap cond_expr by inverting the condition. */
3142 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3143 enum tree_code invert_code = ERROR_MARK;
3144 enum tree_code cond_code = TREE_CODE (cond_expr);
3145
3146 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3147 {
3148 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3149 invert_code = invert_tree_comparison (cond_code, honor_nans);
3150 }
3151 if (invert_code != ERROR_MARK)
3152 {
3153 TREE_SET_CODE (cond_expr, invert_code);
3154 swap_ssa_operands (def_stmt,
3155 gimple_assign_rhs2_ptr (def_stmt),
3156 gimple_assign_rhs3_ptr (def_stmt));
3157 }
3158 else
3159 {
3160 if (dump_enabled_p ())
3161 report_vect_op (MSG_NOTE, def_stmt,
3162 "detected reduction: cannot swap operands "
3163 "for cond_expr");
3164 return NULL;
3165 }
3166 }
3167 else
3168 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3169 gimple_assign_rhs2_ptr (def_stmt));
3170
3171 if (dump_enabled_p ())
3172 report_vect_op (MSG_NOTE, def_stmt,
3173 "detected reduction: need to swap operands: ");
3174
3175 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3176 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3177 }
3178 else
3179 {
3180 if (dump_enabled_p ())
3181 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3182 }
3183
3184 return def_stmt_info;
3185 }
3186
3187 /* Try to find SLP reduction chain. */
3188 if (! nested_in_vect_loop
3189 && code != COND_EXPR
3190 && orig_code != MINUS_EXPR
3191 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3192 {
3193 if (dump_enabled_p ())
3194 report_vect_op (MSG_NOTE, def_stmt,
3195 "reduction: detected reduction chain: ");
3196
3197 return def_stmt_info;
3198 }
3199
3200 /* Look for the expression computing loop_arg from loop PHI result. */
3201 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3202 return def_stmt_info;
3203
3204 if (dump_enabled_p ())
3205 {
3206 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3207 "reduction: unknown pattern: ");
3208 }
3209
3210 return NULL;
3211 }
3212
3213 /* Wrapper around vect_is_simple_reduction, which will modify code
3214 in-place if it enables detection of more reductions. Arguments
3215 as there. */
3216
3217 stmt_vec_info
3218 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3219 bool *double_reduc,
3220 bool need_wrapping_integral_overflow)
3221 {
3222 enum vect_reduction_type v_reduc_type;
3223 stmt_vec_info def_info
3224 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3225 need_wrapping_integral_overflow,
3226 &v_reduc_type);
3227 if (def_info)
3228 {
3229 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3230 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3231 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3232 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3233 }
3234 return def_info;
3235 }
3236
3237 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
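/* On return *PEEL_ITERS_EPILOGUE holds the number of epilogue iterations that
   were costed, and the prologue/epilogue statement costs derived from
   SCALAR_COST_VEC are appended to PROLOGUE_COST_VEC and EPILOGUE_COST_VEC.
   Illustrative example (assumed numbers): with known NITERS of 23, an assumed
   VF of 4 and PEEL_ITERS_PROLOGUE of 3, the epilogue gets (23 - 3) % 4 = 0
   iterations, bumped to 4 if peeling for gaps is required.  */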
3238 int
3239 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3240 int *peel_iters_epilogue,
3241 stmt_vector_for_cost *scalar_cost_vec,
3242 stmt_vector_for_cost *prologue_cost_vec,
3243 stmt_vector_for_cost *epilogue_cost_vec)
3244 {
3245 int retval = 0;
3246 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3247
3248 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3249 {
3250 *peel_iters_epilogue = assumed_vf / 2;
3251 if (dump_enabled_p ())
3252 dump_printf_loc (MSG_NOTE, vect_location,
3253 "cost model: epilogue peel iters set to vf/2 "
3254 "because loop iterations are unknown .\n");
3255
3256 /* If peeled iterations are known but the number of scalar loop
3257 iterations is unknown, count a taken branch per peeled loop. */
3258 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3259 NULL, 0, vect_prologue);
3260 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3261 NULL, 0, vect_epilogue);
3262 }
3263 else
3264 {
3265 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3266 peel_iters_prologue = niters < peel_iters_prologue ?
3267 niters : peel_iters_prologue;
3268 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3269 /* If we need to peel for gaps but no epilogue peeling would otherwise
3270 be required, we have to peel VF iterations. */
3271 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3272 *peel_iters_epilogue = assumed_vf;
3273 }
3274
3275 stmt_info_for_cost *si;
3276 int j;
3277 if (peel_iters_prologue)
3278 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3279 retval += record_stmt_cost (prologue_cost_vec,
3280 si->count * peel_iters_prologue,
3281 si->kind, si->stmt_info, si->misalign,
3282 vect_prologue);
3283 if (*peel_iters_epilogue)
3284 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3285 retval += record_stmt_cost (epilogue_cost_vec,
3286 si->count * *peel_iters_epilogue,
3287 si->kind, si->stmt_info, si->misalign,
3288 vect_epilogue);
3289
3290 return retval;
3291 }
3292
3293 /* Function vect_estimate_min_profitable_iters
3294
3295 Return the number of iterations required for the vector version of the
3296 loop to be profitable relative to the cost of the scalar version of the
3297 loop.
3298
3299 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3300 of iterations for vectorization. A value of -1 means loop vectorization
3301 is not profitable. This returned value may be used for a dynamic
3302 profitability check.
3303
3304 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3305 for static check against estimated number of iterations. */
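/* As used by vect_analyze_loop_costing above, *RET_MIN_PROFITABLE_NITERS
   feeds the runtime cost-model threshold TH, while
   *RET_MIN_PROFITABLE_ESTIMATE is additionally checked against the estimated
   (or likely maximum) statement execution count of the loop.  */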
3306
3307 static void
3308 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3309 int *ret_min_profitable_niters,
3310 int *ret_min_profitable_estimate)
3311 {
3312 int min_profitable_iters;
3313 int min_profitable_estimate;
3314 int peel_iters_prologue;
3315 int peel_iters_epilogue;
3316 unsigned vec_inside_cost = 0;
3317 int vec_outside_cost = 0;
3318 unsigned vec_prologue_cost = 0;
3319 unsigned vec_epilogue_cost = 0;
3320 int scalar_single_iter_cost = 0;
3321 int scalar_outside_cost = 0;
3322 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3323 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3324 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3325
3326 /* Cost model disabled. */
3327 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3328 {
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3331 *ret_min_profitable_niters = 0;
3332 *ret_min_profitable_estimate = 0;
3333 return;
3334 }
3335
3336 /* Requires loop versioning tests to handle misalignment. */
3337 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3338 {
3339 /* FIXME: Make cost depend on complexity of individual check. */
3340 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3341 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3342 vect_prologue);
3343 if (dump_enabled_p ())
3344 dump_printf (MSG_NOTE,
3345 "cost model: Adding cost of checks for loop "
3346 "versioning to treat misalignment.\n");
3347 }
3348
3349 /* Requires loop versioning with alias checks. */
3350 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3351 {
3352 /* FIXME: Make cost depend on complexity of individual check. */
3353 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3354 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3355 vect_prologue);
3356 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3357 if (len)
3358 /* Count LEN - 1 ANDs and LEN comparisons. */
3359 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3360 NULL, 0, vect_prologue);
3361 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3362 if (len)
3363 {
3364 /* Count LEN - 1 ANDs and LEN comparisons. */
3365 unsigned int nstmts = len * 2 - 1;
3366 /* +1 for each bias that needs adding. */
3367 for (unsigned int i = 0; i < len; ++i)
3368 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3369 nstmts += 1;
3370 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3371 NULL, 0, vect_prologue);
3372 }
3373 if (dump_enabled_p ())
3374 dump_printf (MSG_NOTE,
3375 "cost model: Adding cost of checks for loop "
3376 "versioning aliasing.\n");
3377 }
3378
3379 /* Requires loop versioning with niter checks. */
3380 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3381 {
3382 /* FIXME: Make cost depend on complexity of individual check. */
3383 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3384 vect_prologue);
3385 if (dump_enabled_p ())
3386 dump_printf (MSG_NOTE,
3387 "cost model: Adding cost of checks for loop "
3388 "versioning niters.\n");
3389 }
3390
3391 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3392 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3393 vect_prologue);
3394
3395 /* Count statements in scalar loop. Using this as scalar cost for a single
3396 iteration for now.
3397
3398 TODO: Add outer loop support.
3399
3400 TODO: Consider assigning different costs to different scalar
3401 statements. */
3402
3403 scalar_single_iter_cost
3404 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3405
3406 /* Add additional cost for the peeled instructions in prologue and epilogue
3407 loop. (For fully-masked loops there will be no peeling.)
3408
3409 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3410 at compile time, we assume it's vf/2 (the worst would be vf-1).
3411
3412 TODO: Build an expression that represents peel_iters for prologue and
3413 epilogue to be used in a run-time test. */
3414
3415 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3416 {
3417 peel_iters_prologue = 0;
3418 peel_iters_epilogue = 0;
3419
3420 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3421 {
3422 /* We need to peel exactly one iteration. */
3423 peel_iters_epilogue += 1;
3424 stmt_info_for_cost *si;
3425 int j;
3426 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3427 j, si)
3428 (void) add_stmt_cost (target_cost_data, si->count,
3429 si->kind, si->stmt_info, si->misalign,
3430 vect_epilogue);
3431 }
3432 }
3433 else if (npeel < 0)
3434 {
3435 peel_iters_prologue = assumed_vf / 2;
3436 if (dump_enabled_p ())
3437 dump_printf (MSG_NOTE, "cost model: "
3438 "prologue peel iters set to vf/2.\n");
3439
3440 /* If peeling for alignment is unknown, the loop bound of the main loop
3441 becomes unknown. */
3442 peel_iters_epilogue = assumed_vf / 2;
3443 if (dump_enabled_p ())
3444 dump_printf (MSG_NOTE, "cost model: "
3445 "epilogue peel iters set to vf/2 because "
3446 "peeling for alignment is unknown.\n");
3447
3448 /* If peeled iterations are unknown, count a taken branch and a not taken
3449 branch per peeled loop. Even if scalar loop iterations are known,
3450 vector iterations are not known since peeled prologue iterations are
3451 not known. Hence guards remain the same. */
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3453 NULL, 0, vect_prologue);
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3455 NULL, 0, vect_prologue);
3456 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3457 NULL, 0, vect_epilogue);
3458 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3459 NULL, 0, vect_epilogue);
3460 stmt_info_for_cost *si;
3461 int j;
3462 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3463 {
3464 (void) add_stmt_cost (target_cost_data,
3465 si->count * peel_iters_prologue,
3466 si->kind, si->stmt_info, si->misalign,
3467 vect_prologue);
3468 (void) add_stmt_cost (target_cost_data,
3469 si->count * peel_iters_epilogue,
3470 si->kind, si->stmt_info, si->misalign,
3471 vect_epilogue);
3472 }
3473 }
3474 else
3475 {
3476 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3477 stmt_info_for_cost *si;
3478 int j;
3479 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3480
3481 prologue_cost_vec.create (2);
3482 epilogue_cost_vec.create (2);
3483 peel_iters_prologue = npeel;
3484
3485 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3486 &peel_iters_epilogue,
3487 &LOOP_VINFO_SCALAR_ITERATION_COST
3488 (loop_vinfo),
3489 &prologue_cost_vec,
3490 &epilogue_cost_vec);
3491
3492 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3493 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3494 si->misalign, vect_prologue);
3495
3496 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3497 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3498 si->misalign, vect_epilogue);
3499
3500 prologue_cost_vec.release ();
3501 epilogue_cost_vec.release ();
3502 }
3503
3504 /* FORNOW: The scalar outside cost is incremented in one of the
3505 following ways:
3506
3507 1. The vectorizer checks for alignment and aliasing and generates
3508 a condition that allows dynamic vectorization. A cost model
3509 check is ANDED with the versioning condition. Hence scalar code
3510 path now has the added cost of the versioning check.
3511
3512 if (cost > th & versioning_check)
3513 jmp to vector code
3514
3515 Hence run-time scalar is incremented by not-taken branch cost.
3516
3517 2. The vectorizer then checks if a prologue is required. If the
3518 cost model check was not done before during versioning, it has to
3519 be done before the prologue check.
3520
3521 if (cost <= th)
3522 prologue = scalar_iters
3523 if (prologue == 0)
3524 jmp to vector code
3525 else
3526 execute prologue
3527 if (prologue == num_iters)
3528 go to exit
3529
3530 Hence the run-time scalar cost is incremented by a taken branch,
3531 plus a not-taken branch, plus a taken branch cost.
3532
3533 3. The vectorizer then checks if an epilogue is required. If the
3534 cost model check was not done before during the prologue check, it
3535 has to be done with the epilogue check.
3536
3537 if (prologue == 0)
3538 jmp to vector code
3539 else
3540 execute prologue
3541 if (prologue == num_iters)
3542 go to exit
3543 vector code:
3544 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3545 jmp to epilogue
3546
3547 Hence the run-time scalar cost should be incremented by 2 taken
3548 branches.
3549
3550 TODO: The back end may reorder the BBs differently and reverse
3551 conditions/branch directions. Change the estimates below to
3552 something more reasonable. */
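/* A brief tie-in with the code below (a summary, not an exact model): the
   versioning case adds one not-taken branch to the scalar path; the unknown
   prologue case (2) adds two taken branches plus one not-taken branch; and
   the remaining case (3) adds two taken branches.  */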
3553
3554 /* If the number of iterations is known and we do not do versioning, we can
3555 decide whether to vectorize at compile time. Hence the scalar version
3556 does not carry cost model guard costs. */
3557 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3558 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3559 {
3560 /* Cost model check occurs at versioning. */
3561 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3562 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3563 else
3564 {
3565 /* Cost model check occurs at prologue generation. */
3566 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3567 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3568 + vect_get_stmt_cost (cond_branch_not_taken);
3569 /* Cost model check occurs at epilogue generation. */
3570 else
3571 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3572 }
3573 }
3574
3575 /* Complete the target-specific cost calculations. */
3576 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3577 &vec_inside_cost, &vec_epilogue_cost);
3578
3579 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3580
3581 if (dump_enabled_p ())
3582 {
3583 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3584 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3585 vec_inside_cost);
3586 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3587 vec_prologue_cost);
3588 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3589 vec_epilogue_cost);
3590 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3591 scalar_single_iter_cost);
3592 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3593 scalar_outside_cost);
3594 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3595 vec_outside_cost);
3596 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3597 peel_iters_prologue);
3598 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3599 peel_iters_epilogue);
3600 }
3601
3602 /* Calculate number of iterations required to make the vector version
3603 profitable, relative to the loop bodies only. The following condition
3604 must hold true:
3605 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3606 where
3607 SIC = scalar iteration cost, VIC = vector iteration cost,
3608 VOC = vector outside cost, VF = vectorization factor,
3609 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
3610 SOC = scalar outside cost for run time cost model check. */
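/* For illustration (ignoring the integer rounding), rearranging the
   inequality above for NITERS gives
     niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)
   which matches the quotient computed below; the comparison that follows
   adjusts for the truncating division.  */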
3611
3612 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3613 {
3614 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3615 * assumed_vf
3616 - vec_inside_cost * peel_iters_prologue
3617 - vec_inside_cost * peel_iters_epilogue);
3618 if (min_profitable_iters <= 0)
3619 min_profitable_iters = 0;
3620 else
3621 {
3622 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3623 - vec_inside_cost);
3624
3625 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3626 <= (((int) vec_inside_cost * min_profitable_iters)
3627 + (((int) vec_outside_cost - scalar_outside_cost)
3628 * assumed_vf)))
3629 min_profitable_iters++;
3630 }
3631 }
3632 /* The vector version will never be profitable. */
3633 else
3634 {
3635 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3636 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3637 "vectorization did not happen for a simd loop");
3638
3639 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "cost model: the vector iteration cost = %d "
3642 "divided by the scalar iteration cost = %d "
3643 "is greater or equal to the vectorization factor = %d"
3644 ".\n",
3645 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3646 *ret_min_profitable_niters = -1;
3647 *ret_min_profitable_estimate = -1;
3648 return;
3649 }
3650
3651 if (dump_enabled_p ())
3652 dump_printf (MSG_NOTE,
3653 " Calculated minimum iters for profitability: %d\n",
3654 min_profitable_iters);
3655
3656 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3657 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3658 /* We want the vectorized loop to execute at least once. */
3659 min_profitable_iters = assumed_vf + peel_iters_prologue;
3660
3661 if (dump_enabled_p ())
3662 dump_printf_loc (MSG_NOTE, vect_location,
3663 " Runtime profitability threshold = %d\n",
3664 min_profitable_iters);
3665
3666 *ret_min_profitable_niters = min_profitable_iters;
3667
3668 /* Calculate number of iterations required to make the vector version
3669 profitable, relative to the loop bodies only.
3670
3671 The non-vectorized variant costs SIC * niters and must win over the vector
3672 variant on the expected loop trip count. The following condition must hold true:
3673 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
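/* For illustration, the rearrangement is the same as for the runtime
   threshold above, except that SOC is now paid on the vector side,
   giving roughly
     niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)
   which is the expression computed below.  */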
3674
3675 if (vec_outside_cost <= 0)
3676 min_profitable_estimate = 0;
3677 else
3678 {
3679 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3680 * assumed_vf
3681 - vec_inside_cost * peel_iters_prologue
3682 - vec_inside_cost * peel_iters_epilogue)
3683 / ((scalar_single_iter_cost * assumed_vf)
3684 - vec_inside_cost);
3685 }
3686 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 " Static estimate profitability threshold = %d\n",
3690 min_profitable_estimate);
3691
3692 *ret_min_profitable_estimate = min_profitable_estimate;
3693 }
3694
3695 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3696 vector elements (not bits) for a vector with NELT elements. */
3697 static void
3698 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3699 vec_perm_builder *sel)
3700 {
3701 /* The encoding is a single stepped pattern. Any wrap-around is handled
3702 by vec_perm_indices. */
3703 sel->new_vector (nelt, 1, 3);
3704 for (unsigned int i = 0; i < 3; i++)
3705 sel->quick_push (i + offset);
3706 }
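
/* For example (illustrative values): with OFFSET == 2 and NELT == 8 the
   encoded series is { 2, 3, 4 }, which vec_perm_indices extends to
   { 2, 3, 4, 5, 6, 7, 8, 9 }; indices 8 and 9 select elements from the
   second input of the two-input permutation used in the caller below.  */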
3707
3708 /* Checks whether the target supports whole-vector shifts for vectors of mode
3709 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3710 it supports vec_perm_const with masks for all necessary shift amounts. */
3711 static bool
3712 have_whole_vector_shift (machine_mode mode)
3713 {
3714 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3715 return true;
3716
3717 /* Variable-length vectors should be handled via the optab. */
3718 unsigned int nelt;
3719 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3720 return false;
3721
3722 vec_perm_builder sel;
3723 vec_perm_indices indices;
3724 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3725 {
3726 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3727 indices.new_vector (sel, 2, nelt);
3728 if (!can_vec_perm_const_p (mode, indices, false))
3729 return false;
3730 }
3731 return true;
3732 }
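
/* As a usage note: for NELT == 8 the loop above checks shift amounts 4, 2
   and 1, i.e. the offsets needed by the log2-style shift reduction whose
   cost is modelled in vect_model_reduction_cost below.  */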
3733
3734 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3735 functions. Design better to avoid maintenance issues. */
3736
3737 /* Function vect_model_reduction_cost.
3738
3739 Models cost for a reduction operation, including the vector ops
3740 generated within the strip-mine loop, the initial definition before
3741 the loop, and the epilogue code that must be generated. */
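/* Roughly, in the common (non-conditional) case this amounts to: NCOPIES
   vector statements in the loop body, one scalar_to_vec in the prologue for
   the initial definition, and a vector reduction plus a vec_to_scalar
   extract in the epilogue; the cases below refine this.  */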
3742
3743 static void
3744 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3745 int ncopies, stmt_vector_for_cost *cost_vec)
3746 {
3747 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3748 enum tree_code code;
3749 optab optab;
3750 tree vectype;
3751 machine_mode mode;
3752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3753 struct loop *loop = NULL;
3754
3755 if (loop_vinfo)
3756 loop = LOOP_VINFO_LOOP (loop_vinfo);
3757
3758 /* Condition reductions generate two reductions in the loop. */
3759 vect_reduction_type reduction_type
3760 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3761 if (reduction_type == COND_REDUCTION)
3762 ncopies *= 2;
3763
3764 vectype = STMT_VINFO_VECTYPE (stmt_info);
3765 mode = TYPE_MODE (vectype);
3766 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3767
3768 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3769
3770 if (reduction_type == EXTRACT_LAST_REDUCTION
3771 || reduction_type == FOLD_LEFT_REDUCTION)
3772 {
3773 /* No extra instructions needed in the prologue. */
3774 prologue_cost = 0;
3775
3776 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3777 /* Count one reduction-like operation per vector. */
3778 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3779 stmt_info, 0, vect_body);
3780 else
3781 {
3782 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3783 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3784 inside_cost = record_stmt_cost (cost_vec, nelements,
3785 vec_to_scalar, stmt_info, 0,
3786 vect_body);
3787 inside_cost += record_stmt_cost (cost_vec, nelements,
3788 scalar_stmt, stmt_info, 0,
3789 vect_body);
3790 }
3791 }
3792 else
3793 {
3794 /* Add in cost for initial definition.
3795 For cond reduction we have four vectors: initial index, step,
3796 initial result of the data reduction, initial value of the index
3797 reduction. */
3798 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3799 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3800 scalar_to_vec, stmt_info, 0,
3801 vect_prologue);
3802
3803 /* Cost of reduction op inside loop. */
3804 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3805 stmt_info, 0, vect_body);
3806 }
3807
3808 /* Determine cost of epilogue code.
3809
3810 We have a reduction operator that will reduce the vector in one statement.
3811 Also requires scalar extract. */
3812
3813 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3814 {
3815 if (reduc_fn != IFN_LAST)
3816 {
3817 if (reduction_type == COND_REDUCTION)
3818 {
3819 /* An EQ stmt and a COND_EXPR stmt. */
3820 epilogue_cost += record_stmt_cost (cost_vec, 2,
3821 vector_stmt, stmt_info, 0,
3822 vect_epilogue);
3823 /* Reduction of the max index and a reduction of the found
3824 values. */
3825 epilogue_cost += record_stmt_cost (cost_vec, 2,
3826 vec_to_scalar, stmt_info, 0,
3827 vect_epilogue);
3828 /* A broadcast of the max value. */
3829 epilogue_cost += record_stmt_cost (cost_vec, 1,
3830 scalar_to_vec, stmt_info, 0,
3831 vect_epilogue);
3832 }
3833 else
3834 {
3835 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3836 stmt_info, 0, vect_epilogue);
3837 epilogue_cost += record_stmt_cost (cost_vec, 1,
3838 vec_to_scalar, stmt_info, 0,
3839 vect_epilogue);
3840 }
3841 }
3842 else if (reduction_type == COND_REDUCTION)
3843 {
3844 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3845 /* Extraction of scalar elements. */
3846 epilogue_cost += record_stmt_cost (cost_vec,
3847 2 * estimated_nunits,
3848 vec_to_scalar, stmt_info, 0,
3849 vect_epilogue);
3850 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3851 epilogue_cost += record_stmt_cost (cost_vec,
3852 2 * estimated_nunits - 3,
3853 scalar_stmt, stmt_info, 0,
3854 vect_epilogue);
3855 }
3856 else if (reduction_type == EXTRACT_LAST_REDUCTION
3857 || reduction_type == FOLD_LEFT_REDUCTION)
3858 /* No extra instructions needed in the epilogue. */
3859 ;
3860 else
3861 {
3862 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3863 tree bitsize =
3864 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3865 int element_bitsize = tree_to_uhwi (bitsize);
3866 int nelements = vec_size_in_bits / element_bitsize;
3867
3868 if (code == COND_EXPR)
3869 code = MAX_EXPR;
3870
3871 optab = optab_for_tree_code (code, vectype, optab_default);
3872
3873 /* We have a whole vector shift available. */
3874 if (optab != unknown_optab
3875 && VECTOR_MODE_P (mode)
3876 && optab_handler (optab, mode) != CODE_FOR_nothing
3877 && have_whole_vector_shift (mode))
3878 {
3879 /* Final reduction via vector shifts and the reduction operator.
3880 Also requires scalar extract. */
3881 epilogue_cost += record_stmt_cost (cost_vec,
3882 exact_log2 (nelements) * 2,
3883 vector_stmt, stmt_info, 0,
3884 vect_epilogue);
3885 epilogue_cost += record_stmt_cost (cost_vec, 1,
3886 vec_to_scalar, stmt_info, 0,
3887 vect_epilogue);
3888 }
3889 else
3890 /* Use extracts and reduction op for final reduction. For N
3891 elements, we have N extracts and N-1 reduction ops. */
3892 epilogue_cost += record_stmt_cost (cost_vec,
3893 nelements + nelements - 1,
3894 vector_stmt, stmt_info, 0,
3895 vect_epilogue);
3896 }
3897 }
3898
3899 if (dump_enabled_p ())
3900 dump_printf (MSG_NOTE,
3901 "vect_model_reduction_cost: inside_cost = %d, "
3902 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3903 prologue_cost, epilogue_cost);
3904 }
3905
3906
3907 /* Function vect_model_induction_cost.
3908
3909 Models cost for induction operations. */
3910
3911 static void
3912 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3913 stmt_vector_for_cost *cost_vec)
3914 {
3915 unsigned inside_cost, prologue_cost;
3916
3917 if (PURE_SLP_STMT (stmt_info))
3918 return;
3919
3920 /* loop cost for vec_loop. */
3921 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3922 stmt_info, 0, vect_body);
3923
3924 /* prologue cost for vec_init and vec_step. */
3925 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3926 stmt_info, 0, vect_prologue);
3927
3928 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_NOTE, vect_location,
3930 "vect_model_induction_cost: inside_cost = %d, "
3931 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3932 }
3933
3934
3935
3936 /* Function get_initial_def_for_reduction
3937
3938 Input:
3939 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3940 INIT_VAL - the initial value of the reduction variable
3941
3942 Output:
3943 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3944 of the reduction (used for adjusting the epilog - see below).
3945 Return a vector variable, initialized according to the operation that
3946 STMT_VINFO performs. This vector will be used as the initial value
3947 of the vector of partial results.
3948
3949 Option1 (adjust in epilog): Initialize the vector as follows:
3950 add/bit or/xor: [0,0,...,0,0]
3951 mult/bit and: [1,1,...,1,1]
3952 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3953 and when necessary (e.g. add/mult case) let the caller know
3954 that it needs to adjust the result by init_val.
3955
3956 Option2: Initialize the vector as follows:
3957 add/bit or/xor: [init_val,0,0,...,0]
3958 mult/bit and: [init_val,1,1,...,1]
3959 min/max/cond_expr: [init_val,init_val,...,init_val]
3960 and no adjustments are needed.
3961
3962 For example, for the following code:
3963
3964 s = init_val;
3965 for (i=0;i<n;i++)
3966 s = s + a[i];
3967
3968 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3969 For a vector of 4 units, we want to return either [0,0,0,init_val],
3970 or [0,0,0,0] and let the caller know that it needs to adjust
3971 the result at the end by 'init_val'.
3972
3973 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3974 is not NULL, because the initialization vector is then simpler (the same
3975 element in all entries), and Option2 otherwise.
3976
3977 A cost model should help decide between these two schemes. */
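/* As a sketch of the variable-length handling below: when the number of
   vector elements is not a compile-time constant, Option2 is implemented by
   splatting the '0'/'1' value and then using CFN_VEC_SHL_INSERT to make
   INIT_VAL the first element.  */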
3978
3979 tree
3980 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3981 tree *adjustment_def)
3982 {
3983 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3985 tree scalar_type = TREE_TYPE (init_val);
3986 tree vectype = get_vectype_for_scalar_type (scalar_type);
3987 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3988 tree def_for_init;
3989 tree init_def;
3990 REAL_VALUE_TYPE real_init_val = dconst0;
3991 int int_init_val = 0;
3992 gimple_seq stmts = NULL;
3993
3994 gcc_assert (vectype);
3995
3996 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3997 || SCALAR_FLOAT_TYPE_P (scalar_type));
3998
3999 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4000 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4001
4002 vect_reduction_type reduction_type
4003 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4004
4005 switch (code)
4006 {
4007 case WIDEN_SUM_EXPR:
4008 case DOT_PROD_EXPR:
4009 case SAD_EXPR:
4010 case PLUS_EXPR:
4011 case MINUS_EXPR:
4012 case BIT_IOR_EXPR:
4013 case BIT_XOR_EXPR:
4014 case MULT_EXPR:
4015 case BIT_AND_EXPR:
4016 {
4017 /* ADJUSTMENT_DEF is NULL when called from
4018 vect_create_epilog_for_reduction to vectorize a double reduction. */
4019 if (adjustment_def)
4020 *adjustment_def = init_val;
4021
4022 if (code == MULT_EXPR)
4023 {
4024 real_init_val = dconst1;
4025 int_init_val = 1;
4026 }
4027
4028 if (code == BIT_AND_EXPR)
4029 int_init_val = -1;
4030
4031 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4032 def_for_init = build_real (scalar_type, real_init_val);
4033 else
4034 def_for_init = build_int_cst (scalar_type, int_init_val);
4035
4036 if (adjustment_def)
4037 /* Option1: the first element is '0' or '1' as well. */
4038 init_def = gimple_build_vector_from_val (&stmts, vectype,
4039 def_for_init);
4040 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4041 {
4042 /* Option2 (variable length): the first element is INIT_VAL. */
4043 init_def = gimple_build_vector_from_val (&stmts, vectype,
4044 def_for_init);
4045 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4046 vectype, init_def, init_val);
4047 }
4048 else
4049 {
4050 /* Option2: the first element is INIT_VAL. */
4051 tree_vector_builder elts (vectype, 1, 2);
4052 elts.quick_push (init_val);
4053 elts.quick_push (def_for_init);
4054 init_def = gimple_build_vector (&stmts, &elts);
4055 }
4056 }
4057 break;
4058
4059 case MIN_EXPR:
4060 case MAX_EXPR:
4061 case COND_EXPR:
4062 {
4063 if (adjustment_def)
4064 {
4065 *adjustment_def = NULL_TREE;
4066 if (reduction_type != COND_REDUCTION
4067 && reduction_type != EXTRACT_LAST_REDUCTION)
4068 {
4069 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4070 break;
4071 }
4072 }
4073 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4074 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4075 }
4076 break;
4077
4078 default:
4079 gcc_unreachable ();
4080 }
4081
4082 if (stmts)
4083 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4084 return init_def;
4085 }
4086
4087 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4088 NUMBER_OF_VECTORS is the number of vector defs to create.
4089 If NEUTRAL_OP is nonnull, introducing extra elements of that
4090 value will not change the result. */
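/* For illustration (hypothetical operands): for a two-statement SLP plus
   reduction with initial values a0 and b0, a neutral value of 0 and four
   elements per vector, a single initial vector would be built as
   { a0, b0, 0, 0 }, so that each scalar reduction sees its initial value
   exactly once.  */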
4091
4092 static void
4093 get_initial_defs_for_reduction (slp_tree slp_node,
4094 vec<tree> *vec_oprnds,
4095 unsigned int number_of_vectors,
4096 bool reduc_chain, tree neutral_op)
4097 {
4098 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4099 stmt_vec_info stmt_vinfo = stmts[0];
4100 unsigned HOST_WIDE_INT nunits;
4101 unsigned j, number_of_places_left_in_vector;
4102 tree vector_type;
4103 tree vop;
4104 int group_size = stmts.length ();
4105 unsigned int vec_num, i;
4106 unsigned number_of_copies = 1;
4107 vec<tree> voprnds;
4108 voprnds.create (number_of_vectors);
4109 struct loop *loop;
4110 auto_vec<tree, 16> permute_results;
4111
4112 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4113
4114 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4115
4116 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4117 gcc_assert (loop);
4118 edge pe = loop_preheader_edge (loop);
4119
4120 gcc_assert (!reduc_chain || neutral_op);
4121
4122 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4123 created vectors. It is greater than 1 if unrolling is performed.
4124
4125 For example, we have two scalar operands, s1 and s2 (e.g., group of
4126 strided accesses of size two), while NUNITS is four (i.e., four scalars
4127 of this type can be packed in a vector). The output vector will contain
4128 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4129 will be 2).
4130
4131 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4132 vectors containing the operands.
4133
4134 For example, NUNITS is four as before, and the group size is 8
4135 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4136 {s5, s6, s7, s8}. */
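
/* Continuing those examples with the computation below (and assuming one
   and two output vectors respectively): NUMBER_OF_COPIES is 4 * 1 / 2 = 2
   in the first case and 4 * 2 / 8 = 1 in the second.  */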
4137
4138 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4139 nunits = group_size;
4140
4141 number_of_copies = nunits * number_of_vectors / group_size;
4142
4143 number_of_places_left_in_vector = nunits;
4144 bool constant_p = true;
4145 tree_vector_builder elts (vector_type, nunits, 1);
4146 elts.quick_grow (nunits);
4147 for (j = 0; j < number_of_copies; j++)
4148 {
4149 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4150 {
4151 tree op;
4152 /* Get the def before the loop. In a reduction chain we have only
4153 one initial value. */
4154 if ((j != (number_of_copies - 1)
4155 || (reduc_chain && i != 0))
4156 && neutral_op)
4157 op = neutral_op;
4158 else
4159 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4160
4161 /* Create 'vect_ = {op0,op1,...,opn}'. */
4162 number_of_places_left_in_vector--;
4163 elts[number_of_places_left_in_vector] = op;
4164 if (!CONSTANT_CLASS_P (op))
4165 constant_p = false;
4166
4167 if (number_of_places_left_in_vector == 0)
4168 {
4169 gimple_seq ctor_seq = NULL;
4170 tree init;
4171 if (constant_p && !neutral_op
4172 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4173 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4174 /* Build the vector directly from ELTS. */
4175 init = gimple_build_vector (&ctor_seq, &elts);
4176 else if (neutral_op)
4177 {
4178 /* Build a vector of the neutral value and shift the
4179 other elements into place. */
4180 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4181 neutral_op);
4182 int k = nunits;
4183 while (k > 0 && elts[k - 1] == neutral_op)
4184 k -= 1;
4185 while (k > 0)
4186 {
4187 k -= 1;
4188 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4189 vector_type, init, elts[k]);
4190 }
4191 }
4192 else
4193 {
4194 /* First time round, duplicate ELTS to fill the
4195 required number of vectors, then cherry pick the
4196 appropriate result for each iteration. */
4197 if (vec_oprnds->is_empty ())
4198 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4199 number_of_vectors,
4200 permute_results);
4201 init = permute_results[number_of_vectors - j - 1];
4202 }
4203 if (ctor_seq != NULL)
4204 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4205 voprnds.quick_push (init);
4206
4207 number_of_places_left_in_vector = nunits;
4208 elts.new_vector (vector_type, nunits, 1);
4209 elts.quick_grow (nunits);
4210 constant_p = true;
4211 }
4212 }
4213 }
4214
4215 /* The vectors were created in reverse order, so push them onto VEC_OPRNDS
4216 back to front to restore the original order. */
4217 vec_num = voprnds.length ();
4218 for (j = vec_num; j != 0; j--)
4219 {
4220 vop = voprnds[j - 1];
4221 vec_oprnds->quick_push (vop);
4222 }
4223
4224 voprnds.release ();
4225
4226 /* If VF is greater than the unrolling factor needed for the SLP
4227 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4228 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4229 to replicate the vectors. */
4230 tree neutral_vec = NULL;
4231 while (number_of_vectors > vec_oprnds->length ())
4232 {
4233 if (neutral_op)
4234 {
4235 if (!neutral_vec)
4236 {
4237 gimple_seq ctor_seq = NULL;
4238 neutral_vec = gimple_build_vector_from_val
4239 (&ctor_seq, vector_type, neutral_op);
4240 if (ctor_seq != NULL)
4241 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4242 }
4243 vec_oprnds->quick_push (neutral_vec);
4244 }
4245 else
4246 {
4247 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4248 vec_oprnds->quick_push (vop);
4249 }
4250 }
4251 }
4252
4253
4254 /* Function vect_create_epilog_for_reduction
4255
4256 Create code at the loop-epilog to finalize the result of a reduction
4257 computation.
4258
4259 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4260 reduction statements.
4261 STMT_INFO is the scalar reduction stmt that is being vectorized.
4262 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4263 number of elements that we can fit in a vectype (nunits). In this case
4264 we have to generate more than one vector stmt, i.e., we need to "unroll"
4265 the vector stmt by a factor VF/nunits. For more details see documentation
4266 in vectorizable_operation.
4267 REDUC_FN is the internal function for the epilog reduction.
4268 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4269 computation.
4270 REDUC_INDEX is the index of the operand in the right hand side of the
4271 statement that is defined by REDUCTION_PHI.
4272 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4273 SLP_NODE is an SLP node containing a group of reduction statements. The
4274 first one in this group is STMT_INFO.
4275 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4276 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4277 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4278 any value of the IV in the loop.
4279 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4280 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4281 null if this is not an SLP reduction
4282
4283 This function:
4284 1. Creates the reduction def-use cycles: sets the arguments for
4285 REDUCTION_PHIS:
4286 The loop-entry argument is the vectorized initial-value of the reduction.
4287 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4288 sums.
4289 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4290 by calling the function specified by REDUC_FN if available, or by
4291 other means (whole-vector shifts or a scalar loop).
4292 The function also creates a new phi node at the loop exit to preserve
4293 loop-closed form, as illustrated below.
4294
4295 The flow at the entry to this function:
4296
4297 loop:
4298 vec_def = phi <null, null> # REDUCTION_PHI
4299 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4300 s_loop = scalar_stmt # (scalar) STMT_INFO
4301 loop_exit:
4302 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4303 use <s_out0>
4304 use <s_out0>
4305
4306 The above is transformed by this function into:
4307
4308 loop:
4309 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4310 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4311 s_loop = scalar_stmt # (scalar) STMT_INFO
4312 loop_exit:
4313 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4314 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4315 v_out2 = reduce <v_out1>
4316 s_out3 = extract_field <v_out2, 0>
4317 s_out4 = adjust_result <s_out3>
4318 use <s_out4>
4319 use <s_out4>
4320 */
4321
4322 static void
4323 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4324 stmt_vec_info stmt_info,
4325 gimple *reduc_def_stmt,
4326 int ncopies, internal_fn reduc_fn,
4327 vec<stmt_vec_info> reduction_phis,
4328 bool double_reduc,
4329 slp_tree slp_node,
4330 slp_instance slp_node_instance,
4331 tree induc_val, enum tree_code induc_code,
4332 tree neutral_op)
4333 {
4334 stmt_vec_info prev_phi_info;
4335 tree vectype;
4336 machine_mode mode;
4337 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4338 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4339 basic_block exit_bb;
4340 tree scalar_dest;
4341 tree scalar_type;
4342 gimple *new_phi = NULL, *phi;
4343 stmt_vec_info phi_info;
4344 gimple_stmt_iterator exit_gsi;
4345 tree vec_dest;
4346 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4347 gimple *epilog_stmt = NULL;
4348 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4349 gimple *exit_phi;
4350 tree bitsize;
4351 tree adjustment_def = NULL;
4352 tree vec_initial_def = NULL;
4353 tree expr, def, initial_def = NULL;
4354 tree orig_name, scalar_result;
4355 imm_use_iterator imm_iter, phi_imm_iter;
4356 use_operand_p use_p, phi_use_p;
4357 gimple *use_stmt;
4358 stmt_vec_info reduction_phi_info = NULL;
4359 bool nested_in_vect_loop = false;
4360 auto_vec<gimple *> new_phis;
4361 auto_vec<stmt_vec_info> inner_phis;
4362 int j, i;
4363 auto_vec<tree> scalar_results;
4364 unsigned int group_size = 1, k, ratio;
4365 auto_vec<tree> vec_initial_defs;
4366 auto_vec<gimple *> phis;
4367 bool slp_reduc = false;
4368 bool direct_slp_reduc;
4369 tree new_phi_result;
4370 stmt_vec_info inner_phi = NULL;
4371 tree induction_index = NULL_TREE;
4372
4373 if (slp_node)
4374 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4375
4376 if (nested_in_vect_loop_p (loop, stmt_info))
4377 {
4378 outer_loop = loop;
4379 loop = loop->inner;
4380 nested_in_vect_loop = true;
4381 gcc_assert (!slp_node);
4382 }
4383
4384 vectype = STMT_VINFO_VECTYPE (stmt_info);
4385 gcc_assert (vectype);
4386 mode = TYPE_MODE (vectype);
4387
4388 /* 1. Create the reduction def-use cycle:
4389 Set the arguments of REDUCTION_PHIS, i.e., transform
4390
4391 loop:
4392 vec_def = phi <null, null> # REDUCTION_PHI
4393 VECT_DEF = vector_stmt # vectorized form of STMT
4394 ...
4395
4396 into:
4397
4398 loop:
4399 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4400 VECT_DEF = vector_stmt # vectorized form of STMT
4401 ...
4402
4403 (in case of SLP, do it for all the phis). */
4404
4405 /* Get the loop-entry arguments. */
4406 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4407 if (slp_node)
4408 {
4409 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4410 vec_initial_defs.reserve (vec_num);
4411 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4412 &vec_initial_defs, vec_num,
4413 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4414 neutral_op);
4415 }
4416 else
4417 {
4418 /* Get at the scalar def before the loop, that defines the initial value
4419 of the reduction variable. */
4420 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4421 loop_preheader_edge (loop));
4422 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4423 and we can't use zero for induc_val, use initial_def. Similarly
4424 for REDUC_MIN and initial_def larger than the base. */
4425 if (TREE_CODE (initial_def) == INTEGER_CST
4426 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4427 == INTEGER_INDUC_COND_REDUCTION)
4428 && !integer_zerop (induc_val)
4429 && ((induc_code == MAX_EXPR
4430 && tree_int_cst_lt (initial_def, induc_val))
4431 || (induc_code == MIN_EXPR
4432 && tree_int_cst_lt (induc_val, initial_def))))
4433 induc_val = initial_def;
4434
4435 if (double_reduc)
4436 /* In case of double reduction we only create a vector variable
4437 to be put in the reduction phi node. The actual statement
4438 creation is done later in this function. */
4439 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4440 else if (nested_in_vect_loop)
4441 {
4442 /* Do not use an adjustment def as that case is not supported
4443 correctly if ncopies is not one. */
4444 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4445 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4446 stmt_info);
4447 }
4448 else
4449 vec_initial_def
4450 = get_initial_def_for_reduction (stmt_info, initial_def,
4451 &adjustment_def);
4452 vec_initial_defs.create (1);
4453 vec_initial_defs.quick_push (vec_initial_def);
4454 }
4455
4456 /* Set phi nodes arguments. */
4457 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4458 {
4459 tree vec_init_def = vec_initial_defs[i];
4460 tree def = vect_defs[i];
4461 for (j = 0; j < ncopies; j++)
4462 {
4463 if (j != 0)
4464 {
4465 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4466 if (nested_in_vect_loop)
4467 vec_init_def
4468 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4469 }
4470
4471 /* Set the loop-entry arg of the reduction-phi. */
4472
4473 gphi *phi = as_a <gphi *> (phi_info->stmt);
4474 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4475 == INTEGER_INDUC_COND_REDUCTION)
4476 {
4477 /* Initialise the reduction phi to zero. This prevents non-zero
4478 initial values from interfering with the reduction op. */
4479 gcc_assert (ncopies == 1);
4480 gcc_assert (i == 0);
4481
4482 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4483 tree induc_val_vec
4484 = build_vector_from_val (vec_init_def_type, induc_val);
4485
4486 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4487 UNKNOWN_LOCATION);
4488 }
4489 else
4490 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4491 UNKNOWN_LOCATION);
4492
4493 /* Set the loop-latch arg for the reduction-phi. */
4494 if (j > 0)
4495 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4496
4497 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4498
4499 if (dump_enabled_p ())
4500 dump_printf_loc (MSG_NOTE, vect_location,
4501 "transform reduction: created def-use cycle: %G%G",
4502 phi, SSA_NAME_DEF_STMT (def));
4503 }
4504 }
4505
4506 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4507 which is updated with the current index of the loop for every match of
4508 the original loop's cond_expr (VEC_STMT). This results in a vector
4509 containing the last time the condition passed for that vector lane.
4510 The first match will be a 1 to allow 0 to be used for non-matching
4511 indexes. If there are no matches at all then the vector will be all
4512 zeroes. */
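/* A small illustration (hypothetical values): with four lanes the index
   IV created below takes the values {1,2,3,4}, {5,6,7,8}, ... in successive
   vector iterations, so after the loop each lane of the new phi holds the
   1-based scalar iteration number of its last match, or 0 if the lane never
   matched.  */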
4513 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4514 {
4515 tree indx_before_incr, indx_after_incr;
4516 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4517
4518 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4519 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4520
4521 int scalar_precision
4522 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4523 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4524 tree cr_index_vector_type = build_vector_type
4525 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4526
4527 /* First we create a simple vector induction variable which starts
4528 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4529 vector size (STEP). */
4530
4531 /* Create a {1,2,3,...} vector. */
4532 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4533
4534 /* Create a vector of the step value. */
4535 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4536 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4537
4538 /* Create an induction variable. */
4539 gimple_stmt_iterator incr_gsi;
4540 bool insert_after;
4541 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4542 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4543 insert_after, &indx_before_incr, &indx_after_incr);
4544
4545 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4546 filled with zeros (VEC_ZERO). */
4547
4548 /* Create a vector of 0s. */
4549 tree zero = build_zero_cst (cr_index_scalar_type);
4550 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4551
4552 /* Create a vector phi node. */
4553 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4554 new_phi = create_phi_node (new_phi_tree, loop->header);
4555 loop_vinfo->add_stmt (new_phi);
4556 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4557 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4558
4559 /* Now take the condition from the loop's original cond_expr
4560 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4561 every match uses values from the induction variable
4562 (INDEX_BEFORE_INCR), otherwise values from the phi node
4563 (NEW_PHI_TREE).
4564 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4565 the new cond_expr (INDEX_COND_EXPR). */
4566
4567 /* Duplicate the condition from vec_stmt. */
4568 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4569
4570 /* Create a conditional, where the condition is taken from vec_stmt
4571 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4572 and the "else" value is the phi (NEW_PHI_TREE). */
4573 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4574 ccompare, indx_before_incr,
4575 new_phi_tree);
4576 induction_index = make_ssa_name (cr_index_vector_type);
4577 gimple *index_condition = gimple_build_assign (induction_index,
4578 index_cond_expr);
4579 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4580 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4581 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4582
4583 /* Update the phi with the vec cond. */
4584 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4585 loop_latch_edge (loop), UNKNOWN_LOCATION);
4586 }
4587
4588 /* 2. Create epilog code.
4589 The reduction epilog code operates across the elements of the vector
4590 of partial results computed by the vectorized loop.
4591 The reduction epilog code consists of:
4592
4593 step 1: compute the scalar result in a vector (v_out2)
4594 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4595 step 3: adjust the scalar result (s_out3) if needed.
4596
4597 Step 1 can be accomplished using one of the following three schemes:
4598 (scheme 1) using reduc_fn, if available.
4599 (scheme 2) using whole-vector shifts, if available.
4600 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4601 combined.
4602
4603 The overall epilog code looks like this:
4604
4605 s_out0 = phi <s_loop> # original EXIT_PHI
4606 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4607 v_out2 = reduce <v_out1> # step 1
4608 s_out3 = extract_field <v_out2, 0> # step 2
4609 s_out4 = adjust_result <s_out3> # step 3
4610
4611 (step 3 is optional, and steps 1 and 2 may be combined).
4612 Lastly, the uses of s_out0 are replaced by s_out4. */
4613
4614
4615 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4616 v_out1 = phi <VECT_DEF>
4617 Store them in NEW_PHIS. */
4618
4619 exit_bb = single_exit (loop)->dest;
4620 prev_phi_info = NULL;
4621 new_phis.create (vect_defs.length ());
4622 FOR_EACH_VEC_ELT (vect_defs, i, def)
4623 {
4624 for (j = 0; j < ncopies; j++)
4625 {
4626 tree new_def = copy_ssa_name (def);
4627 phi = create_phi_node (new_def, exit_bb);
4628 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4629 if (j == 0)
4630 new_phis.quick_push (phi);
4631 else
4632 {
4633 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4634 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4635 }
4636
4637 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4638 prev_phi_info = phi_info;
4639 }
4640 }
4641
4642 /* The epilogue is created for the outer-loop, i.e., for the loop being
4643 vectorized. Create exit phis for the outer loop. */
4644 if (double_reduc)
4645 {
4646 loop = outer_loop;
4647 exit_bb = single_exit (loop)->dest;
4648 inner_phis.create (vect_defs.length ());
4649 FOR_EACH_VEC_ELT (new_phis, i, phi)
4650 {
4651 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4652 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4653 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4654 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4655 PHI_RESULT (phi));
4656 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4657 inner_phis.quick_push (phi_info);
4658 new_phis[i] = outer_phi;
4659 while (STMT_VINFO_RELATED_STMT (phi_info))
4660 {
4661 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4662 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4663 outer_phi = create_phi_node (new_result, exit_bb);
4664 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4665 PHI_RESULT (phi_info->stmt));
4666 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4667 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4668 prev_phi_info = outer_phi_info;
4669 }
4670 }
4671 }
4672
4673 exit_gsi = gsi_after_labels (exit_bb);
4674
4675 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4676 (i.e. when reduc_fn is not available) and in the final adjustment
4677 code (if needed). Also get the original scalar reduction variable as
4678 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4679 represents a reduction pattern), the tree-code and scalar-def are
4680 taken from the original stmt that the pattern-stmt (STMT) replaces.
4681 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4682 are taken from STMT. */
4683
4684 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4685 if (orig_stmt_info != stmt_info)
4686 {
4687 /* Reduction pattern */
4688 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4689 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4690 }
4691
4692 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4693 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4694 partial results are added and not subtracted. */
4695 if (code == MINUS_EXPR)
4696 code = PLUS_EXPR;
4697
4698 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4699 scalar_type = TREE_TYPE (scalar_dest);
4700 scalar_results.create (group_size);
4701 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4702 bitsize = TYPE_SIZE (scalar_type);
4703
4704 /* In case this is a reduction in an inner-loop while vectorizing an outer
4705 loop - we don't need to extract a single scalar result at the end of the
4706 inner-loop (unless it is double reduction, i.e., the use of reduction is
4707 outside the outer-loop). The final vector of partial results will be used
4708 in the vectorized outer-loop, or reduced to a scalar result at the end of
4709 the outer-loop. */
4710 if (nested_in_vect_loop && !double_reduc)
4711 goto vect_finalize_reduction;
4712
4713 /* SLP reduction without reduction chain, e.g.,
4714 # a1 = phi <a2, a0>
4715 # b1 = phi <b2, b0>
4716 a2 = operation (a1)
4717 b2 = operation (b1) */
4718 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4719
4720 /* True if we should implement SLP_REDUC using native reduction operations
4721 instead of scalar operations. */
4722 direct_slp_reduc = (reduc_fn != IFN_LAST
4723 && slp_reduc
4724 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4725
4726 /* In case of reduction chain, e.g.,
4727 # a1 = phi <a3, a0>
4728 a2 = operation (a1)
4729 a3 = operation (a2),
4730
4731 we may end up with more than one vector result. Here we reduce them to
4732 one vector. */
4733 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4734 {
4735 tree first_vect = PHI_RESULT (new_phis[0]);
4736 gassign *new_vec_stmt = NULL;
4737 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4738 for (k = 1; k < new_phis.length (); k++)
4739 {
4740 gimple *next_phi = new_phis[k];
4741 tree second_vect = PHI_RESULT (next_phi);
4742 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4743 new_vec_stmt = gimple_build_assign (tem, code,
4744 first_vect, second_vect);
4745 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4746 first_vect = tem;
4747 }
4748
4749 new_phi_result = first_vect;
4750 if (new_vec_stmt)
4751 {
4752 new_phis.truncate (0);
4753 new_phis.safe_push (new_vec_stmt);
4754 }
4755 }
4756 /* Likewise if we couldn't use a single def-use cycle. */
4757 else if (ncopies > 1)
4758 {
4759 gcc_assert (new_phis.length () == 1);
4760 tree first_vect = PHI_RESULT (new_phis[0]);
4761 gassign *new_vec_stmt = NULL;
4762 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4763 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4764 for (int k = 1; k < ncopies; ++k)
4765 {
4766 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4767 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4768 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4769 new_vec_stmt = gimple_build_assign (tem, code,
4770 first_vect, second_vect);
4771 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4772 first_vect = tem;
4773 }
4774 new_phi_result = first_vect;
4775 new_phis.truncate (0);
4776 new_phis.safe_push (new_vec_stmt);
4777 }
4778 else
4779 new_phi_result = PHI_RESULT (new_phis[0]);
4780
4781 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4782 && reduc_fn != IFN_LAST)
4783 {
4784 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4785 various data values where the condition matched and another vector
4786 (INDUCTION_INDEX) containing all the indexes of those matches. We
4787 need to extract the last matching index (which will be the index with
4788 highest value) and use this to index into the data vector.
4789 For the case where there were no matches, the data vector will contain
4790 all default values and the index vector will be all zeros. */
4791
4792 /* Get various versions of the type of the vector of indexes. */
4793 tree index_vec_type = TREE_TYPE (induction_index);
4794 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4795 tree index_scalar_type = TREE_TYPE (index_vec_type);
4796 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4797 (index_vec_type);
4798
4799 /* Get an unsigned integer version of the type of the data vector. */
4800 int scalar_precision
4801 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4802 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4803 tree vectype_unsigned = build_vector_type
4804 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4805
4806 /* First we need to create a vector (ZERO_VEC) of zeros and another
4807 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4808 can create using a MAX reduction and then expanding.
4809 In the case where the loop never made any matches, the max index will
4810 be zero. */
4811
4812 /* Vector of {0, 0, 0,...}. */
4813 tree zero_vec = make_ssa_name (vectype);
4814 tree zero_vec_rhs = build_zero_cst (vectype);
4815 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4816 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4817
4818 /* Find maximum value from the vector of found indexes. */
4819 tree max_index = make_ssa_name (index_scalar_type);
4820 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4821 1, induction_index);
4822 gimple_call_set_lhs (max_index_stmt, max_index);
4823 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4824
4825 /* Vector of {max_index, max_index, max_index,...}. */
4826 tree max_index_vec = make_ssa_name (index_vec_type);
4827 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4828 max_index);
4829 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4830 max_index_vec_rhs);
4831 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4832
4833 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4834 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4835 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4836 otherwise. Only one value should match, resulting in a vector
4837 (VEC_COND) with one data value and the rest zeros.
4838 In the case where the loop never made any matches, every index will
4839 match, resulting in a vector with all data values (which will all be
4840 the default value). */
4841
4842 /* Compare the max index vector to the vector of found indexes to find
4843 the position of the max value. */
4844 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4845 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4846 induction_index,
4847 max_index_vec);
4848 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4849
4850 /* Use the compare to choose either values from the data vector or
4851 zero. */
4852 tree vec_cond = make_ssa_name (vectype);
4853 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4854 vec_compare, new_phi_result,
4855 zero_vec);
4856 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4857
4858 /* Finally we need to extract the data value from the vector (VEC_COND)
4859 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4860 reduction, but because this doesn't exist, we can use a MAX reduction
4861 instead. The data value might be signed or a float so we need to cast
4862 it first.
4863 In the case where the loop never made any matches, the data values are
4864 all identical, and so will reduce down correctly. */
4865
4866 /* Make the matched data values unsigned. */
4867 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4868 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4869 vec_cond);
4870 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4871 VIEW_CONVERT_EXPR,
4872 vec_cond_cast_rhs);
4873 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4874
4875 /* Reduce down to a scalar value. */
4876 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4877 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4878 1, vec_cond_cast);
4879 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4880 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4881
4882 /* Convert the reduced value back to the result type and set as the
4883 result. */
4884 gimple_seq stmts = NULL;
4885 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4886 data_reduc);
4887 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4888 scalar_results.safe_push (new_temp);
4889 }
4890 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4891 && reduc_fn == IFN_LAST)
4892 {
4893 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4894 idx = 0;
4895 idx_val = induction_index[0];
4896 val = data_reduc[0];
4897 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4898 if (induction_index[i] > idx_val)
4899 val = data_reduc[i], idx_val = induction_index[i];
4900 return val; */
4901
4902 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4903 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4904 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4905 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4906 /* Enforced by vectorizable_reduction, which ensures we have target
4907 support before allowing a conditional reduction on variable-length
4908 vectors. */
4909 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4910 tree idx_val = NULL_TREE, val = NULL_TREE;
4911 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4912 {
4913 tree old_idx_val = idx_val;
4914 tree old_val = val;
4915 idx_val = make_ssa_name (idx_eltype);
4916 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4917 build3 (BIT_FIELD_REF, idx_eltype,
4918 induction_index,
4919 bitsize_int (el_size),
4920 bitsize_int (off)));
4921 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4922 val = make_ssa_name (data_eltype);
4923 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4924 build3 (BIT_FIELD_REF,
4925 data_eltype,
4926 new_phi_result,
4927 bitsize_int (el_size),
4928 bitsize_int (off)));
4929 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4930 if (off != 0)
4931 {
4932 tree new_idx_val = idx_val;
4933 tree new_val = val;
4934 if (off != v_size - el_size)
4935 {
4936 new_idx_val = make_ssa_name (idx_eltype);
4937 epilog_stmt = gimple_build_assign (new_idx_val,
4938 MAX_EXPR, idx_val,
4939 old_idx_val);
4940 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4941 }
4942 new_val = make_ssa_name (data_eltype);
4943 epilog_stmt = gimple_build_assign (new_val,
4944 COND_EXPR,
4945 build2 (GT_EXPR,
4946 boolean_type_node,
4947 idx_val,
4948 old_idx_val),
4949 val, old_val);
4950 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4951 idx_val = new_idx_val;
4952 val = new_val;
4953 }
4954 }
4955 /* Convert the reduced value back to the result type and set as the
4956 result. */
4957 gimple_seq stmts = NULL;
4958 val = gimple_convert (&stmts, scalar_type, val);
4959 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4960 scalar_results.safe_push (val);
4961 }
4962
4963 /* 2.3 Create the reduction code, using one of the three schemes described
4964 above. In SLP we simply need to extract all the elements from the
4965 vector (without reducing them), so we use scalar shifts. */
4966 else if (reduc_fn != IFN_LAST && !slp_reduc)
4967 {
4968 tree tmp;
4969 tree vec_elem_type;
4970
4971 /* Case 1: Create:
4972 v_out2 = reduc_expr <v_out1> */
4973
4974 if (dump_enabled_p ())
4975 dump_printf_loc (MSG_NOTE, vect_location,
4976 "Reduce using direct vector reduction.\n");
4977
4978 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4979 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4980 {
4981 tree tmp_dest
4982 = vect_create_destination_var (scalar_dest, vec_elem_type);
4983 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4984 new_phi_result);
4985 gimple_set_lhs (epilog_stmt, tmp_dest);
4986 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4987 gimple_set_lhs (epilog_stmt, new_temp);
4988 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4989
4990 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4991 new_temp);
4992 }
4993 else
4994 {
4995 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4996 new_phi_result);
4997 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4998 }
4999
5000 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5001 gimple_set_lhs (epilog_stmt, new_temp);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5003
5004 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5005 == INTEGER_INDUC_COND_REDUCTION)
5006 && !operand_equal_p (initial_def, induc_val, 0))
5007 {
5008 /* Earlier we set the initial value to be a vector of induc_val
5009 values. Check the result and, if it is induc_val, replace it
5010 with the original initial value, unless induc_val is already
5011 the same as initial_def. */
5012 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5013 induc_val);
5014
5015 tmp = make_ssa_name (new_scalar_dest);
5016 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5017 initial_def, new_temp);
5018 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5019 new_temp = tmp;
5020 }
5021
5022 scalar_results.safe_push (new_temp);
5023 }
5024 else if (direct_slp_reduc)
5025 {
5026 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5027 with the elements for other SLP statements replaced with the
5028 neutral value. We can then do a normal reduction on each vector. */
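/* Illustration (hypothetical sizes): with group_size == 2 and four-element
   vectors, INDEX below becomes {0, 1, 2, 3} & 1 == {0, 1, 0, 1}, so the
   i == 0 result keeps the even lanes of NEW_PHI_RESULT and the i == 1
   result keeps the odd lanes, the remaining lanes being filled with the
   neutral (or initial scalar) value before each full-vector reduction.  */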
5029
5030 /* Enforced by vectorizable_reduction. */
5031 gcc_assert (new_phis.length () == 1);
5032 gcc_assert (pow2p_hwi (group_size));
5033
5034 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5035 vec<stmt_vec_info> orig_phis
5036 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5037 gimple_seq seq = NULL;
5038
5039 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5040 and the same element size as VECTYPE. */
5041 tree index = build_index_vector (vectype, 0, 1);
5042 tree index_type = TREE_TYPE (index);
5043 tree index_elt_type = TREE_TYPE (index_type);
5044 tree mask_type = build_same_sized_truth_vector_type (index_type);
5045
5046 /* Create a vector that, for each element, identifies which of
5047 the REDUC_GROUP_SIZE results should use it. */
5048 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5049 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5050 build_vector_from_val (index_type, index_mask));
5051
5052 /* Get a neutral vector value. This is simply a splat of the neutral
5053 scalar value if we have one, otherwise the initial scalar value
5054 is itself a neutral value. */
5055 tree vector_identity = NULL_TREE;
5056 if (neutral_op)
5057 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5058 neutral_op);
5059 for (unsigned int i = 0; i < group_size; ++i)
5060 {
5061 /* If there's no universal neutral value, we can use the
5062 initial scalar value from the original PHI. This is used
5063 for MIN and MAX reductions, for example. */
5064 if (!neutral_op)
5065 {
5066 tree scalar_value
5067 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5068 loop_preheader_edge (loop));
5069 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5070 scalar_value);
5071 }
5072
5073 /* Calculate the equivalent of:
5074
5075 sel[j] = (index[j] == i);
5076
5077 which selects the elements of NEW_PHI_RESULT that should
5078 be included in the result. */
5079 tree compare_val = build_int_cst (index_elt_type, i);
5080 compare_val = build_vector_from_val (index_type, compare_val);
5081 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5082 index, compare_val);
5083
5084 /* Calculate the equivalent of:
5085
5086 vec = sel ? new_phi_result : vector_identity;
5087
5088 VEC is now suitable for a full vector reduction. */
5089 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5090 sel, new_phi_result, vector_identity);
5091
5092 /* Do the reduction and convert it to the appropriate type. */
5093 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5094 TREE_TYPE (vectype), vec);
5095 scalar = gimple_convert (&seq, scalar_type, scalar);
5096 scalar_results.safe_push (scalar);
5097 }
5098 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5099 }
5100 else
5101 {
5102 bool reduce_with_shift;
5103 tree vec_temp;
5104
5105 /* COND reductions all do the final reduction with MAX_EXPR
5106 or MIN_EXPR. */
5107 if (code == COND_EXPR)
5108 {
5109 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5110 == INTEGER_INDUC_COND_REDUCTION)
5111 code = induc_code;
5112 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5113 == CONST_COND_REDUCTION)
5114 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5115 else
5116 code = MAX_EXPR;
5117 }
5118
5119 /* See if the target wants to do the final (shift) reduction
5120 in a vector mode of smaller size and first reduce upper/lower
5121 halves against each other. */
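/* For instance, the target hook may request reducing a 512-bit vector by
   first combining its 256-bit halves (and then the 128-bit halves) with
   CODE before the final reduction; the sizes here are only illustrative.  */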
5122 enum machine_mode mode1 = mode;
5123 tree vectype1 = vectype;
5124 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5125 unsigned sz1 = sz;
5126 if (!slp_reduc
5127 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5128 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5129
5130 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5131 reduce_with_shift = have_whole_vector_shift (mode1);
5132 if (!VECTOR_MODE_P (mode1))
5133 reduce_with_shift = false;
5134 else
5135 {
5136 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5137 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5138 reduce_with_shift = false;
5139 }
5140
5141 /* First reduce the vector to the desired size on which we should
5142 do the shift reduction, by combining upper and lower halves. */
5143 new_temp = new_phi_result;
5144 while (sz > sz1)
5145 {
5146 gcc_assert (!slp_reduc);
5147 sz /= 2;
5148 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5149
5150 /* The target has to make sure we support lowpart/highpart
5151 extraction, either via direct vector extract or through
5152 integer mode punning. */
5153 tree dst1, dst2;
5154 if (convert_optab_handler (vec_extract_optab,
5155 TYPE_MODE (TREE_TYPE (new_temp)),
5156 TYPE_MODE (vectype1))
5157 != CODE_FOR_nothing)
5158 {
5159 /* Extract sub-vectors directly once vec_extract becomes
5160 a conversion optab. */
5161 dst1 = make_ssa_name (vectype1);
5162 epilog_stmt
5163 = gimple_build_assign (dst1, BIT_FIELD_REF,
5164 build3 (BIT_FIELD_REF, vectype1,
5165 new_temp, TYPE_SIZE (vectype1),
5166 bitsize_int (0)));
5167 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5168 dst2 = make_ssa_name (vectype1);
5169 epilog_stmt
5170 = gimple_build_assign (dst2, BIT_FIELD_REF,
5171 build3 (BIT_FIELD_REF, vectype1,
5172 new_temp, TYPE_SIZE (vectype1),
5173 bitsize_int (sz * BITS_PER_UNIT)));
5174 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5175 }
5176 else
5177 {
5178 /* Extract via punning to an appropriately sized integer mode
5179 vector. */
5180 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5181 1);
5182 tree etype = build_vector_type (eltype, 2);
5183 gcc_assert (convert_optab_handler (vec_extract_optab,
5184 TYPE_MODE (etype),
5185 TYPE_MODE (eltype))
5186 != CODE_FOR_nothing);
5187 tree tem = make_ssa_name (etype);
5188 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5189 build1 (VIEW_CONVERT_EXPR,
5190 etype, new_temp));
5191 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5192 new_temp = tem;
5193 tem = make_ssa_name (eltype);
5194 epilog_stmt
5195 = gimple_build_assign (tem, BIT_FIELD_REF,
5196 build3 (BIT_FIELD_REF, eltype,
5197 new_temp, TYPE_SIZE (eltype),
5198 bitsize_int (0)));
5199 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5200 dst1 = make_ssa_name (vectype1);
5201 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5202 build1 (VIEW_CONVERT_EXPR,
5203 vectype1, tem));
5204 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5205 tem = make_ssa_name (eltype);
5206 epilog_stmt
5207 = gimple_build_assign (tem, BIT_FIELD_REF,
5208 build3 (BIT_FIELD_REF, eltype,
5209 new_temp, TYPE_SIZE (eltype),
5210 bitsize_int (sz * BITS_PER_UNIT)));
5211 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5212 dst2 = make_ssa_name (vectype1);
5213 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5214 build1 (VIEW_CONVERT_EXPR,
5215 vectype1, tem));
5216 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5217 }
5218
5219 new_temp = make_ssa_name (vectype1);
5220 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5221 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5222 }
5223
5224 if (reduce_with_shift && !slp_reduc)
5225 {
5226 int element_bitsize = tree_to_uhwi (bitsize);
5227 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5228 for variable-length vectors and also requires direct target support
5229 for loop reductions. */
5230 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5231 int nelements = vec_size_in_bits / element_bitsize;
5232 vec_perm_builder sel;
5233 vec_perm_indices indices;
5234
5235 int elt_offset;
5236
5237 tree zero_vec = build_zero_cst (vectype1);
5238 /* Case 2: Create:
5239 for (offset = nelements/2; offset >= 1; offset/=2)
5240 {
5241 Create: va' = vec_shift <va, offset>
5242 Create: va = vop <va, va'>
5243 } */
5244
5245 tree rhs;
5246
5247 if (dump_enabled_p ())
5248 dump_printf_loc (MSG_NOTE, vect_location,
5249 "Reduce using vector shifts\n");
5250
5251 mode1 = TYPE_MODE (vectype1);
5252 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5253 for (elt_offset = nelements / 2;
5254 elt_offset >= 1;
5255 elt_offset /= 2)
5256 {
5257 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5258 indices.new_vector (sel, 2, nelements);
5259 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5260 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5261 new_temp, zero_vec, mask);
5262 new_name = make_ssa_name (vec_dest, epilog_stmt);
5263 gimple_assign_set_lhs (epilog_stmt, new_name);
5264 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5265
5266 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5267 new_temp);
5268 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5269 gimple_assign_set_lhs (epilog_stmt, new_temp);
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 }
5272
5273 /* 2.4 Extract the final scalar result. Create:
5274 s_out3 = extract_field <v_out2, bitpos> */
5275
5276 if (dump_enabled_p ())
5277 dump_printf_loc (MSG_NOTE, vect_location,
5278 "extract scalar result\n");
5279
5280 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5281 bitsize, bitsize_zero_node);
5282 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5283 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5284 gimple_assign_set_lhs (epilog_stmt, new_temp);
5285 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5286 scalar_results.safe_push (new_temp);
5287 }
5288 else
5289 {
5290 /* Case 3: Create:
5291 s = extract_field <v_out2, 0>
5292 for (offset = element_size;
5293 offset < vector_size;
5294 offset += element_size;)
5295 {
5296 Create: s' = extract_field <v_out2, offset>
5297 Create: s = op <s, s'> // For non SLP cases
5298 } */
5299
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_NOTE, vect_location,
5302 "Reduce using scalar code.\n");
5303
5304 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5305 int element_bitsize = tree_to_uhwi (bitsize);
5306 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5307 {
5308 int bit_offset;
5309 if (gimple_code (new_phi) == GIMPLE_PHI)
5310 vec_temp = PHI_RESULT (new_phi);
5311 else
5312 vec_temp = gimple_assign_lhs (new_phi);
5313 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5314 bitsize_zero_node);
5315 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5316 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5317 gimple_assign_set_lhs (epilog_stmt, new_temp);
5318 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5319
5320 /* In SLP we don't need to apply the reduction operation, so we
5321 just collect the s' values in SCALAR_RESULTS. */
5322 if (slp_reduc)
5323 scalar_results.safe_push (new_temp);
5324
5325 for (bit_offset = element_bitsize;
5326 bit_offset < vec_size_in_bits;
5327 bit_offset += element_bitsize)
5328 {
5329 tree bitpos = bitsize_int (bit_offset);
5330 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5331 bitsize, bitpos);
5332
5333 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5334 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5335 gimple_assign_set_lhs (epilog_stmt, new_name);
5336 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5337
5338 if (slp_reduc)
5339 {
5340 /* In SLP we don't need to apply the reduction operation,
5341 so we just collect the s' values in SCALAR_RESULTS. */
5342 new_temp = new_name;
5343 scalar_results.safe_push (new_name);
5344 }
5345 else
5346 {
5347 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5348 new_name, new_temp);
5349 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5350 gimple_assign_set_lhs (epilog_stmt, new_temp);
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352 }
5353 }
5354 }
5355
5356 /* The only case where we need to reduce scalar results in SLP is
5357 unrolling. If the size of SCALAR_RESULTS is greater than
5358 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5359 REDUC_GROUP_SIZE. */
5360 if (slp_reduc)
5361 {
5362 tree res, first_res, new_res;
5363 gimple *new_stmt;
5364
5365 /* Reduce multiple scalar results in case of SLP unrolling. */
5366 for (j = group_size; scalar_results.iterate (j, &res);
5367 j++)
5368 {
5369 first_res = scalar_results[j % group_size];
5370 new_stmt = gimple_build_assign (new_scalar_dest, code,
5371 first_res, res);
5372 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5373 gimple_assign_set_lhs (new_stmt, new_res);
5374 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5375 scalar_results[j % group_size] = new_res;
5376 }
5377 }
5378 else
5379 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5380 scalar_results.safe_push (new_temp);
5381 }
5382
5383 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5384 == INTEGER_INDUC_COND_REDUCTION)
5385 && !operand_equal_p (initial_def, induc_val, 0))
5386 {
5387 /* Earlier we set the initial value to be a vector of induc_val
5388 values. Check the result and, if it is induc_val, replace it
5389 with the original initial value, unless induc_val is already
5390 the same as initial_def. */
5391 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5392 induc_val);
5393
5394 tree tmp = make_ssa_name (new_scalar_dest);
5395 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5396 initial_def, new_temp);
5397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398 scalar_results[0] = tmp;
5399 }
5400 }
5401
5402 vect_finalize_reduction:
5403
5404 if (double_reduc)
5405 loop = loop->inner;
5406
5407 /* 2.5 Adjust the final result by the initial value of the reduction
5408 variable. (When such adjustment is not needed, then
5409 'adjustment_def' is zero). For example, if code is PLUS we create:
5410 new_temp = loop_exit_def + adjustment_def */
5411
5412 if (adjustment_def)
5413 {
5414 gcc_assert (!slp_reduc);
5415 if (nested_in_vect_loop)
5416 {
5417 new_phi = new_phis[0];
5418 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5419 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5420 new_dest = vect_create_destination_var (scalar_dest, vectype);
5421 }
5422 else
5423 {
5424 new_temp = scalar_results[0];
5425 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5426 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5427 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5428 }
5429
5430 epilog_stmt = gimple_build_assign (new_dest, expr);
5431 new_temp = make_ssa_name (new_dest, epilog_stmt);
5432 gimple_assign_set_lhs (epilog_stmt, new_temp);
5433 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5434 if (nested_in_vect_loop)
5435 {
5436 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5437 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5438 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5439
5440 if (!double_reduc)
5441 scalar_results.quick_push (new_temp);
5442 else
5443 scalar_results[0] = new_temp;
5444 }
5445 else
5446 scalar_results[0] = new_temp;
5447
5448 new_phis[0] = epilog_stmt;
5449 }
5450
5451 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5452 phis with new adjusted scalar results, i.e., replace use <s_out0>
5453 with use <s_out4>.
5454
5455 Transform:
5456 loop_exit:
5457 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5458 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5459 v_out2 = reduce <v_out1>
5460 s_out3 = extract_field <v_out2, 0>
5461 s_out4 = adjust_result <s_out3>
5462 use <s_out0>
5463 use <s_out0>
5464
5465 into:
5466
5467 loop_exit:
5468 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5469 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5470 v_out2 = reduce <v_out1>
5471 s_out3 = extract_field <v_out2, 0>
5472 s_out4 = adjust_result <s_out3>
5473 use <s_out4>
5474 use <s_out4> */
5475
5476
5477 /* In an SLP reduction chain we reduce the vector results into one vector
5478 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5479 the LHS of the last stmt in the reduction chain, since we are looking
5480 for the loop exit phi node. */
5481 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5482 {
5483 stmt_vec_info dest_stmt_info
5484 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5485 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5486 group_size = 1;
5487 }
5488
5489 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS
5490 (in case REDUC_GROUP_SIZE is greater than the vectorization factor).
5491 Therefore, we need to match SCALAR_RESULTS with the corresponding
5492 statements. The first (REDUC_GROUP_SIZE / number of new vector stmts)
5493 scalar results correspond to the first vector stmt, etc.
5494 (RATIO is equal to REDUC_GROUP_SIZE / number of new vector stmts). */
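/* For instance (hypothetical counts): with REDUC_GROUP_SIZE == 4 and two
   vector stmts in NEW_PHIS, RATIO is 2, so k == 0 and k == 1 below use
   new_phis[0] while k == 2 and k == 3 use new_phis[1].  */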
5495 if (group_size > new_phis.length ())
5496 {
5497 ratio = group_size / new_phis.length ();
5498 gcc_assert (!(group_size % new_phis.length ()));
5499 }
5500 else
5501 ratio = 1;
5502
5503 stmt_vec_info epilog_stmt_info = NULL;
5504 for (k = 0; k < group_size; k++)
5505 {
5506 if (k % ratio == 0)
5507 {
5508 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5509 reduction_phi_info = reduction_phis[k / ratio];
5510 if (double_reduc)
5511 inner_phi = inner_phis[k / ratio];
5512 }
5513
5514 if (slp_reduc)
5515 {
5516 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5517
5518 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5519 /* SLP statements can't participate in patterns. */
5520 gcc_assert (!orig_stmt_info);
5521 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5522 }
5523
5524 phis.create (3);
5525 /* Find the loop-closed-use at the loop exit of the original scalar
5526 result. (The reduction result is expected to have two immediate uses -
5527 one at the latch block, and one at the loop exit). */
5528 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5529 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5530 && !is_gimple_debug (USE_STMT (use_p)))
5531 phis.safe_push (USE_STMT (use_p));
5532
5533 /* While we expect to have found an exit_phi because of loop-closed-SSA
5534 form, we can end up without one if the scalar cycle is dead. */
5535
5536 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5537 {
5538 if (outer_loop)
5539 {
5540 stmt_vec_info exit_phi_vinfo
5541 = loop_vinfo->lookup_stmt (exit_phi);
5542 gphi *vect_phi;
5543
5544 /* FORNOW. We do not yet support the case in which an inner-loop
5545 reduction is not used in the outer loop (but only outside the
5546 outer loop), unless it is a double reduction. */
5547 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5548 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5549 || double_reduc);
5550
5551 if (double_reduc)
5552 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5553 else
5554 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5555 if (!double_reduc
5556 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5557 != vect_double_reduction_def)
5558 continue;
5559
5560 /* Handle double reduction:
5561
5562 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5563 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5564 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5565 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5566
5567 At that point the regular reduction (stmt2 and stmt3) is
5568 already vectorized, as well as the exit phi node, stmt4.
5569 Here we vectorize the phi node of double reduction, stmt1, and
5570 update all relevant statements. */
5571
5572 /* Go through all the uses of s2 to find double reduction phi
5573 node, i.e., stmt1 above. */
5574 orig_name = PHI_RESULT (exit_phi);
5575 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5576 {
5577 stmt_vec_info use_stmt_vinfo;
5578 tree vect_phi_init, preheader_arg, vect_phi_res;
5579 basic_block bb = gimple_bb (use_stmt);
5580
5581 /* Check that USE_STMT is really a double reduction phi
5582 node. */
5583 if (gimple_code (use_stmt) != GIMPLE_PHI
5584 || gimple_phi_num_args (use_stmt) != 2
5585 || bb->loop_father != outer_loop)
5586 continue;
5587 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5588 if (!use_stmt_vinfo
5589 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5590 != vect_double_reduction_def)
5591 continue;
5592
5593 /* Create vector phi node for double reduction:
5594 vs1 = phi <vs0, vs2>
5595 vs1 was created previously in this function by a call to
5596 vect_get_vec_def_for_operand and is stored in
5597 vec_initial_def;
5598 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5599 vs0 is created here. */
5600
5601 /* Create vector phi node. */
5602 vect_phi = create_phi_node (vec_initial_def, bb);
5603 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5604
5605 /* Create vs0 - initial def of the double reduction phi. */
5606 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5607 loop_preheader_edge (outer_loop));
5608 vect_phi_init = get_initial_def_for_reduction
5609 (stmt_info, preheader_arg, NULL);
5610
5611 /* Update phi node arguments with vs0 and vs2. */
5612 add_phi_arg (vect_phi, vect_phi_init,
5613 loop_preheader_edge (outer_loop),
5614 UNKNOWN_LOCATION);
5615 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5616 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5617 if (dump_enabled_p ())
5618 dump_printf_loc (MSG_NOTE, vect_location,
5619 "created double reduction phi node: %G",
5620 vect_phi);
5621
5622 vect_phi_res = PHI_RESULT (vect_phi);
5623
5624 /* Replace the use, i.e., set the correct vs1 in the regular
5625 reduction phi node. FORNOW, NCOPIES is always 1, so the
5626 loop is redundant. */
5627 stmt_vec_info use_info = reduction_phi_info;
5628 for (j = 0; j < ncopies; j++)
5629 {
5630 edge pr_edge = loop_preheader_edge (loop);
5631 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5632 pr_edge->dest_idx, vect_phi_res);
5633 use_info = STMT_VINFO_RELATED_STMT (use_info);
5634 }
5635 }
5636 }
5637 }
5638
5639 phis.release ();
5640 if (nested_in_vect_loop)
5641 {
5642 if (double_reduc)
5643 loop = outer_loop;
5644 else
5645 continue;
5646 }
5647
5648 phis.create (3);
5649 /* Find the loop-closed-use at the loop exit of the original scalar
5650 result. (The reduction result is expected to have two immediate uses,
5651 one at the latch block, and one at the loop exit). For double
5652 reductions we are looking for exit phis of the outer loop. */
5653 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5654 {
5655 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5656 {
5657 if (!is_gimple_debug (USE_STMT (use_p)))
5658 phis.safe_push (USE_STMT (use_p));
5659 }
5660 else
5661 {
5662 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5663 {
5664 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5665
5666 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5667 {
5668 if (!flow_bb_inside_loop_p (loop,
5669 gimple_bb (USE_STMT (phi_use_p)))
5670 && !is_gimple_debug (USE_STMT (phi_use_p)))
5671 phis.safe_push (USE_STMT (phi_use_p));
5672 }
5673 }
5674 }
5675 }
5676
5677 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5678 {
5679 /* Replace the uses: */
5680 orig_name = PHI_RESULT (exit_phi);
5681 scalar_result = scalar_results[k];
5682 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5683 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5684 SET_USE (use_p, scalar_result);
5685 }
5686
5687 phis.release ();
5688 }
5689 }
5690
5691 /* Return a vector of type VECTYPE that is equal to the vector select
5692 operation "MASK ? VEC : IDENTITY". Insert the select statements
5693 before GSI. */
5694
5695 static tree
5696 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5697 tree vec, tree identity)
5698 {
5699 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5700 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5701 mask, vec, identity);
5702 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5703 return cond;
5704 }
5705
5706 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5707 order, starting with LHS. Insert the extraction statements before GSI and
5708 associate the new scalar SSA names with variable SCALAR_DEST.
5709 Return the SSA name for the result. */
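/* As a sketch, for a four-element VECTOR_RHS this expands to

     lhs = CODE (CODE (CODE (CODE (LHS, v[0]), v[1]), v[2]), v[3])

   with one BIT_FIELD_REF extraction per element, preserving the
   left-to-right (in-order) association.  */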
5710
5711 static tree
5712 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5713 tree_code code, tree lhs, tree vector_rhs)
5714 {
5715 tree vectype = TREE_TYPE (vector_rhs);
5716 tree scalar_type = TREE_TYPE (vectype);
5717 tree bitsize = TYPE_SIZE (scalar_type);
5718 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5719 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5720
5721 for (unsigned HOST_WIDE_INT bit_offset = 0;
5722 bit_offset < vec_size_in_bits;
5723 bit_offset += element_bitsize)
5724 {
5725 tree bitpos = bitsize_int (bit_offset);
5726 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5727 bitsize, bitpos);
5728
5729 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5730 rhs = make_ssa_name (scalar_dest, stmt);
5731 gimple_assign_set_lhs (stmt, rhs);
5732 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5733
5734 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5735 tree new_name = make_ssa_name (scalar_dest, stmt);
5736 gimple_assign_set_lhs (stmt, new_name);
5737 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5738 lhs = new_name;
5739 }
5740 return lhs;
5741 }
5742
5743 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5744 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5745 statement. CODE is the operation performed by STMT_INFO and OPS are
5746 its scalar operands. REDUC_INDEX is the index of the operand in
5747 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5748 implements in-order reduction, or IFN_LAST if we should open-code it.
5749 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5750 that should be used to control the operation in a fully-masked loop. */
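/* As a sketch: with reduc_fn == IFN_FOLD_LEFT_PLUS, each vector operand
   DEF0 is folded into the scalar accumulator as

     reduc_var = .FOLD_LEFT_PLUS (reduc_var, def0);

   and in a fully-masked loop the inactive lanes of DEF0 are first
   replaced with zero (the additive identity) via merge_with_identity.  */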
5751
5752 static bool
5753 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5754 gimple_stmt_iterator *gsi,
5755 stmt_vec_info *vec_stmt, slp_tree slp_node,
5756 gimple *reduc_def_stmt,
5757 tree_code code, internal_fn reduc_fn,
5758 tree ops[3], tree vectype_in,
5759 int reduc_index, vec_loop_masks *masks)
5760 {
5761 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5762 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5763 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5764 stmt_vec_info new_stmt_info = NULL;
5765
5766 int ncopies;
5767 if (slp_node)
5768 ncopies = 1;
5769 else
5770 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5771
5772 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5773 gcc_assert (ncopies == 1);
5774 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5775 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5776 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5777 == FOLD_LEFT_REDUCTION);
5778
5779 if (slp_node)
5780 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5781 TYPE_VECTOR_SUBPARTS (vectype_in)));
5782
5783 tree op0 = ops[1 - reduc_index];
5784
5785 int group_size = 1;
5786 stmt_vec_info scalar_dest_def_info;
5787 auto_vec<tree> vec_oprnds0;
5788 if (slp_node)
5789 {
5790 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5791 slp_node);
5792 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5793 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5794 }
5795 else
5796 {
5797 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5798 vec_oprnds0.create (1);
5799 vec_oprnds0.quick_push (loop_vec_def0);
5800 scalar_dest_def_info = stmt_info;
5801 }
5802
5803 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5804 tree scalar_type = TREE_TYPE (scalar_dest);
5805 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5806
5807 int vec_num = vec_oprnds0.length ();
5808 gcc_assert (vec_num == 1 || slp_node);
5809 tree vec_elem_type = TREE_TYPE (vectype_out);
5810 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5811
5812 tree vector_identity = NULL_TREE;
5813 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5814 vector_identity = build_zero_cst (vectype_out);
5815
5816 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5817 int i;
5818 tree def0;
5819 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5820 {
5821 gimple *new_stmt;
5822 tree mask = NULL_TREE;
5823 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5824 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5825
5826 /* Handle MINUS by adding the negative. */
5827 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5828 {
5829 tree negated = make_ssa_name (vectype_out);
5830 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5831 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5832 def0 = negated;
5833 }
5834
5835 if (mask)
5836 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5837 vector_identity);
5838
5839 /* On the first iteration the input is simply the scalar phi
5840 result, and for subsequent iterations it is the output of
5841 the preceding operation. */
5842 if (reduc_fn != IFN_LAST)
5843 {
5844 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5845 /* For chained SLP reductions the output of the previous reduction
5846 operation serves as the input of the next. For the final statement
5847 the output cannot be a temporary - we reuse the original
5848 scalar destination of the last statement. */
5849 if (i != vec_num - 1)
5850 {
5851 gimple_set_lhs (new_stmt, scalar_dest_var);
5852 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5853 gimple_set_lhs (new_stmt, reduc_var);
5854 }
5855 }
5856 else
5857 {
5858 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5859 reduc_var, def0);
5860 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5861 /* Remove the statement, so that we can use the same code paths
5862 as for statements that we've just created. */
5863 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5864 gsi_remove (&tmp_gsi, true);
5865 }
5866
5867 if (i == vec_num - 1)
5868 {
5869 gimple_set_lhs (new_stmt, scalar_dest);
5870 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5871 new_stmt);
5872 }
5873 else
5874 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5875 new_stmt, gsi);
5876
5877 if (slp_node)
5878 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5879 }
5880
5881 if (!slp_node)
5882 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5883
5884 return true;
5885 }
5886
5887 /* Function is_nonwrapping_integer_induction.
5888
5889 Check if the induction described by STMT_VINFO (which is part of loop
5890 LOOP) is guaranteed not to wrap, i.e. does not cause overflow. */
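/* A sketch of the bound check done below, assuming BASE and STEP are
   INTEGER_CSTs and NI is the maximum number of iterations reported by
   max_stmt_executions:

     max_loop_value = BASE + STEP * NI

   The induction is considered non-wrapping if neither the multiplication
   nor the addition overflows widest_int and the result fits in the
   precision of the PHI result type (or if overflow is undefined for that
   type).  */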
5891
5892 static bool
5893 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5894 {
5895 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5896 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5897 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5898 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5899 widest_int ni, max_loop_value, lhs_max;
5900 wi::overflow_type overflow = wi::OVF_NONE;
5901
5902 /* Make sure the loop is integer based. */
5903 if (TREE_CODE (base) != INTEGER_CST
5904 || TREE_CODE (step) != INTEGER_CST)
5905 return false;
5906
5907 /* Check that the max size of the loop will not wrap. */
5908
5909 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5910 return true;
5911
5912 if (! max_stmt_executions (loop, &ni))
5913 return false;
5914
5915 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5916 &overflow);
5917 if (overflow)
5918 return false;
5919
5920 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5921 TYPE_SIGN (lhs_type), &overflow);
5922 if (overflow)
5923 return false;
5924
5925 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5926 <= TYPE_PRECISION (lhs_type));
5927 }
5928
5929 /* Function vectorizable_reduction.
5930
5931 Check if STMT_INFO performs a reduction operation that can be vectorized.
5932 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5933 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5934 Return true if STMT_INFO is vectorizable in this way.
5935
5936 This function also handles reduction idioms (patterns) that have been
5937 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5938 may be of this form:
5939 X = pattern_expr (arg0, arg1, ..., X)
5940 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5941 sequence that had been detected and replaced by the pattern-stmt
5942 (STMT_INFO).
5943
5944 This function also handles reduction of condition expressions, for example:
5945 for (int i = 0; i < N; i++)
5946 if (a[i] < value)
5947 last = a[i];
5948 This is handled by vectorising the loop and creating an additional vector
5949 containing the loop indexes for which "a[i] < value" was true. In the
5950 function epilogue this is reduced to a single max value and then used to
5951 index into the vector of results.
5952
5953 In some cases of reduction patterns, the type of the reduction variable X is
5954 different than the type of the other arguments of STMT_INFO.
5955 In such cases, the vectype that is used when transforming STMT_INFO into
5956 a vector stmt is different than the vectype that is used to determine the
5957 vectorization factor, because it consists of a different number of elements
5958 than the actual number of elements that are being operated upon in parallel.
5959
5960 For example, consider an accumulation of shorts into an int accumulator.
5961 On some targets it's possible to vectorize this pattern operating on 8
5962 shorts at a time (hence, the vectype for purposes of determining the
5963 vectorization factor should be V8HI); on the other hand, the vectype that
5964 is used to create the vector form is actually V4SI (the type of the result).
5965
5966 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5967 indicates what is the actual level of parallelism (V8HI in the example), so
5968 that the right vectorization factor would be derived. This vectype
5969 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5970 be used to create the vectorized stmt. The right vectype for the vectorized
5971 stmt is obtained from the type of the result X:
5972 get_vectype_for_scalar_type (TREE_TYPE (X))
5973
5974 This means that, contrary to "regular" reductions (or "regular" stmts in
5975 general), the following equation:
5976 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5977 does *NOT* necessarily hold for reduction patterns. */
5978
5979 bool
5980 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5981 stmt_vec_info *vec_stmt, slp_tree slp_node,
5982 slp_instance slp_node_instance,
5983 stmt_vector_for_cost *cost_vec)
5984 {
5985 tree vec_dest;
5986 tree scalar_dest;
5987 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5988 tree vectype_in = NULL_TREE;
5989 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5990 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5991 enum tree_code code, orig_code;
5992 internal_fn reduc_fn;
5993 machine_mode vec_mode;
5994 int op_type;
5995 optab optab;
5996 tree new_temp = NULL_TREE;
5997 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5998 stmt_vec_info cond_stmt_vinfo = NULL;
5999 enum tree_code cond_reduc_op_code = ERROR_MARK;
6000 tree scalar_type;
6001 bool is_simple_use;
6002 int i;
6003 int ncopies;
6004 int epilog_copies;
6005 stmt_vec_info prev_stmt_info, prev_phi_info;
6006 bool single_defuse_cycle = false;
6007 stmt_vec_info new_stmt_info = NULL;
6008 int j;
6009 tree ops[3];
6010 enum vect_def_type dts[3];
6011 bool nested_cycle = false, found_nested_cycle_def = false;
6012 bool double_reduc = false;
6013 basic_block def_bb;
6014 struct loop * def_stmt_loop;
6015 tree def_arg;
6016 auto_vec<tree> vec_oprnds0;
6017 auto_vec<tree> vec_oprnds1;
6018 auto_vec<tree> vec_oprnds2;
6019 auto_vec<tree> vect_defs;
6020 auto_vec<stmt_vec_info> phis;
6021 int vec_num;
6022 tree def0, tem;
6023 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6024 tree cond_reduc_val = NULL_TREE;
6025
6026 /* Make sure it was already recognized as a reduction computation. */
6027 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6028 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6029 return false;
6030
6031 if (nested_in_vect_loop_p (loop, stmt_info))
6032 {
6033 loop = loop->inner;
6034 nested_cycle = true;
6035 }
6036
6037 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6038 gcc_assert (slp_node
6039 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6040
6041 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6042 {
6043 tree phi_result = gimple_phi_result (phi);
6044 /* Analysis is fully done on the reduction stmt invocation. */
6045 if (! vec_stmt)
6046 {
6047 if (slp_node)
6048 slp_node_instance->reduc_phis = slp_node;
6049
6050 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6051 return true;
6052 }
6053
6054 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6055 /* Leave the scalar phi in place. Note that checking
6056 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6057 for reductions involving a single statement. */
6058 return true;
6059
6060 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6061 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6062
6063 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6064 == EXTRACT_LAST_REDUCTION)
6065 /* Leave the scalar phi in place. */
6066 return true;
6067
6068 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6069 code = gimple_assign_rhs_code (reduc_stmt);
6070 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6071 {
6072 tree op = gimple_op (reduc_stmt, k);
6073 if (op == phi_result)
6074 continue;
6075 if (k == 1 && code == COND_EXPR)
6076 continue;
6077 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6078 gcc_assert (is_simple_use);
6079 if (dt == vect_constant_def || dt == vect_external_def)
6080 continue;
6081 if (!vectype_in
6082 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6083 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6084 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6085 break;
6086 }
6087 /* For a nested cycle we might end up with an operation like
6088 phi_result * phi_result. */
6089 if (!vectype_in)
6090 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6091 gcc_assert (vectype_in);
6092
6093 if (slp_node)
6094 ncopies = 1;
6095 else
6096 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6097
6098 stmt_vec_info use_stmt_info;
6099 if (ncopies > 1
6100 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6101 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6102 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6103 single_defuse_cycle = true;
6104
6105 /* Create the destination vector */
6106 scalar_dest = gimple_assign_lhs (reduc_stmt);
6107 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6108
6109 if (slp_node)
6110 /* The size vect_schedule_slp_instance computes is off for us. */
6111 vec_num = vect_get_num_vectors
6112 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6113 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6114 vectype_in);
6115 else
6116 vec_num = 1;
6117
6118 /* Generate the reduction PHIs upfront. */
6119 prev_phi_info = NULL;
6120 for (j = 0; j < ncopies; j++)
6121 {
6122 if (j == 0 || !single_defuse_cycle)
6123 {
6124 for (i = 0; i < vec_num; i++)
6125 {
6126 /* Create the reduction-phi that defines the reduction
6127 operand. */
6128 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6129 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6130
6131 if (slp_node)
6132 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6133 else
6134 {
6135 if (j == 0)
6136 STMT_VINFO_VEC_STMT (stmt_info)
6137 = *vec_stmt = new_phi_info;
6138 else
6139 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6140 prev_phi_info = new_phi_info;
6141 }
6142 }
6143 }
6144 }
6145
6146 return true;
6147 }
6148
6149 /* 1. Is vectorizable reduction? */
6150 /* Not supportable if the reduction variable is used in the loop, unless
6151 it's a reduction chain. */
6152 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6153 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6154 return false;
6155
6156 /* Reductions that are not used even in an enclosing outer loop
6157 are expected to be "live" (used outside the loop). */
6158 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6159 && !STMT_VINFO_LIVE_P (stmt_info))
6160 return false;
6161
6162 /* 2. Has this been recognized as a reduction pattern?
6163
6164 Check if STMT represents a pattern that has been recognized
6165 in earlier analysis stages. For stmts that represent a pattern,
6166 the STMT_VINFO_RELATED_STMT field records the last stmt in
6167 the original sequence that constitutes the pattern. */
6168
6169 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6170 if (orig_stmt_info)
6171 {
6172 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6173 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6174 }
6175
6176 /* 3. Check the operands of the operation. The first operands are defined
6177 inside the loop body. The last operand is the reduction variable,
6178 which is defined by the loop-header-phi. */
6179
6180 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6181
6182 /* Flatten RHS. */
6183 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6184 {
6185 case GIMPLE_BINARY_RHS:
6186 code = gimple_assign_rhs_code (stmt);
6187 op_type = TREE_CODE_LENGTH (code);
6188 gcc_assert (op_type == binary_op);
6189 ops[0] = gimple_assign_rhs1 (stmt);
6190 ops[1] = gimple_assign_rhs2 (stmt);
6191 break;
6192
6193 case GIMPLE_TERNARY_RHS:
6194 code = gimple_assign_rhs_code (stmt);
6195 op_type = TREE_CODE_LENGTH (code);
6196 gcc_assert (op_type == ternary_op);
6197 ops[0] = gimple_assign_rhs1 (stmt);
6198 ops[1] = gimple_assign_rhs2 (stmt);
6199 ops[2] = gimple_assign_rhs3 (stmt);
6200 break;
6201
6202 case GIMPLE_UNARY_RHS:
6203 return false;
6204
6205 default:
6206 gcc_unreachable ();
6207 }
6208
6209 if (code == COND_EXPR && slp_node)
6210 return false;
6211
6212 scalar_dest = gimple_assign_lhs (stmt);
6213 scalar_type = TREE_TYPE (scalar_dest);
6214 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6215 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6216 return false;
6217
6218 /* Do not try to vectorize bit-precision reductions. */
6219 if (!type_has_mode_precision_p (scalar_type))
6220 return false;
6221
6222 /* All operands but the last are expected to be defined in the loop.
6223 The last operand is the reduction variable. In case of a nested cycle
6224 this assumption does not hold: we use reduc_index to record the index
6225 of the reduction variable. */
6226 stmt_vec_info reduc_def_info;
6227 if (orig_stmt_info)
6228 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6229 else
6230 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6231 gcc_assert (reduc_def_info);
6232 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6233 tree reduc_def = PHI_RESULT (reduc_def_phi);
6234 int reduc_index = -1;
6235 for (i = 0; i < op_type; i++)
6236 {
6237 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6238 if (i == 0 && code == COND_EXPR)
6239 continue;
6240
6241 stmt_vec_info def_stmt_info;
6242 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6243 &def_stmt_info);
6244 dt = dts[i];
6245 gcc_assert (is_simple_use);
6246 if (dt == vect_reduction_def
6247 && ops[i] == reduc_def)
6248 {
6249 reduc_index = i;
6250 continue;
6251 }
6252 else if (tem)
6253 {
6254 /* To properly compute ncopies we are interested in the widest
6255 input type in case we're looking at a widening accumulation. */
6256 if (!vectype_in
6257 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6258 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6259 vectype_in = tem;
6260 }
6261
6262 if (dt != vect_internal_def
6263 && dt != vect_external_def
6264 && dt != vect_constant_def
6265 && dt != vect_induction_def
6266 && !(dt == vect_nested_cycle && nested_cycle))
6267 return false;
6268
6269 if (dt == vect_nested_cycle
6270 && ops[i] == reduc_def)
6271 {
6272 found_nested_cycle_def = true;
6273 reduc_index = i;
6274 }
6275
6276 if (i == 1 && code == COND_EXPR)
6277 {
6278 /* Record how value of COND_EXPR is defined. */
6279 if (dt == vect_constant_def)
6280 {
6281 cond_reduc_dt = dt;
6282 cond_reduc_val = ops[i];
6283 }
6284 if (dt == vect_induction_def
6285 && def_stmt_info
6286 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6287 {
6288 cond_reduc_dt = dt;
6289 cond_stmt_vinfo = def_stmt_info;
6290 }
6291 }
6292 }
6293
6294 if (!vectype_in)
6295 vectype_in = vectype_out;
6296
6297 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6298 directly used in the stmt. */
6299 if (reduc_index == -1)
6300 {
6301 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6302 {
6303 if (dump_enabled_p ())
6304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6305 "in-order reduction chain without SLP.\n");
6306 return false;
6307 }
6308 }
6309
6310 if (!(reduc_index == -1
6311 || dts[reduc_index] == vect_reduction_def
6312 || dts[reduc_index] == vect_nested_cycle
6313 || ((dts[reduc_index] == vect_internal_def
6314 || dts[reduc_index] == vect_external_def
6315 || dts[reduc_index] == vect_constant_def
6316 || dts[reduc_index] == vect_induction_def)
6317 && nested_cycle && found_nested_cycle_def)))
6318 {
6319 /* For pattern recognized stmts, orig_stmt might be a reduction,
6320 but some helper statements for the pattern might not, or
6321 might be COND_EXPRs with reduction uses in the condition. */
6322 gcc_assert (orig_stmt_info);
6323 return false;
6324 }
6325
6326 /* PHIs should not participate in patterns. */
6327 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6328 enum vect_reduction_type v_reduc_type
6329 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6330 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6331
6332 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6333 /* If we have a condition reduction, see if we can simplify it further. */
6334 if (v_reduc_type == COND_REDUCTION)
6335 {
6336 /* TODO: We can't yet handle reduction chains, since we need to treat
6337 each COND_EXPR in the chain specially, not just the last one.
6338 E.g. for:
6339
6340 x_1 = PHI <x_3, ...>
6341 x_2 = a_2 ? ... : x_1;
6342 x_3 = a_3 ? ... : x_2;
6343
6344 we're interested in the last element in x_3 for which a_2 || a_3
6345 is true, whereas the current reduction chain handling would
6346 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6347 as a reduction operation. */
6348 if (reduc_index == -1)
6349 {
6350 if (dump_enabled_p ())
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6352 "conditional reduction chains not supported\n");
6353 return false;
6354 }
6355
6356 /* vect_is_simple_reduction ensured that operand 2 is the
6357 loop-carried operand. */
6358 gcc_assert (reduc_index == 2);
6359
6360 /* Loop peeling modifies the initial value of the reduction PHI,
6361 which makes the reduction stmt to be transformed different from
6362 the original stmt analyzed. We need to record the reduction code
6363 for a CONST_COND_REDUCTION-type reduction at the analysis stage
6364 so that it can be used directly at the transform stage. */
6365 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6366 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6367 {
6368 /* Also set the reduction type to CONST_COND_REDUCTION. */
6369 gcc_assert (cond_reduc_dt == vect_constant_def);
6370 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6371 }
6372 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6373 vectype_in, OPTIMIZE_FOR_SPEED))
6374 {
6375 if (dump_enabled_p ())
6376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6377 "optimizing condition reduction with"
6378 " FOLD_EXTRACT_LAST.\n");
6379 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6380 }
6381 else if (cond_reduc_dt == vect_induction_def)
6382 {
6383 tree base
6384 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6385 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6386
6387 gcc_assert (TREE_CODE (base) == INTEGER_CST
6388 && TREE_CODE (step) == INTEGER_CST);
6389 cond_reduc_val = NULL_TREE;
6390 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6391 MIN_EXPR; for now punt if BASE is the minimum value of the type
6392 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
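/* For example (hypothetical values): with an increasing induction whose
   base is the positive constant 10, MAX_EXPR is chosen and cond_reduc_val
   becomes 0, a value known to be below the base.  */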
6393 if (tree_int_cst_sgn (step) == -1)
6394 {
6395 cond_reduc_op_code = MIN_EXPR;
6396 if (tree_int_cst_sgn (base) == -1)
6397 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6398 else if (tree_int_cst_lt (base,
6399 TYPE_MAX_VALUE (TREE_TYPE (base))))
6400 cond_reduc_val
6401 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6402 }
6403 else
6404 {
6405 cond_reduc_op_code = MAX_EXPR;
6406 if (tree_int_cst_sgn (base) == 1)
6407 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6408 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6409 base))
6410 cond_reduc_val
6411 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6412 }
6413 if (cond_reduc_val)
6414 {
6415 if (dump_enabled_p ())
6416 dump_printf_loc (MSG_NOTE, vect_location,
6417 "condition expression based on "
6418 "integer induction.\n");
6419 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6420 = INTEGER_INDUC_COND_REDUCTION;
6421 }
6422 }
6423 else if (cond_reduc_dt == vect_constant_def)
6424 {
6425 enum vect_def_type cond_initial_dt;
6426 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6427 tree cond_initial_val
6428 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6429
6430 gcc_assert (cond_reduc_val != NULL_TREE);
6431 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6432 if (cond_initial_dt == vect_constant_def
6433 && types_compatible_p (TREE_TYPE (cond_initial_val),
6434 TREE_TYPE (cond_reduc_val)))
6435 {
6436 tree e = fold_binary (LE_EXPR, boolean_type_node,
6437 cond_initial_val, cond_reduc_val);
6438 if (e && (integer_onep (e) || integer_zerop (e)))
6439 {
6440 if (dump_enabled_p ())
6441 dump_printf_loc (MSG_NOTE, vect_location,
6442 "condition expression based on "
6443 "compile time constant.\n");
6444 /* Record reduction code at analysis stage. */
6445 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6446 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6447 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6448 = CONST_COND_REDUCTION;
6449 }
6450 }
6451 }
6452 }
6453
6454 if (orig_stmt_info)
6455 gcc_assert (tmp == orig_stmt_info
6456 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6457 else
6458 /* We changed STMT to be the first stmt in the reduction chain, hence
6459 we check that in this case the first element in the chain is STMT. */
6460 gcc_assert (tmp == stmt_info
6461 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6462
6463 if (STMT_VINFO_LIVE_P (reduc_def_info))
6464 return false;
6465
6466 if (slp_node)
6467 ncopies = 1;
6468 else
6469 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6470
6471 gcc_assert (ncopies >= 1);
6472
6473 vec_mode = TYPE_MODE (vectype_in);
6474 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6475
6476 if (nested_cycle)
6477 {
6478 def_bb = gimple_bb (reduc_def_phi);
6479 def_stmt_loop = def_bb->loop_father;
6480 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6481 loop_preheader_edge (def_stmt_loop));
6482 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6483 if (def_arg_stmt_info
6484 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6485 == vect_double_reduction_def))
6486 double_reduc = true;
6487 }
6488
6489 vect_reduction_type reduction_type
6490 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6491 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6492 && ncopies > 1)
6493 {
6494 if (dump_enabled_p ())
6495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6496 "multiple types in double reduction or condition "
6497 "reduction.\n");
6498 return false;
6499 }
6500
6501 if (code == COND_EXPR)
6502 {
6503 /* Only call during the analysis stage, otherwise we'll lose
6504 STMT_VINFO_TYPE. */
6505 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6506 true, NULL, cost_vec))
6507 {
6508 if (dump_enabled_p ())
6509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6510 "unsupported condition in reduction\n");
6511 return false;
6512 }
6513 }
6514 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6515 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6516 {
6517 /* Only call during the analysis stage, otherwise we'll lose
6518 STMT_VINFO_TYPE. We only support this for nested cycles
6519 without double reductions at the moment. */
6520 if (!nested_cycle
6521 || double_reduc
6522 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6523 NULL, cost_vec)))
6524 {
6525 if (dump_enabled_p ())
6526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527 "unsupported shift or rotation in reduction\n");
6528 return false;
6529 }
6530 }
6531 else
6532 {
6533 /* 4. Supportable by target? */
6534
6535 /* 4.1. check support for the operation in the loop */
6536 optab = optab_for_tree_code (code, vectype_in, optab_default);
6537 if (!optab)
6538 {
6539 if (dump_enabled_p ())
6540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6541 "no optab.\n");
6542
6543 return false;
6544 }
6545
6546 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6547 {
6548 if (dump_enabled_p ())
6549 dump_printf (MSG_NOTE, "op not supported by target.\n");
6550
6551 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6552 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6553 return false;
6554
6555 if (dump_enabled_p ())
6556 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6557 }
6558
6559 /* Worthwhile without SIMD support? */
6560 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6561 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6562 {
6563 if (dump_enabled_p ())
6564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6565 "not worthwhile without SIMD support.\n");
6566
6567 return false;
6568 }
6569 }
6570
6571 /* 4.2. Check support for the epilog operation.
6572
6573 If STMT represents a reduction pattern, then the type of the
6574 reduction variable may be different than the type of the rest
6575 of the arguments. For example, consider the case of accumulation
6576 of shorts into an int accumulator. The original code:
6577 S1: int_a = (int) short_a;
6578 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6579
6580 was replaced with:
6581 STMT: int_acc = widen_sum <short_a, int_acc>
6582
6583 This means that:
6584 1. The tree-code that is used to create the vector operation in the
6585 epilog code (that reduces the partial results) is not the
6586 tree-code of STMT, but is rather the tree-code of the original
6587 stmt from the pattern that STMT is replacing. I.e., in the example
6588 above we want to use 'widen_sum' in the loop, but 'plus' in the
6589 epilog.
6590 2. The type (mode) we use to check available target support
6591 for the vector operation to be created in the *epilog*, is
6592 determined by the type of the reduction variable (in the example
6593 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6594 However the type (mode) we use to check available target support
6595 for the vector operation to be created *inside the loop*, is
6596 determined by the type of the other arguments to STMT (in the
6597 example we'd check this: optab_handler (widen_sum_optab,
6598 vect_short_mode)).
6599
6600 This is contrary to "regular" reductions, in which the types of all
6601 the arguments are the same as the type of the reduction variable.
6602 For "regular" reductions we can therefore use the same vector type
6603 (and also the same tree-code) when generating the epilog code and
6604 when generating the code inside the loop. */
6605
6606 if (orig_stmt_info
6607 && (reduction_type == TREE_CODE_REDUCTION
6608 || reduction_type == FOLD_LEFT_REDUCTION))
6609 {
6610 /* This is a reduction pattern: get the vectype from the type of the
6611 reduction variable, and get the tree-code from orig_stmt. */
6612 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6613 gcc_assert (vectype_out);
6614 vec_mode = TYPE_MODE (vectype_out);
6615 }
6616 else
6617 {
6618 /* Regular reduction: the same vectype and tree-code that are used for
6619 the vector code inside the loop can also be used for the epilog code. */
6620 orig_code = code;
6621
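/* For a MINUS_EXPR reduction the loop already accumulates the negated
   contributions, so the partial accumulators are combined with
   PLUS_EXPR in the epilog.  */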
6622 if (code == MINUS_EXPR)
6623 orig_code = PLUS_EXPR;
6624
6625 /* For simple condition reductions, replace with the actual expression
6626 we want to base our reduction around. */
6627 if (reduction_type == CONST_COND_REDUCTION)
6628 {
6629 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6630 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6631 }
6632 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6633 orig_code = cond_reduc_op_code;
6634 }
6635
6636 reduc_fn = IFN_LAST;
6637
6638 if (reduction_type == TREE_CODE_REDUCTION
6639 || reduction_type == FOLD_LEFT_REDUCTION
6640 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6641 || reduction_type == CONST_COND_REDUCTION)
6642 {
6643 if (reduction_type == FOLD_LEFT_REDUCTION
6644 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6645 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6646 {
6647 if (reduc_fn != IFN_LAST
6648 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6649 OPTIMIZE_FOR_SPEED))
6650 {
6651 if (dump_enabled_p ())
6652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6653 "reduc op not supported by target.\n");
6654
6655 reduc_fn = IFN_LAST;
6656 }
6657 }
6658 else
6659 {
6660 if (!nested_cycle || double_reduc)
6661 {
6662 if (dump_enabled_p ())
6663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6664 "no reduc code for scalar code.\n");
6665
6666 return false;
6667 }
6668 }
6669 }
6670 else if (reduction_type == COND_REDUCTION)
6671 {
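/* A COND_REDUCTION effectively keeps, in a parallel induction vector,
   the index of the last iteration in which the condition was true; if
   the target supports REDUC_MAX on that index vector, the epilog uses
   it to pick the result of the last matching iteration.  */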
6672 int scalar_precision
6673 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6674 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6675 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6676 nunits_out);
6677
6678 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6679 OPTIMIZE_FOR_SPEED))
6680 reduc_fn = IFN_REDUC_MAX;
6681 }
6682
6683 if (reduction_type != EXTRACT_LAST_REDUCTION
6684 && (!nested_cycle || double_reduc)
6685 && reduc_fn == IFN_LAST
6686 && !nunits_out.is_constant ())
6687 {
6688 if (dump_enabled_p ())
6689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6690 "missing target support for reduction on"
6691 " variable-length vectors.\n");
6692 return false;
6693 }
6694
6695 /* For SLP reductions, see if there is a neutral value we can use. */
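/* A neutral value is one that leaves the reduction result unchanged,
   e.g. 0 for PLUS_EXPR or 1 for MULT_EXPR; if one exists it can be
   used to pad the initial vector.  */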
6696 tree neutral_op = NULL_TREE;
6697 if (slp_node)
6698 neutral_op = neutral_op_for_slp_reduction
6699 (slp_node_instance->reduc_phis, code,
6700 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6701
6702 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6703 {
6704 /* We can't support in-order reductions of code such as this:
6705
6706 for (int i = 0; i < n1; ++i)
6707 for (int j = 0; j < n2; ++j)
6708 l += a[j];
6709
6710 since GCC effectively transforms the loop when vectorizing:
6711
6712 for (int i = 0; i < n1 / VF; ++i)
6713 for (int j = 0; j < n2; ++j)
6714 for (int k = 0; k < VF; ++k)
6715 l += a[j];
6716
6717 which is a reassociation of the original operation. */
6718 if (dump_enabled_p ())
6719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6720 "in-order double reduction not supported.\n");
6721
6722 return false;
6723 }
6724
6725 if (reduction_type == FOLD_LEFT_REDUCTION
6726 && slp_node
6727 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6728 {
6729 /* We cannot use in-order reductions in this case because there is
6730 an implicit reassociation of the operations involved. */
6731 if (dump_enabled_p ())
6732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6733 "in-order unchained SLP reductions not supported.\n");
6734 return false;
6735 }
6736
6737 /* For double reductions, and for SLP reductions with a neutral value,
6738 we construct a variable-length initial vector by loading a vector
6739 full of the neutral value and then shift-and-inserting the start
6740 values into the low-numbered elements. */
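/* E.g. with neutral value 0 and start value S, a single
   VEC_SHL_INSERT ({0, ..., 0}, S) produces {S, 0, ..., 0}, which is a
   valid starting accumulator whatever the runtime vector length.  */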
6741 if ((double_reduc || neutral_op)
6742 && !nunits_out.is_constant ()
6743 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6744 vectype_out, OPTIMIZE_FOR_SPEED))
6745 {
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6748 "reduction on variable-length vectors requires"
6749 " target support for a vector-shift-and-insert"
6750 " operation.\n");
6751 return false;
6752 }
6753
6754 /* Check extra constraints for variable-length unchained SLP reductions. */
6755 if (STMT_SLP_TYPE (stmt_info)
6756 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6757 && !nunits_out.is_constant ())
6758 {
6759 /* We checked above that we could build the initial vector when
6760 there's a neutral element value. Check here for the case in
6761 which each SLP statement has its own initial value and in which
6762 that value needs to be repeated for every instance of the
6763 statement within the initial vector. */
6764 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6765 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6766 if (!neutral_op
6767 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6768 {
6769 if (dump_enabled_p ())
6770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6771 "unsupported form of SLP reduction for"
6772 " variable-length vectors: cannot build"
6773 " initial vector.\n");
6774 return false;
6775 }
6776 /* The epilogue code relies on the number of elements being a multiple
6777 of the group size. The duplicate-and-interleave approach to setting
6778 up the initial vector does too. */
6779 if (!multiple_p (nunits_out, group_size))
6780 {
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 "unsupported form of SLP reduction for"
6784 " variable-length vectors: the vector size"
6785 " is not a multiple of the number of results.\n");
6786 return false;
6787 }
6788 }
6789
6790 /* In case of widening multiplication by a constant, we update the type
6791 of the constant to be the type of the other operand. We check that the
6792 constant fits the type in the pattern recognition pass. */
6793 if (code == DOT_PROD_EXPR
6794 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6795 {
6796 if (TREE_CODE (ops[0]) == INTEGER_CST)
6797 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6798 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6799 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6800 else
6801 {
6802 if (dump_enabled_p ())
6803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6804 "invalid types in dot-prod\n");
6805
6806 return false;
6807 }
6808 }
6809
6810 if (reduction_type == COND_REDUCTION)
6811 {
6812 widest_int ni;
6813
6814 if (! max_loop_iterations (loop, &ni))
6815 {
6816 if (dump_enabled_p ())
6817 dump_printf_loc (MSG_NOTE, vect_location,
6818 "loop count not known, cannot create cond "
6819 "reduction.\n");
6820 return false;
6821 }
6822 /* Convert backedges to iterations. */
6823 ni += 1;
6824
6825 /* The additional index will be the same type as the condition. Check
6826 that the loop iteration count fits into this type less one (the
6827 zero slot is reserved for the case in which there are no matches). */
6828 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6829 if (wi::geu_p (ni, wi::to_widest (max_index)))
6830 {
6831 if (dump_enabled_p ())
6832 dump_printf_loc (MSG_NOTE, vect_location,
6833 "loop size is greater than data size.\n");
6834 return false;
6835 }
6836 }
6837
6838 /* In case the vectorization factor (VF) is bigger than the number
6839 of elements that we can fit in a vectype (nunits), we have to generate
6840 more than one vector stmt - i.e., we need to "unroll" the
6841 vector stmt by a factor VF/nunits. For more details see documentation
6842 in vectorizable_operation. */
6843
6844 /* If the reduction is used in an outer loop we need to generate
6845 VF intermediate results, like so (e.g. for ncopies=2):
6846 r0 = phi (init, r0)
6847 r1 = phi (init, r1)
6848 r0 = x0 + r0;
6849 r1 = x1 + r1;
6850 (i.e. we generate VF results in 2 registers).
6851 In this case we have a separate def-use cycle for each copy, and therefore
6852 for each copy we get the vector def for the reduction variable from the
6853 respective phi node created for this copy.
6854
6855 Otherwise (the reduction is unused in the loop nest), we can combine
6856 together intermediate results, like so (e.g. for ncopies=2):
6857 r = phi (init, r)
6858 r = x0 + r;
6859 r = x1 + r;
6860 (i.e. we generate VF/2 results in a single register).
6861 In this case for each copy we get the vector def for the reduction variable
6862 from the vectorized reduction operation generated in the previous iteration.
6863
6864 This only works when we see both the reduction PHI and its only consumer
6865 in vectorizable_reduction and there are no intermediate stmts
6866 participating. */
6867 stmt_vec_info use_stmt_info;
6868 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6869 if (ncopies > 1
6870 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6871 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6872 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6873 {
6874 single_defuse_cycle = true;
6875 epilog_copies = 1;
6876 }
6877 else
6878 epilog_copies = ncopies;
6879
6880 /* If the reduction stmt is one of the patterns that have lane
6881 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6882 if ((ncopies > 1
6883 && ! single_defuse_cycle)
6884 && (code == DOT_PROD_EXPR
6885 || code == WIDEN_SUM_EXPR
6886 || code == SAD_EXPR))
6887 {
6888 if (dump_enabled_p ())
6889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6890 "multi def-use cycle not possible for lane-reducing "
6891 "reduction operation\n");
6892 return false;
6893 }
6894
6895 if (slp_node)
6896 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6897 else
6898 vec_num = 1;
6899
6900 internal_fn cond_fn = get_conditional_internal_fn (code);
6901 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6902
6903 if (!vec_stmt) /* transformation not required. */
6904 {
6905 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6906 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6907 {
6908 if (reduction_type != FOLD_LEFT_REDUCTION
6909 && (cond_fn == IFN_LAST
6910 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6911 OPTIMIZE_FOR_SPEED)))
6912 {
6913 if (dump_enabled_p ())
6914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6915 "can't use a fully-masked loop because no"
6916 " conditional operation is available.\n");
6917 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6918 }
6919 else if (reduc_index == -1)
6920 {
6921 if (dump_enabled_p ())
6922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6923 "can't use a fully-masked loop for chained"
6924 " reductions.\n");
6925 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6926 }
6927 else
6928 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6929 vectype_in);
6930 }
6931 if (dump_enabled_p ()
6932 && reduction_type == FOLD_LEFT_REDUCTION)
6933 dump_printf_loc (MSG_NOTE, vect_location,
6934 "using an in-order (fold-left) reduction.\n");
6935 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6936 return true;
6937 }
6938
6939 /* Transform. */
6940
6941 if (dump_enabled_p ())
6942 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6943
6944 /* FORNOW: Multiple types are not supported for condition. */
6945 if (code == COND_EXPR)
6946 gcc_assert (ncopies == 1);
6947
6948 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6949
6950 if (reduction_type == FOLD_LEFT_REDUCTION)
6951 return vectorize_fold_left_reduction
6952 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6953 reduc_fn, ops, vectype_in, reduc_index, masks);
6954
6955 if (reduction_type == EXTRACT_LAST_REDUCTION)
6956 {
6957 gcc_assert (!slp_node);
6958 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6959 true, NULL, NULL);
6960 }
6961
6962 /* Create the destination vector */
6963 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6964
6965 prev_stmt_info = NULL;
6966 prev_phi_info = NULL;
6967 if (!slp_node)
6968 {
6969 vec_oprnds0.create (1);
6970 vec_oprnds1.create (1);
6971 if (op_type == ternary_op)
6972 vec_oprnds2.create (1);
6973 }
6974
6975 phis.create (vec_num);
6976 vect_defs.create (vec_num);
6977 if (!slp_node)
6978 vect_defs.quick_push (NULL_TREE);
6979
6980 if (slp_node)
6981 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6982 else
6983 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6984
6985 for (j = 0; j < ncopies; j++)
6986 {
6987 if (code == COND_EXPR)
6988 {
6989 gcc_assert (!slp_node);
6990 vectorizable_condition (stmt_info, gsi, vec_stmt,
6991 true, NULL, NULL);
6992 break;
6993 }
6994 if (code == LSHIFT_EXPR
6995 || code == RSHIFT_EXPR)
6996 {
6997 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
6998 break;
6999 }
7000
7001 /* Handle uses. */
7002 if (j == 0)
7003 {
7004 if (slp_node)
7005 {
7006 /* Get vec defs for all the operands except the reduction index,
7007 ensuring the ordering of the ops in the vector is kept. */
7008 auto_vec<tree, 3> slp_ops;
7009 auto_vec<vec<tree>, 3> vec_defs;
7010
7011 slp_ops.quick_push (ops[0]);
7012 slp_ops.quick_push (ops[1]);
7013 if (op_type == ternary_op)
7014 slp_ops.quick_push (ops[2]);
7015
7016 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7017
7018 vec_oprnds0.safe_splice (vec_defs[0]);
7019 vec_defs[0].release ();
7020 vec_oprnds1.safe_splice (vec_defs[1]);
7021 vec_defs[1].release ();
7022 if (op_type == ternary_op)
7023 {
7024 vec_oprnds2.safe_splice (vec_defs[2]);
7025 vec_defs[2].release ();
7026 }
7027 }
7028 else
7029 {
7030 vec_oprnds0.quick_push
7031 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7032 vec_oprnds1.quick_push
7033 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7034 if (op_type == ternary_op)
7035 vec_oprnds2.quick_push
7036 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7037 }
7038 }
7039 else
7040 {
7041 if (!slp_node)
7042 {
7043 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7044
7045 if (single_defuse_cycle && reduc_index == 0)
7046 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7047 else
7048 vec_oprnds0[0]
7049 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7050 vec_oprnds0[0]);
7051 if (single_defuse_cycle && reduc_index == 1)
7052 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7053 else
7054 vec_oprnds1[0]
7055 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7056 vec_oprnds1[0]);
7057 if (op_type == ternary_op)
7058 {
7059 if (single_defuse_cycle && reduc_index == 2)
7060 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7061 else
7062 vec_oprnds2[0]
7063 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7064 vec_oprnds2[0]);
7065 }
7066 }
7067 }
7068
7069 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7070 {
7071 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7072 if (masked_loop_p)
7073 {
7074 /* Make sure that the reduction accumulator is vop[0]. */
7075 if (reduc_index == 1)
7076 {
7077 gcc_assert (commutative_tree_code (code));
7078 std::swap (vop[0], vop[1]);
7079 }
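/* Emit the conditional form of CODE, e.g. IFN_COND_ADD
   (MASK, VOP[0], VOP[1], VOP[0]): inactive lanes yield the last
   operand, so the accumulator passes through unchanged.  */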
7080 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7081 vectype_in, i * ncopies + j);
7082 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7083 vop[0], vop[1],
7084 vop[0]);
7085 new_temp = make_ssa_name (vec_dest, call);
7086 gimple_call_set_lhs (call, new_temp);
7087 gimple_call_set_nothrow (call, true);
7088 new_stmt_info
7089 = vect_finish_stmt_generation (stmt_info, call, gsi);
7090 }
7091 else
7092 {
7093 if (op_type == ternary_op)
7094 vop[2] = vec_oprnds2[i];
7095
7096 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7097 vop[0], vop[1], vop[2]);
7098 new_temp = make_ssa_name (vec_dest, new_stmt);
7099 gimple_assign_set_lhs (new_stmt, new_temp);
7100 new_stmt_info
7101 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7102 }
7103
7104 if (slp_node)
7105 {
7106 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7107 vect_defs.quick_push (new_temp);
7108 }
7109 else
7110 vect_defs[0] = new_temp;
7111 }
7112
7113 if (slp_node)
7114 continue;
7115
7116 if (j == 0)
7117 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7118 else
7119 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7120
7121 prev_stmt_info = new_stmt_info;
7122 }
7123
7124 /* Finalize the reduction-phi (set its arguments) and create the
7125 epilog reduction code. */
7126 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7127 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7128
7129 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7130 epilog_copies, reduc_fn, phis,
7131 double_reduc, slp_node, slp_node_instance,
7132 cond_reduc_val, cond_reduc_op_code,
7133 neutral_op);
7134
7135 return true;
7136 }
7137
7138 /* Function vect_min_worthwhile_factor.
7139
7140 For a loop where we could vectorize the operation indicated by CODE,
7141 return the minimum vectorization factor that makes it worthwhile
7142 to use generic vectors. */
7143 static unsigned int
7144 vect_min_worthwhile_factor (enum tree_code code)
7145 {
7146 switch (code)
7147 {
7148 case PLUS_EXPR:
7149 case MINUS_EXPR:
7150 case NEGATE_EXPR:
7151 return 4;
7152
7153 case BIT_AND_EXPR:
7154 case BIT_IOR_EXPR:
7155 case BIT_XOR_EXPR:
7156 case BIT_NOT_EXPR:
7157 return 2;
7158
7159 default:
7160 return INT_MAX;
7161 }
7162 }
7163
7164 /* Return true if VINFO indicates we are doing loop vectorization and if
7165 it is worth decomposing CODE operations into scalar operations for
7166 that loop's vectorization factor. */
7167
7168 bool
7169 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7170 {
7171 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7172 unsigned HOST_WIDE_INT value;
7173 return (loop_vinfo
7174 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7175 && value >= vect_min_worthwhile_factor (code));
7176 }
7177
7178 /* Function vectorizable_induction
7179
7180 Check if STMT_INFO performs an induction computation that can be vectorized.
7181 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7182 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7183 Return true if STMT_INFO is vectorizable in this way. */
7184
7185 bool
7186 vectorizable_induction (stmt_vec_info stmt_info,
7187 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7188 stmt_vec_info *vec_stmt, slp_tree slp_node,
7189 stmt_vector_for_cost *cost_vec)
7190 {
7191 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7192 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7193 unsigned ncopies;
7194 bool nested_in_vect_loop = false;
7195 struct loop *iv_loop;
7196 tree vec_def;
7197 edge pe = loop_preheader_edge (loop);
7198 basic_block new_bb;
7199 tree new_vec, vec_init, vec_step, t;
7200 tree new_name;
7201 gimple *new_stmt;
7202 gphi *induction_phi;
7203 tree induc_def, vec_dest;
7204 tree init_expr, step_expr;
7205 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7206 unsigned i;
7207 tree expr;
7208 gimple_seq stmts;
7209 imm_use_iterator imm_iter;
7210 use_operand_p use_p;
7211 gimple *exit_phi;
7212 edge latch_e;
7213 tree loop_arg;
7214 gimple_stmt_iterator si;
7215
7216 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7217 if (!phi)
7218 return false;
7219
7220 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7221 return false;
7222
7223 /* Make sure it was recognized as induction computation. */
7224 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7225 return false;
7226
7227 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7228 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7229
7230 if (slp_node)
7231 ncopies = 1;
7232 else
7233 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7234 gcc_assert (ncopies >= 1);
7235
7236 /* FORNOW. These restrictions should be relaxed. */
7237 if (nested_in_vect_loop_p (loop, stmt_info))
7238 {
7239 imm_use_iterator imm_iter;
7240 use_operand_p use_p;
7241 gimple *exit_phi;
7242 edge latch_e;
7243 tree loop_arg;
7244
7245 if (ncopies > 1)
7246 {
7247 if (dump_enabled_p ())
7248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7249 "multiple types in nested loop.\n");
7250 return false;
7251 }
7252
7253 /* FORNOW: outer loop induction with SLP not supported. */
7254 if (STMT_SLP_TYPE (stmt_info))
7255 return false;
7256
7257 exit_phi = NULL;
7258 latch_e = loop_latch_edge (loop->inner);
7259 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7260 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7261 {
7262 gimple *use_stmt = USE_STMT (use_p);
7263 if (is_gimple_debug (use_stmt))
7264 continue;
7265
7266 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7267 {
7268 exit_phi = use_stmt;
7269 break;
7270 }
7271 }
7272 if (exit_phi)
7273 {
7274 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7275 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7276 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7277 {
7278 if (dump_enabled_p ())
7279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7280 "inner-loop induction only used outside "
7281 "of the outer vectorized loop.\n");
7282 return false;
7283 }
7284 }
7285
7286 nested_in_vect_loop = true;
7287 iv_loop = loop->inner;
7288 }
7289 else
7290 iv_loop = loop;
7291 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7292
7293 if (slp_node && !nunits.is_constant ())
7294 {
7295 /* The current SLP code creates the initial value element-by-element. */
7296 if (dump_enabled_p ())
7297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7298 "SLP induction not supported for variable-length"
7299 " vectors.\n");
7300 return false;
7301 }
7302
7303 if (!vec_stmt) /* transformation not required. */
7304 {
7305 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7306 DUMP_VECT_SCOPE ("vectorizable_induction");
7307 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7308 return true;
7309 }
7310
7311 /* Transform. */
7312
7313 /* Compute a vector variable, initialized with the first VF values of
7314 the induction variable. E.g., for an iv with IV_PHI='X' and
7315 evolution S, for a vector of 4 units, we want to compute:
7316 [X, X + S, X + 2*S, X + 3*S]. */
7317
7318 if (dump_enabled_p ())
7319 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7320
7321 latch_e = loop_latch_edge (iv_loop);
7322 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7323
7324 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7325 gcc_assert (step_expr != NULL_TREE);
7326
7327 pe = loop_preheader_edge (iv_loop);
7328 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7329 loop_preheader_edge (iv_loop));
7330
7331 stmts = NULL;
7332 if (!nested_in_vect_loop)
7333 {
7334 /* Convert the initial value to the desired type. */
7335 tree new_type = TREE_TYPE (vectype);
7336 init_expr = gimple_convert (&stmts, new_type, init_expr);
7337
7338 /* If we are using the loop mask to "peel" for alignment then we need
7339 to adjust the start value here. */
7340 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7341 if (skip_niters != NULL_TREE)
7342 {
7343 if (FLOAT_TYPE_P (vectype))
7344 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7345 skip_niters);
7346 else
7347 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7348 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7349 skip_niters, step_expr);
7350 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7351 init_expr, skip_step);
7352 }
7353 }
7354
7355 /* Convert the step to the desired type. */
7356 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7357
7358 if (stmts)
7359 {
7360 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7361 gcc_assert (!new_bb);
7362 }
7363
7364 /* Find the first insertion point in the BB. */
7365 basic_block bb = gimple_bb (phi);
7366 si = gsi_after_labels (bb);
7367
7368 /* For SLP induction we have to generate several IVs as for example
7369 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7370 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7371 [VF*S, VF*S, VF*S, VF*S] for all. */
7372 if (slp_node)
7373 {
7374 /* Enforced above. */
7375 unsigned int const_nunits = nunits.to_constant ();
7376
7377 /* Generate [VF*S, VF*S, ... ]. */
7378 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7379 {
7380 expr = build_int_cst (integer_type_node, vf);
7381 expr = fold_convert (TREE_TYPE (step_expr), expr);
7382 }
7383 else
7384 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7385 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7386 expr, step_expr);
7387 if (! CONSTANT_CLASS_P (new_name))
7388 new_name = vect_init_vector (stmt_info, new_name,
7389 TREE_TYPE (step_expr), NULL);
7390 new_vec = build_vector_from_val (vectype, new_name);
7391 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7392
7393 /* Now generate the IVs. */
7394 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7395 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7396 unsigned elts = const_nunits * nvects;
7397 unsigned nivs = least_common_multiple (group_size,
7398 const_nunits) / const_nunits;
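/* E.g. for the group-size-3 example above with 4-element vectors,
   nivs = lcm (3, 4) / 4 = 3 distinct IVs; any remaining copies
   (nvects > nivs) are derived below by adding a multiple of the step
   to an earlier IV.  */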
7399 gcc_assert (elts % group_size == 0);
7400 tree elt = init_expr;
7401 unsigned ivn;
7402 for (ivn = 0; ivn < nivs; ++ivn)
7403 {
7404 tree_vector_builder elts (vectype, const_nunits, 1);
7405 stmts = NULL;
7406 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7407 {
7408 if (ivn*const_nunits + eltn >= group_size
7409 && (ivn * const_nunits + eltn) % group_size == 0)
7410 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7411 elt, step_expr);
7412 elts.quick_push (elt);
7413 }
7414 vec_init = gimple_build_vector (&stmts, &elts);
7415 if (stmts)
7416 {
7417 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7418 gcc_assert (!new_bb);
7419 }
7420
7421 /* Create the induction-phi that defines the induction-operand. */
7422 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7423 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7424 stmt_vec_info induction_phi_info
7425 = loop_vinfo->add_stmt (induction_phi);
7426 induc_def = PHI_RESULT (induction_phi);
7427
7428 /* Create the iv update inside the loop */
7429 vec_def = make_ssa_name (vec_dest);
7430 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7431 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7432 loop_vinfo->add_stmt (new_stmt);
7433
7434 /* Set the arguments of the phi node: */
7435 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7436 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7437 UNKNOWN_LOCATION);
7438
7439 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7440 }
7441
7442 /* Re-use IVs when we can. */
7443 if (ivn < nvects)
7444 {
7445 unsigned vfp
7446 = least_common_multiple (group_size, const_nunits) / group_size;
7447 /* Generate [VF'*S, VF'*S, ... ]. */
7448 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7449 {
7450 expr = build_int_cst (integer_type_node, vfp);
7451 expr = fold_convert (TREE_TYPE (step_expr), expr);
7452 }
7453 else
7454 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7455 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7456 expr, step_expr);
7457 if (! CONSTANT_CLASS_P (new_name))
7458 new_name = vect_init_vector (stmt_info, new_name,
7459 TREE_TYPE (step_expr), NULL);
7460 new_vec = build_vector_from_val (vectype, new_name);
7461 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7462 for (; ivn < nvects; ++ivn)
7463 {
7464 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7465 tree def;
7466 if (gimple_code (iv) == GIMPLE_PHI)
7467 def = gimple_phi_result (iv);
7468 else
7469 def = gimple_assign_lhs (iv);
7470 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7471 PLUS_EXPR,
7472 def, vec_step);
7473 if (gimple_code (iv) == GIMPLE_PHI)
7474 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7475 else
7476 {
7477 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7478 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7479 }
7480 SLP_TREE_VEC_STMTS (slp_node).quick_push
7481 (loop_vinfo->add_stmt (new_stmt));
7482 }
7483 }
7484
7485 return true;
7486 }
7487
7488 /* Create the vector that holds the initial_value of the induction. */
7489 if (nested_in_vect_loop)
7490 {
7491 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7492 been created during vectorization of previous stmts. We obtain it
7493 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7494 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7495 /* If the initial value is not of proper type, convert it. */
7496 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7497 {
7498 new_stmt
7499 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7500 vect_simple_var,
7501 "vec_iv_"),
7502 VIEW_CONVERT_EXPR,
7503 build1 (VIEW_CONVERT_EXPR, vectype,
7504 vec_init));
7505 vec_init = gimple_assign_lhs (new_stmt);
7506 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7507 new_stmt);
7508 gcc_assert (!new_bb);
7509 loop_vinfo->add_stmt (new_stmt);
7510 }
7511 }
7512 else
7513 {
7514 /* iv_loop is the loop to be vectorized. Create:
7515 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7516 stmts = NULL;
7517 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7518
7519 unsigned HOST_WIDE_INT const_nunits;
7520 if (nunits.is_constant (&const_nunits))
7521 {
7522 tree_vector_builder elts (vectype, const_nunits, 1);
7523 elts.quick_push (new_name);
7524 for (i = 1; i < const_nunits; i++)
7525 {
7526 /* Create: new_name_i = new_name + step_expr */
7527 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7528 new_name, step_expr);
7529 elts.quick_push (new_name);
7530 }
7531 /* Create a vector from [new_name_0, new_name_1, ...,
7532 new_name_nunits-1] */
7533 vec_init = gimple_build_vector (&stmts, &elts);
7534 }
7535 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7536 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7537 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7538 new_name, step_expr);
7539 else
7540 {
7541 /* Build:
7542 [base, base, base, ...]
7543 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7544 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7545 gcc_assert (flag_associative_math);
7546 tree index = build_index_vector (vectype, 0, 1);
7547 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7548 new_name);
7549 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7550 step_expr);
7551 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7552 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7553 vec_init, step_vec);
7554 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7555 vec_init, base_vec);
7556 }
7557
7558 if (stmts)
7559 {
7560 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7561 gcc_assert (!new_bb);
7562 }
7563 }
7564
7565
7566 /* Create the vector that holds the step of the induction. */
7567 if (nested_in_vect_loop)
7568 /* iv_loop is nested in the loop to be vectorized. Generate:
7569 vec_step = [S, S, S, S] */
7570 new_name = step_expr;
7571 else
7572 {
7573 /* iv_loop is the loop to be vectorized. Generate:
7574 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7575 gimple_seq seq = NULL;
7576 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7577 {
7578 expr = build_int_cst (integer_type_node, vf);
7579 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7580 }
7581 else
7582 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7583 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7584 expr, step_expr);
7585 if (seq)
7586 {
7587 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7588 gcc_assert (!new_bb);
7589 }
7590 }
7591
7592 t = unshare_expr (new_name);
7593 gcc_assert (CONSTANT_CLASS_P (new_name)
7594 || TREE_CODE (new_name) == SSA_NAME);
7595 new_vec = build_vector_from_val (vectype, t);
7596 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7597
7598
7599 /* Create the following def-use cycle:
7600 loop prolog:
7601 vec_init = ...
7602 vec_step = ...
7603 loop:
7604 vec_iv = PHI <vec_init, vec_loop>
7605 ...
7606 STMT
7607 ...
7608 vec_loop = vec_iv + vec_step; */
7609
7610 /* Create the induction-phi that defines the induction-operand. */
7611 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7612 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7613 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7614 induc_def = PHI_RESULT (induction_phi);
7615
7616 /* Create the iv update inside the loop */
7617 vec_def = make_ssa_name (vec_dest);
7618 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7619 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7620 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7621
7622 /* Set the arguments of the phi node: */
7623 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7624 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7625 UNKNOWN_LOCATION);
7626
7627 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7628
7629 /* In case the vectorization factor (VF) is bigger than the number
7630 of elements that we can fit in a vectype (nunits), we have to generate
7631 more than one vector stmt - i.e., we need to "unroll" the
7632 vector stmt by a factor VF/nunits. For more details see documentation
7633 in vectorizable_operation. */
7634
7635 if (ncopies > 1)
7636 {
7637 gimple_seq seq = NULL;
7638 stmt_vec_info prev_stmt_vinfo;
7639 /* FORNOW. This restriction should be relaxed. */
7640 gcc_assert (!nested_in_vect_loop);
7641
7642 /* Create the vector that holds the step of the induction. */
7643 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7644 {
7645 expr = build_int_cst (integer_type_node, nunits);
7646 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7647 }
7648 else
7649 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7650 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7651 expr, step_expr);
7652 if (seq)
7653 {
7654 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7655 gcc_assert (!new_bb);
7656 }
7657
7658 t = unshare_expr (new_name);
7659 gcc_assert (CONSTANT_CLASS_P (new_name)
7660 || TREE_CODE (new_name) == SSA_NAME);
7661 new_vec = build_vector_from_val (vectype, t);
7662 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7663
7664 vec_def = induc_def;
7665 prev_stmt_vinfo = induction_phi_info;
7666 for (i = 1; i < ncopies; i++)
7667 {
7668 /* vec_i = vec_prev + vec_step */
7669 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7670 vec_def, vec_step);
7671 vec_def = make_ssa_name (vec_dest, new_stmt);
7672 gimple_assign_set_lhs (new_stmt, vec_def);
7673
7674 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7675 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7676 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7677 prev_stmt_vinfo = new_stmt_info;
7678 }
7679 }
7680
7681 if (nested_in_vect_loop)
7682 {
7683 /* Find the loop-closed exit-phi of the induction, and record
7684 the final vector of induction results: */
7685 exit_phi = NULL;
7686 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7687 {
7688 gimple *use_stmt = USE_STMT (use_p);
7689 if (is_gimple_debug (use_stmt))
7690 continue;
7691
7692 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7693 {
7694 exit_phi = use_stmt;
7695 break;
7696 }
7697 }
7698 if (exit_phi)
7699 {
7700 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7701 /* FORNOW. Currently not supporting the case that an inner-loop induction
7702 is not used in the outer-loop (i.e. only outside the outer-loop). */
7703 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7704 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7705
7706 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7707 if (dump_enabled_p ())
7708 dump_printf_loc (MSG_NOTE, vect_location,
7709 "vector of inductions after inner-loop:%G",
7710 new_stmt);
7711 }
7712 }
7713
7714
7715 if (dump_enabled_p ())
7716 dump_printf_loc (MSG_NOTE, vect_location,
7717 "transform induction: created def-use cycle: %G%G",
7718 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7719
7720 return true;
7721 }
7722
7723 /* Function vectorizable_live_operation.
7724
7725 STMT_INFO computes a value that is used outside the loop. Check if
7726 it can be supported. */
7727
7728 bool
7729 vectorizable_live_operation (stmt_vec_info stmt_info,
7730 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7731 slp_tree slp_node, int slp_index,
7732 stmt_vec_info *vec_stmt,
7733 stmt_vector_for_cost *)
7734 {
7735 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7736 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7737 imm_use_iterator imm_iter;
7738 tree lhs, lhs_type, bitsize, vec_bitsize;
7739 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7740 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7741 int ncopies;
7742 gimple *use_stmt;
7743 auto_vec<tree> vec_oprnds;
7744 int vec_entry = 0;
7745 poly_uint64 vec_index = 0;
7746
7747 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7748
7749 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7750 return false;
7751
7752 /* FORNOW. CHECKME. */
7753 if (nested_in_vect_loop_p (loop, stmt_info))
7754 return false;
7755
7756 /* If STMT is not relevant and it is a simple assignment and its inputs are
7757 invariant then it can remain in place, unvectorized. The original last
7758 scalar value that it computes will be used. */
7759 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7760 {
7761 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7762 if (dump_enabled_p ())
7763 dump_printf_loc (MSG_NOTE, vect_location,
7764 "statement is simple and uses invariant. Leaving in "
7765 "place.\n");
7766 return true;
7767 }
7768
7769 if (slp_node)
7770 ncopies = 1;
7771 else
7772 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7773
7774 if (slp_node)
7775 {
7776 gcc_assert (slp_index >= 0);
7777
7778 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7779 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7780
7781 /* Get the last occurrence of the scalar index from the concatenation of
7782 all the slp vectors. Calculate which slp vector it is and the index
7783 within. */
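/* E.g. with num_vec = 2, nunits = 4 and num_scalar = 3, the final
   group of scalar results occupies lanes 5..7 of the 8-lane
   concatenation, so slp_index 0 maps to lane 5, i.e. vector 1,
   lane 1.  */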
7784 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7785
7786 /* Calculate which vector contains the result, and which lane of
7787 that vector we need. */
7788 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7789 {
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7792 "Cannot determine which vector holds the"
7793 " final result.\n");
7794 return false;
7795 }
7796 }
7797
7798 if (!vec_stmt)
7799 {
7800 /* No transformation required. */
7801 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7802 {
7803 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7804 OPTIMIZE_FOR_SPEED))
7805 {
7806 if (dump_enabled_p ())
7807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7808 "can't use a fully-masked loop because "
7809 "the target doesn't support extract last "
7810 "reduction.\n");
7811 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7812 }
7813 else if (slp_node)
7814 {
7815 if (dump_enabled_p ())
7816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7817 "can't use a fully-masked loop because an "
7818 "SLP statement is live after the loop.\n");
7819 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7820 }
7821 else if (ncopies > 1)
7822 {
7823 if (dump_enabled_p ())
7824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7825 "can't use a fully-masked loop because"
7826 " ncopies is greater than 1.\n");
7827 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7828 }
7829 else
7830 {
7831 gcc_assert (ncopies == 1 && !slp_node);
7832 vect_record_loop_mask (loop_vinfo,
7833 &LOOP_VINFO_MASKS (loop_vinfo),
7834 1, vectype);
7835 }
7836 }
7837 return true;
7838 }
7839
7840 /* Use the lhs of the original scalar statement. */
7841 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7842
7843 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7844 : gimple_get_lhs (stmt);
7845 lhs_type = TREE_TYPE (lhs);
7846
7847 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7848 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7849 : TYPE_SIZE (TREE_TYPE (vectype)));
7850 vec_bitsize = TYPE_SIZE (vectype);
7851
7852 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7853 tree vec_lhs, bitstart;
7854 if (slp_node)
7855 {
7856 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7857
7858 /* Get the correct slp vectorized stmt. */
7859 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7860 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7861 vec_lhs = gimple_phi_result (phi);
7862 else
7863 vec_lhs = gimple_get_lhs (vec_stmt);
7864
7865 /* Get entry to use. */
7866 bitstart = bitsize_int (vec_index);
7867 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7868 }
7869 else
7870 {
7871 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7872 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7873 gcc_checking_assert (ncopies == 1
7874 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7875
7876 /* For multiple copies, get the last copy. */
7877 for (int i = 1; i < ncopies; ++i)
7878 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7879
7880 /* Get the last lane in the vector. */
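/* E.g. for a 128-bit vector of 32-bit elements this is bit position
   128 - 32 = 96.  */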
7881 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7882 }
7883
7884 gimple_seq stmts = NULL;
7885 tree new_tree;
7886 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7887 {
7888 /* Emit:
7889
7890 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7891
7892 where VEC_LHS is the vectorized live-out result and MASK is
7893 the loop mask for the final iteration. */
7894 gcc_assert (ncopies == 1 && !slp_node);
7895 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7896 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7897 1, vectype, 0);
7898 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7899 scalar_type, mask, vec_lhs);
7900
7901 /* Convert the extracted vector element to the required scalar type. */
7902 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7903 }
7904 else
7905 {
7906 tree bftype = TREE_TYPE (vectype);
7907 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7908 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7909 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7910 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7911 &stmts, true, NULL_TREE);
7912 }
7913
7914 if (stmts)
7915 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7916
7917 /* Replace use of lhs with newly computed result. If the use stmt is a
7918 single arg PHI, just replace all uses of PHI result. It's necessary
7919 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7920 use_operand_p use_p;
7921 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7922 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7923 && !is_gimple_debug (use_stmt))
7924 {
7925 if (gimple_code (use_stmt) == GIMPLE_PHI
7926 && gimple_phi_num_args (use_stmt) == 1)
7927 {
7928 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7929 }
7930 else
7931 {
7932 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7933 SET_USE (use_p, new_tree);
7934 }
7935 update_stmt (use_stmt);
7936 }
7937
7938 return true;
7939 }
7940
7941 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7942
7943 static void
7944 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7945 {
7946 ssa_op_iter op_iter;
7947 imm_use_iterator imm_iter;
7948 def_operand_p def_p;
7949 gimple *ustmt;
7950
7951 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7952 {
7953 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7954 {
7955 basic_block bb;
7956
7957 if (!is_gimple_debug (ustmt))
7958 continue;
7959
7960 bb = gimple_bb (ustmt);
7961
7962 if (!flow_bb_inside_loop_p (loop, bb))
7963 {
7964 if (gimple_debug_bind_p (ustmt))
7965 {
7966 if (dump_enabled_p ())
7967 dump_printf_loc (MSG_NOTE, vect_location,
7968 "killing debug use\n");
7969
7970 gimple_debug_bind_reset_value (ustmt);
7971 update_stmt (ustmt);
7972 }
7973 else
7974 gcc_unreachable ();
7975 }
7976 }
7977 }
7978 }
7979
7980 /* Given loop represented by LOOP_VINFO, return true if computation of
7981 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7982 otherwise. */
7983
7984 static bool
7985 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7986 {
7987 /* Constant case. */
7988 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7989 {
7990 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7991 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7992
7993 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7994 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
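/* NITERS was computed as NITERSM1 + 1; if that addition wrapped,
   NITERS is zero and does not compare greater than NITERSM1, so the
   test below only succeeds when there was no overflow.  */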
7995 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7996 return true;
7997 }
7998
7999 widest_int max;
8000 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8001 /* Check the upper bound of loop niters. */
8002 if (get_max_loop_iterations (loop, &max))
8003 {
8004 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8005 signop sgn = TYPE_SIGN (type);
8006 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8007 if (max < type_max)
8008 return true;
8009 }
8010 return false;
8011 }
8012
8013 /* Return a mask type with half the number of elements as TYPE. */
8014
8015 tree
8016 vect_halve_mask_nunits (tree type)
8017 {
8018 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8019 return build_truth_vector_type (nunits, current_vector_size);
8020 }
8021
8022 /* Return a mask type with twice as many elements as TYPE. */
8023
8024 tree
8025 vect_double_mask_nunits (tree type)
8026 {
8027 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8028 return build_truth_vector_type (nunits, current_vector_size);
8029 }
8030
8031 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8032 contain a sequence of NVECTORS masks that each control a vector of type
8033 VECTYPE. */
8034
8035 void
8036 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8037 unsigned int nvectors, tree vectype)
8038 {
8039 gcc_assert (nvectors != 0);
8040 if (masks->length () < nvectors)
8041 masks->safe_grow_cleared (nvectors);
8042 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8043 /* The number of scalars per iteration and the number of vectors are
8044 both compile-time constants. */
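/* E.g. two vectors of eight elements with a vectorization factor of 16
   give 2 * 8 / 16 = 1 scalar per iteration for this rgroup.  */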
8045 unsigned int nscalars_per_iter
8046 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8047 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8048 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8049 {
8050 rgm->max_nscalars_per_iter = nscalars_per_iter;
8051 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8052 }
8053 }
8054
8055 /* Given a complete set of masks MASKS, extract mask number INDEX
8056 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8057 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8058
8059 See the comment above vec_loop_masks for more details about the mask
8060 arrangement. */
8061
8062 tree
8063 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8064 unsigned int nvectors, tree vectype, unsigned int index)
8065 {
8066 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8067 tree mask_type = rgm->mask_type;
8068
8069 /* Populate the rgroup's mask array, if this is the first time we've
8070 used it. */
8071 if (rgm->masks.is_empty ())
8072 {
8073 rgm->masks.safe_grow_cleared (nvectors);
8074 for (unsigned int i = 0; i < nvectors; ++i)
8075 {
8076 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8077 /* Provide a dummy definition until the real one is available. */
8078 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8079 rgm->masks[i] = mask;
8080 }
8081 }
8082
8083 tree mask = rgm->masks[index];
8084 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8085 TYPE_VECTOR_SUBPARTS (vectype)))
8086 {
8087 /* A loop mask for data type X can be reused for data type Y
8088 if X has N times more elements than Y and if Y's elements
8089 are N times bigger than X's. In this case each sequence
8090 of N elements in the loop mask will be all-zero or all-one.
8091 We can then view-convert the mask so that each sequence of
8092 N elements is replaced by a single element. */
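/* E.g. with N = 2, a mask computed for eight 16-bit elements can
   control four 32-bit elements: elements 2*i and 2*i+1 of the mask are
   known to be equal, so the view-convert reinterprets each such pair
   as a single wider mask element covering the same lanes.  */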
8093 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8094 TYPE_VECTOR_SUBPARTS (vectype)));
8095 gimple_seq seq = NULL;
8096 mask_type = build_same_sized_truth_vector_type (vectype);
8097 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8098 if (seq)
8099 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8100 }
8101 return mask;
8102 }
8103
8104 /* Scale profiling counters by estimation for LOOP which is vectorized
8105 by factor VF. */
8106
8107 static void
8108 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8109 {
8110 edge preheader = loop_preheader_edge (loop);
8111 /* Reduce loop iterations by the vectorization factor. */
8112 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8113 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8114
8115 if (freq_h.nonzero_p ())
8116 {
8117 profile_probability p;
8118
8119 /* Avoid dropping loop body profile counter to 0 because of zero count
8120 in loop's preheader. */
8121 if (!(freq_e == profile_count::zero ()))
8122 freq_e = freq_e.force_nonzero ();
8123 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8124 scale_loop_frequencies (loop, p);
8125 }
8126
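/* The loop header is expected to execute new_est_niter + 1 times per
   entry into the loop, so the single exit edge is taken with
   probability 1 / (new_est_niter + 1).  */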
8127 edge exit_e = single_exit (loop);
8128 exit_e->probability = profile_probability::always ()
8129 .apply_scale (1, new_est_niter + 1);
8130
8131 edge exit_l = single_pred_edge (loop->latch);
8132 profile_probability prob = exit_l->probability;
8133 exit_l->probability = exit_e->probability.invert ();
8134 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8135 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8136 }
8137
8138 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8139 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8140 stmt_vec_info. */
8141
8142 static void
8143 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8144 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8145 {
8146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8147 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8148
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_NOTE, vect_location,
8151 "------>vectorizing statement: %G", stmt_info->stmt);
8152
8153 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8154 vect_loop_kill_debug_uses (loop, stmt_info);
8155
8156 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8157 && !STMT_VINFO_LIVE_P (stmt_info))
8158 return;
8159
8160 if (STMT_VINFO_VECTYPE (stmt_info))
8161 {
8162 poly_uint64 nunits
8163 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8164 if (!STMT_SLP_TYPE (stmt_info)
8165 && maybe_ne (nunits, vf)
8166 && dump_enabled_p ())
8167 /* For SLP, VF is set according to the unrolling factor, not to the
8168 vector size, hence for SLP this print is not valid. */
8169 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8170 }
8171
8172 /* Pure SLP statements have already been vectorized. We still need
8173 to apply loop vectorization to hybrid SLP statements. */
8174 if (PURE_SLP_STMT (stmt_info))
8175 return;
8176
8177 if (dump_enabled_p ())
8178 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8179
8180 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8181 *seen_store = stmt_info;
8182 }
8183
8184 /* Function vect_transform_loop.
8185
8186 The analysis phase has determined that the loop is vectorizable.
8187 Vectorize the loop - create vectorized stmts to replace the scalar
8188 stmts in the loop, and update the loop exit condition.
8189 Returns the scalar epilogue loop, if any. */
8190
8191 struct loop *
8192 vect_transform_loop (loop_vec_info loop_vinfo)
8193 {
8194 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8195 struct loop *epilogue = NULL;
8196 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8197 int nbbs = loop->num_nodes;
8198 int i;
8199 tree niters_vector = NULL_TREE;
8200 tree step_vector = NULL_TREE;
8201 tree niters_vector_mult_vf = NULL_TREE;
8202 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8203 unsigned int lowest_vf = constant_lower_bound (vf);
8204 gimple *stmt;
8205 bool check_profitability = false;
8206 unsigned int th;
8207
8208 DUMP_VECT_SCOPE ("vec_transform_loop");
8209
8210 loop_vinfo->shared->check_datarefs ();
8211
8212 /* Use the more conservative vectorization threshold. If the number
8213 of iterations is constant assume the cost check has been performed
8214 by our caller. If the threshold makes all loops profitable that
8215 run at least the (estimated) vectorization factor number of times,
8216 checking is pointless, too. */
8217 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8218 if (th >= vect_vf_for_cost (loop_vinfo)
8219 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8220 {
8221 if (dump_enabled_p ())
8222 dump_printf_loc (MSG_NOTE, vect_location,
8223 "Profitability threshold is %d loop iterations.\n",
8224 th);
8225 check_profitability = true;
8226 }
8227
8228 /* Make sure there exists a single-predecessor exit bb. Do this before
8229 versioning. */
8230 edge e = single_exit (loop);
8231 if (! single_pred_p (e->dest))
8232 {
8233 split_loop_exit_edge (e, true);
8234 if (dump_enabled_p ())
8235 dump_printf (MSG_NOTE, "split exit edge\n");
8236 }
8237
8238 /* Version the loop first, if required, so the profitability check
8239 comes first. */
8240
8241 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8242 {
8243 poly_uint64 versioning_threshold
8244 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
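/* If a runtime profitability check is also needed and its threshold is
   comparable with the versioning threshold, fold the two into a single
   guard on the larger value.  */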
8245 if (check_profitability
8246 && ordered_p (poly_uint64 (th), versioning_threshold))
8247 {
8248 versioning_threshold = ordered_max (poly_uint64 (th),
8249 versioning_threshold);
8250 check_profitability = false;
8251 }
8252 vect_loop_versioning (loop_vinfo, th, check_profitability,
8253 versioning_threshold);
8254 check_profitability = false;
8255 }
8256
8257 /* Make sure there exists a single-predecessor exit bb also on the
8258 scalar loop copy. Do this after versioning but before peeling
8259 so that the CFG structure is fine for both the scalar and the
8260 if-converted loop, and slpeel_duplicate_current_defs_from_edges sees
8261 matched loop-closed PHI nodes on the exit. */
8262 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8263 {
8264 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8265 if (! single_pred_p (e->dest))
8266 {
8267 split_loop_exit_edge (e, true);
8268 if (dump_enabled_p ())
8269 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8270 }
8271 }
8272
8273 tree niters = vect_build_loop_niters (loop_vinfo);
8274 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8275 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8276 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8277 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8278 &step_vector, &niters_vector_mult_vf, th,
8279 check_profitability, niters_no_overflow);
8280
8281 if (niters_vector == NULL_TREE)
8282 {
8283 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8284 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8285 && known_eq (lowest_vf, vf))
8286 {
8287 niters_vector
8288 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8289 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8290 step_vector = build_one_cst (TREE_TYPE (niters));
8291 }
8292 else
8293 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8294 &step_vector, niters_no_overflow);
8295 }
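 /* Worked example with hypothetical numbers: if vect_do_peeling left
    NITERS_VECTOR empty, the loop is not fully masked, the VF is a
    constant 8 and LOOP_VINFO_INT_NITERS is 96, the branch above simply
    sets niters_vector to 96 / 8 = 12 and step_vector to 1, i.e. the
    vector loop iterates 12 times with an IV step of 1.  Otherwise
    vect_gen_vector_loop_niters emits the computation at runtime.  */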
8296
8297 /* 1) Make sure the loop header has exactly two entries
8298 2) Make sure we have a preheader basic block. */
8299
8300 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8301
8302 split_edge (loop_preheader_edge (loop));
8303
8304 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8305 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8306 /* This will deal with any possible peeling. */
8307 vect_prepare_for_masked_peels (loop_vinfo);
8308
8309 /* Schedule the SLP instances first, then handle loop vectorization
8310 below. */
8311 if (!loop_vinfo->slp_instances.is_empty ())
8312 {
8313 DUMP_VECT_SCOPE ("scheduling SLP instances");
8314 vect_schedule_slp (loop_vinfo);
8315 }
8316
8317 /* FORNOW: the vectorizer supports only loops whose body consists
8318 of one basic block (header + empty latch). When the vectorizer
8319 supports more involved loop forms, the order in which the BBs are
8320 traversed needs to be reconsidered. */
8321
8322 for (i = 0; i < nbbs; i++)
8323 {
8324 basic_block bb = bbs[i];
8325 stmt_vec_info stmt_info;
8326
8327 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8328 gsi_next (&si))
8329 {
8330 gphi *phi = si.phi ();
8331 if (dump_enabled_p ())
8332 dump_printf_loc (MSG_NOTE, vect_location,
8333 "------>vectorizing phi: %G", phi);
8334 stmt_info = loop_vinfo->lookup_stmt (phi);
8335 if (!stmt_info)
8336 continue;
8337
8338 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8339 vect_loop_kill_debug_uses (loop, stmt_info);
8340
8341 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8342 && !STMT_VINFO_LIVE_P (stmt_info))
8343 continue;
8344
8345 if (STMT_VINFO_VECTYPE (stmt_info)
8346 && (maybe_ne
8347 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8348 && dump_enabled_p ())
8349 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8350
8351 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8352 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8353 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8354 && ! PURE_SLP_STMT (stmt_info))
8355 {
8356 if (dump_enabled_p ())
8357 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8358 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8359 }
8360 }
8361
8362 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8363 !gsi_end_p (si);)
8364 {
8365 stmt = gsi_stmt (si);
8366 /* During vectorization remove existing clobber stmts. */
8367 if (gimple_clobber_p (stmt))
8368 {
8369 unlink_stmt_vdef (stmt);
8370 gsi_remove (&si, true);
8371 release_defs (stmt);
8372 }
8373 else
8374 {
8375 stmt_info = loop_vinfo->lookup_stmt (stmt);
8376
8377 /* Vector stmts created in the outer loop during vectorization of
8378 stmts in an inner loop may not have a stmt_info and do not
8379 need to be vectorized. */
8380 stmt_vec_info seen_store = NULL;
8381 if (stmt_info)
8382 {
8383 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8384 {
8385 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8386 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8387 !gsi_end_p (subsi); gsi_next (&subsi))
8388 {
8389 stmt_vec_info pat_stmt_info
8390 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8391 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8392 &si, &seen_store);
8393 }
8394 stmt_vec_info pat_stmt_info
8395 = STMT_VINFO_RELATED_STMT (stmt_info);
8396 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8397 &seen_store);
8398 }
8399 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8400 &seen_store);
8401 }
8402 gsi_next (&si);
8403 if (seen_store)
8404 {
8405 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8406 /* Interleaving. The vectorization of the
8407 interleaving chain was completed - free all
8408 the stores in the chain. */
8409 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8410 else
8411 /* Free the attached stmt_vec_info and remove the stmt. */
8412 loop_vinfo->remove_stmt (stmt_info);
8413 }
8414 }
8415 }
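 /* To illustrate the traversal above for pattern statements (the
    pattern instance below is hypothetical): when a computation such as

        int c = (int) s1 * (int) s2;   // s1, s2 of type short

    was replaced by a recognized widening-multiplication pattern, the
    loop first vectorizes every statement in the pattern definition
    sequence (STMT_VINFO_PATTERN_DEF_SEQ), then the main pattern
    statement recorded in STMT_VINFO_RELATED_STMT, and finally calls
    vect_transform_loop_stmt on the original stmt_info as well.  */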
8416
8417 /* Stub out scalar statements that must not survive vectorization.
8418 Doing this here helps with grouped statements, or statements that
8419 are involved in patterns. */
8420 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8421 !gsi_end_p (gsi); gsi_next (&gsi))
8422 {
8423 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8424 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8425 {
8426 tree lhs = gimple_get_lhs (call);
8427 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8428 {
8429 tree zero = build_zero_cst (TREE_TYPE (lhs));
8430 gimple *new_stmt = gimple_build_assign (lhs, zero);
8431 gsi_replace (&gsi, new_stmt, true);
8432 }
8433 }
8434 }
8435 } /* BBs in loop */
8436
8437 /* The vectorization factor is always > 1, so if we use an IV increment
8438 of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
8439 if (integer_onep (step_vector))
8440 niters_no_overflow = true;
8441 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8442 niters_vector_mult_vf, !niters_no_overflow);
8443
8444 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8445 scale_profile_for_vect_loop (loop, assumed_vf);
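 /* For example (hypothetical profile counts): if the scalar loop body was
    expected to execute about 800 times per function invocation and
    ASSUMED_VF is 8, scale_profile_for_vect_loop rescales the body to
    roughly 100 executions, since each vector iteration now covers 8
    scalar iterations.  */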
8446
8447 /* True if the final iteration might not handle a full vector's
8448 worth of scalar iterations. */
8449 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8450 /* The minimum number of iterations performed by the epilogue. This
8451 is 1 when peeling for gaps because we always need a final scalar
8452 iteration. */
8453 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8454 /* +1 to convert latch counts to loop iteration counts,
8455 -min_epilogue_iters to remove iterations that cannot be performed
8456 by the vector code. */
8457 int bias_for_lowest = 1 - min_epilogue_iters;
8458 int bias_for_assumed = bias_for_lowest;
8459 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8460 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8461 {
8462 /* When the amount of peeling is known at compile time, the first
8463 iteration will have exactly alignment_npeels active elements.
8464 In the worst case it will have at least one. */
8465 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8466 bias_for_lowest += lowest_vf - min_first_active;
8467 bias_for_assumed += assumed_vf - min_first_active;
8468 }
8469 /* In these calculations the "- 1" converts loop iteration counts
8470 back to latch counts. */
8471 if (loop->any_upper_bound)
8472 loop->nb_iterations_upper_bound
8473 = (final_iter_may_be_partial
8474 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8475 lowest_vf) - 1
8476 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8477 lowest_vf) - 1);
8478 if (loop->any_likely_upper_bound)
8479 loop->nb_iterations_likely_upper_bound
8480 = (final_iter_may_be_partial
8481 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8482 + bias_for_lowest, lowest_vf) - 1
8483 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8484 + bias_for_lowest, lowest_vf) - 1);
8485 if (loop->any_estimate)
8486 loop->nb_iterations_estimate
8487 = (final_iter_may_be_partial
8488 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8489 assumed_vf) - 1
8490 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8491 assumed_vf) - 1);
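 /* Worked example with hypothetical numbers: assume lowest_vf == 8, no
    peeling for gaps and no full masking, so bias_for_lowest == 1.  If the
    scalar loop's latch bound was 102 (at most 103 iterations), the new
    bound is (102 + 1) / 8 - 1 = 11, i.e. at most 12 vector iterations.
    With full masking the udiv_ceil form gives 13 - 1 = 12, reflecting the
    extra, partially populated final iteration.  */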
8492
8493 if (dump_enabled_p ())
8494 {
8495 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8496 {
8497 dump_printf_loc (MSG_NOTE, vect_location,
8498 "LOOP VECTORIZED\n");
8499 if (loop->inner)
8500 dump_printf_loc (MSG_NOTE, vect_location,
8501 "OUTER LOOP VECTORIZED\n");
8502 dump_printf (MSG_NOTE, "\n");
8503 }
8504 else
8505 {
8506 dump_printf_loc (MSG_NOTE, vect_location,
8507 "LOOP EPILOGUE VECTORIZED (VS=");
8508 dump_dec (MSG_NOTE, current_vector_size);
8509 dump_printf (MSG_NOTE, ")\n");
8510 }
8511 }
8512
8513 /* Loops vectorized with a variable factor won't benefit from
8514 unrolling/peeling. */
8515 if (!vf.is_constant ())
8516 {
8517 loop->unroll = 1;
8518 if (dump_enabled_p ())
8519 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8520 " variable-length vectorization factor\n");
8521 }
8522 /* Free SLP instances here because otherwise stmt reference counting
8523 won't work. */
8524 slp_instance instance;
8525 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8526 vect_free_slp_instance (instance, true);
8527 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8528 /* Clear the safelen field since its value is invalid after vectorization:
8529 the vectorized loop can have loop-carried dependencies. */
8530 loop->safelen = 0;
8531
8532 /* Don't vectorize an epilogue of an epilogue loop. */
8533 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8534 epilogue = NULL;
8535
8536 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8537 epilogue = NULL;
8538
8539 if (epilogue)
8540 {
8541 auto_vector_sizes vector_sizes;
8542 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8543 unsigned int next_size = 0;
8544
8545 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8546 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8547 && known_eq (vf, lowest_vf))
8548 {
8549 unsigned int eiters
8550 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8551 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
8552 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8553 eiters
8554 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8555 epilogue->nb_iterations_upper_bound = eiters - 1;
8556 epilogue->any_upper_bound = true;
8557
8558 unsigned int ratio;
8559 while (next_size < vector_sizes.length ()
8560 && !(constant_multiple_p (current_vector_size,
8561 vector_sizes[next_size], &ratio)
8562 && eiters >= lowest_vf / ratio))
8563 next_size += 1;
8564 }
8565 else
8566 while (next_size < vector_sizes.length ()
8567 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8568 next_size += 1;
8569
8570 if (next_size == vector_sizes.length ())
8571 epilogue = NULL;
8572 }
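 /* Worked example with hypothetical numbers: with LOOP_VINFO_INT_NITERS
    == 103, 3 iterations peeled for alignment, no peeling for gaps and
    lowest_vf == 8, the code above computes eiters = 103 - 3 = 100, then
    eiters = 100 % 8 = 4, so the epilogue runs at most 4 iterations and
    its latch bound is set to 3.  The size-selection loop then advances to
    the first recorded vector size that divides the current one and whose
    VF still fits into those 4 iterations; if none qualifies, epilogue
    vectorization is abandoned (epilogue = NULL).  */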
8573
8574 if (epilogue)
8575 {
8576 epilogue->force_vectorize = loop->force_vectorize;
8577 epilogue->safelen = loop->safelen;
8578 epilogue->dont_vectorize = false;
8579
8580 /* We may need to if-convert the epilogue to vectorize it. */
8581 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8582 tree_if_conversion (epilogue);
8583 }
8584
8585 return epilogue;
8586 }
8587
8588 /* The code below tries to perform a simple optimization - reverting
8589 if-conversion for masked stores: if the mask of a store is zero, do
8590 not perform it, and skip the stored-value producers too if possible.
8591 For example,
8592 for (i=0; i<n; i++)
8593 if (c[i])
8594 {
8595 p1[i] += 1;
8596 p2[i] = p3[i] +2;
8597 }
8598 this transformation will produce the following semi-hammock:
8599
8600 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8601 {
8602 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8603 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8604 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8605 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8606 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8607 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8608 }
8609 */
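
/* For reference, before this optimization the vectorized loop body simply
   contains the masked operations unconditionally, e.g. (statement names as
   in the example above):

     vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
     vect__12.22_172 = vect__11.19_170 + vect_cst__171;
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
     vect__19.28_184 = vect__18.25_182 + vect_cst__183;
     MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);

   optimize_mask_stores sinks these into a new block guarded by the
   "mask is all-zero" test so that the stores (and, where possible, the
   producers of the stored values) are skipped for an all-false mask.  */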
8610
8611 void
8612 optimize_mask_stores (struct loop *loop)
8613 {
8614 basic_block *bbs = get_loop_body (loop);
8615 unsigned nbbs = loop->num_nodes;
8616 unsigned i;
8617 basic_block bb;
8618 struct loop *bb_loop;
8619 gimple_stmt_iterator gsi;
8620 gimple *stmt;
8621 auto_vec<gimple *> worklist;
8622
8623 vect_location = find_loop_location (loop);
8624 /* Pick up all masked stores in the loop, if any. */
8625 for (i = 0; i < nbbs; i++)
8626 {
8627 bb = bbs[i];
8628 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8629 gsi_next (&gsi))
8630 {
8631 stmt = gsi_stmt (gsi);
8632 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8633 worklist.safe_push (stmt);
8634 }
8635 }
8636
8637 free (bbs);
8638 if (worklist.is_empty ())
8639 return;
8640
8641 /* Loop has masked stores. */
8642 while (!worklist.is_empty ())
8643 {
8644 gimple *last, *last_store;
8645 edge e, efalse;
8646 tree mask;
8647 basic_block store_bb, join_bb;
8648 gimple_stmt_iterator gsi_to;
8649 tree vdef, new_vdef;
8650 gphi *phi;
8651 tree vectype;
8652 tree zero;
8653
8654 last = worklist.pop ();
8655 mask = gimple_call_arg (last, 2);
8656 bb = gimple_bb (last);
8657 /* Create then_bb and the if-then structure in the CFG; then_bb
8658 belongs to the same loop as if_bb. It can differ from LOOP when a
8659 two-level loop nest is vectorized and the mask_store belongs to the
8660 inner loop. */
8661 e = split_block (bb, last);
8662 bb_loop = bb->loop_father;
8663 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8664 join_bb = e->dest;
8665 store_bb = create_empty_bb (bb);
8666 add_bb_to_loop (store_bb, bb_loop);
8667 e->flags = EDGE_TRUE_VALUE;
8668 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8669 /* Put STORE_BB on the likely path. */
8670 efalse->probability = profile_probability::unlikely ();
8671 store_bb->count = efalse->count ();
8672 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8673 if (dom_info_available_p (CDI_DOMINATORS))
8674 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8675 if (dump_enabled_p ())
8676 dump_printf_loc (MSG_NOTE, vect_location,
8677 "Create new block %d to sink mask stores.",
8678 store_bb->index);
8679 /* Create vector comparison with boolean result. */
8680 vectype = TREE_TYPE (mask);
8681 zero = build_zero_cst (vectype);
8682 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8683 gsi = gsi_last_bb (bb);
8684 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8685 /* Create new PHI node for vdef of the last masked store:
8686 .MEM_2 = VDEF <.MEM_1>
8687 will be converted to
8688 .MEM.3 = VDEF <.MEM_1>
8689 and new PHI node will be created in join bb
8690 .MEM_2 = PHI <.MEM_1, .MEM_3>
8691 */
8692 vdef = gimple_vdef (last);
8693 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8694 gimple_set_vdef (last, new_vdef);
8695 phi = create_phi_node (vdef, join_bb);
8696 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8697
8698 /* Put all masked stores with the same mask into STORE_BB if possible. */
8699 while (true)
8700 {
8701 gimple_stmt_iterator gsi_from;
8702 gimple *stmt1 = NULL;
8703
8704 /* Move masked store to STORE_BB. */
8705 last_store = last;
8706 gsi = gsi_for_stmt (last);
8707 gsi_from = gsi;
8708 /* Shift GSI to the previous stmt for further traversal. */
8709 gsi_prev (&gsi);
8710 gsi_to = gsi_start_bb (store_bb);
8711 gsi_move_before (&gsi_from, &gsi_to);
8712 /* Set GSI_TO to the start of the now non-empty block. */
8713 gsi_to = gsi_start_bb (store_bb);
8714 if (dump_enabled_p ())
8715 dump_printf_loc (MSG_NOTE, vect_location,
8716 "Move stmt to created bb\n%G", last);
8717 /* Move all stored value producers if possible. */
8718 while (!gsi_end_p (gsi))
8719 {
8720 tree lhs;
8721 imm_use_iterator imm_iter;
8722 use_operand_p use_p;
8723 bool res;
8724
8725 /* Skip debug statements. */
8726 if (is_gimple_debug (gsi_stmt (gsi)))
8727 {
8728 gsi_prev (&gsi);
8729 continue;
8730 }
8731 stmt1 = gsi_stmt (gsi);
8732 /* Do not consider statements writing to memory or having
8733 a volatile operand. */
8734 if (gimple_vdef (stmt1)
8735 || gimple_has_volatile_ops (stmt1))
8736 break;
8737 gsi_from = gsi;
8738 gsi_prev (&gsi);
8739 lhs = gimple_get_lhs (stmt1);
8740 if (!lhs)
8741 break;
8742
8743 /* LHS of vectorized stmt must be SSA_NAME. */
8744 if (TREE_CODE (lhs) != SSA_NAME)
8745 break;
8746
8747 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8748 {
8749 /* Remove dead scalar statement. */
8750 if (has_zero_uses (lhs))
8751 {
8752 gsi_remove (&gsi_from, true);
8753 continue;
8754 }
8755 }
8756
8757 /* Check that LHS does not have uses outside of STORE_BB. */
8758 res = true;
8759 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8760 {
8761 gimple *use_stmt;
8762 use_stmt = USE_STMT (use_p);
8763 if (is_gimple_debug (use_stmt))
8764 continue;
8765 if (gimple_bb (use_stmt) != store_bb)
8766 {
8767 res = false;
8768 break;
8769 }
8770 }
8771 if (!res)
8772 break;
8773
8774 if (gimple_vuse (stmt1)
8775 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8776 break;
8777
8778 /* Can move STMT1 to STORE_BB. */
8779 if (dump_enabled_p ())
8780 dump_printf_loc (MSG_NOTE, vect_location,
8781 "Move stmt to created bb\n%G", stmt1);
8782 gsi_move_before (&gsi_from, &gsi_to);
8783 /* Shift GSI_TO for further insertion. */
8784 gsi_prev (&gsi_to);
8785 }
8786 /* Put other masked stores with the same mask to STORE_BB. */
8787 if (worklist.is_empty ()
8788 || gimple_call_arg (worklist.last (), 2) != mask
8789 || worklist.last () != stmt1)
8790 break;
8791 last = worklist.pop ();
8792 }
8793 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8794 }
8795 }