Ensure that dump calls are guarded with dump_enabled_p
gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
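
/* For illustration only, a sketch of the target support check described
   under "Target modeling" above (the real checks are performed while
   analyzing the individual statements, mostly in tree-vect-stmts.c):

       machine_mode vmode = TYPE_MODE (vectype);
       if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
         return false;

   A result of CODE_FOR_nothing means the target has no instruction for a
   vector addition in mode VMODE, so the statement cannot be vectorized;
   add_optab stands in for whichever optab matches the scalar operation
   at hand.  */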
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
243 if (!res)
244 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
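
/* For illustration only: for the counter of a loop such as

       for (i = 0; i < n; i++)
         a[i] = 0;

   the scalar evolution of the loop-header phi for 'i' is the chrec
   {0, +, 1}_1 (assuming the loop has number 1).  Its evolution part is the
   INTEGER_CST 1 and its initial condition is 0, so the function above
   returns true with *INIT = 0 and *STEP = 1.  */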
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such an inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
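
/* For illustration, a hedged sketch of the "nested cycle" case handled by
   the analysis above (the exact classification depends on which loop is
   being vectorized): when the outer loop below is considered for
   outer-loop vectorization,

       for (i = 0; i < N; i++)
         {
           s = 0;
           for (j = 0; j < M; j++)
             s += a[i][j];
           b[i] = s;
         }

   the phi for 's' in the inner-loop header is a cross-iteration cycle of
   the inner loop only, and is recorded as vect_nested_cycle rather than
   as a reduction of the loop being vectorized.  */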
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
739
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
743
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
746
747 if (may_be_zero)
748 {
749 if (COMPARISON_CLASS_P (may_be_zero))
750 {
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
763
764 may_be_zero = NULL_TREE;
765 }
766 else if (integer_nonzerop (may_be_zero))
767 {
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
771 }
772 else
773 return cond;
774 }
775
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
778
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
787
788 return cond;
789 }
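
/* A small worked example for the function above: if the loop body executes
   N > 0 times, the latch edge is taken N-1 times, so the niter analysis
   returns N-1.  *NUMBER_OF_ITERATIONSM1 is therefore N-1, while
   *NUMBER_OF_ITERATIONS is (N-1) + 1 = N, the number of times the loop
   header executes.  */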
790
791 /* Function bb_in_loop_p
792
793 Used as predicate for dfs order traversal of the loop bbs. */
794
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
802 }
803
804
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
807
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 }
866 }
867 }
868
869 /* Free all levels of MASKS. */
870
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
873 {
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
879 }
880
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
883
884 _loop_vec_info::~_loop_vec_info ()
885 {
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
889
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
892 {
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
895 {
896 gimple *stmt = gsi_stmt (si);
897
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
901 {
902 enum tree_code code = gimple_assign_rhs_code (stmt);
903
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
913 {
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
916
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
918 {
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
924 {
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
929 }
930 }
931 }
932 }
933 gsi_next (&si);
934 }
935 }
936
937 free (bbs);
938
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
941
942 loop->aux = NULL;
943 }
944
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
947
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
950 {
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
954
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
959 {
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
964 {
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
967 }
968 }
969 return cached;
970 }
971
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
974
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
977 {
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
987 }
988
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
991
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
994 {
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1001 }
1002
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1006
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1009 {
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1012
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1018
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1023
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1031
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1034
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1039 {
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1043 {
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1047 {
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1055 }
1056 }
1057 }
1058
1059 if (!cmp_type)
1060 return false;
1061
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1064 }
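
/* For illustration only: the fully-masked loops that the check above
   enables are conceptually controlled per vector iteration by

       mask = .WHILE_ULT (index, niters);

   where lane L of MASK is active iff index + L < niters, so the final,
   partial vector iteration simply has its excess lanes disabled.  The
   function above only chooses a scalar type for INDEX/NITERS that is wide
   enough (at least MIN_NI_WIDTH bits) and for which the target can expand
   IFN_WHILE_ULT to every mask type the loop needs.  */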
1065
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1069 {
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1074
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1076
1077 /* Gather costs for statements in the scalar loop. */
1078
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1083
1084 for (i = 0; i < nbbs; i++)
1085 {
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1088
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1093
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 {
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1098
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1101
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 if (stmt_info
1104 && !STMT_VINFO_RELEVANT_P (stmt_info)
1105 && (!STMT_VINFO_LIVE_P (stmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1108 continue;
1109
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1112 {
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1117 }
1118 else
1119 kind = scalar_stmt;
1120
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1123 }
1124 }
1125
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1139 }
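
/* A small worked example for the function above (illustrative only; the
   actual numbers come from the target's cost hooks): a loop body with two
   scalar loads, one scalar store and one scalar_stmt, each costed at 1,
   gives a single scalar iteration cost of 4.  Statements in an inner loop
   would additionally be weighted by the FORNOW factor of 50.  */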
1140
1141
1142 /* Function vect_analyze_loop_form_1.
1143
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions. */
1150
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1155 {
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1161
1162 if (!loop->inner)
1163 {
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1167
1168 (pre-header)
1169 |
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1173 |
1174 (exit-bb) */
1175
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1180
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1184 }
1185 else
1186 {
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1189
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1193
1194 (pre-header)
1195 |
1196 header <---+
1197 | |
1198 inner-loop |
1199 | |
1200 tail ------+
1201 |
1202 (exit-bb)
1203
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1206
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1211
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1216
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1224
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1237 }
1238
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1244
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1249
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1253 }
1254
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1262
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1271
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1278
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1285
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1292
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1297
1298 return opt_result::success ();
1299 }
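
/* For illustration only, an example of a loop shape rejected by the checks
   above: a loop with an early exit such as

       for (i = 0; i < n; i++)
         {
           if (a[i] == key)
             break;
           b[i] = 0;
         }

   has a second exit edge (and extra control flow in the body), so it fails
   the loop-form requirements and is not vectorized.  */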
1300
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1302
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1305 {
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1308
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1315
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1321 {
1322 /* We consider to vectorize this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1332 }
1333
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1335 {
1336 if (dump_enabled_p ())
1337 {
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1342 }
1343 }
1344
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1348 {
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1352 }
1353
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1357 }
1358
1359
1360
1361 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1362 statements update the vectorization factor. */
1363
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1366 {
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1372
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1374
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1377
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1379 vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say, that we
1381 perform pure SLP on loop - cross iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1385 {
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1389 {
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1397 }
1398 }
1399
1400 if (only_slp_in_loop)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 }
1407 else
1408 {
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1418 }
1419
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
1427 }
1428 }
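
/* A small worked example for the function above (illustrative only): if
   the non-SLP statements require a vectorization factor of 4 while the SLP
   instances need an unrolling factor of 6, force_common_multiple yields
   12, the smallest factor that satisfies both.  If the loop contains only
   SLP statements, the vectorization factor is simply the SLP unrolling
   factor.  */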
1429
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1433
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1436 ...
1437
1438 inner:
1439 x_2 = ...;
1440 ...
1441
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1444
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1446
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1449 {
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1452
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 }
1455
1456 /* Function vect_analyze_loop_operations.
1457
1458 Scan the loop stmts and make sure they are all vectorizable. */
1459
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1462 {
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1470
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1472
1473 stmt_vector_for_cost cost_vec;
1474 cost_vec.create (2);
1475
1476 for (i = 0; i < nbbs; i++)
1477 {
1478 basic_block bb = bbs[i];
1479
1480 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1481 gsi_next (&si))
1482 {
1483 gphi *phi = si.phi ();
1484 ok = true;
1485
1486 stmt_info = loop_vinfo->lookup_stmt (phi);
1487 if (dump_enabled_p ())
1488 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1489 if (virtual_operand_p (gimple_phi_result (phi)))
1490 continue;
1491
1492 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1493 (i.e., a phi in the tail of the outer-loop). */
1494 if (! is_loop_header_bb_p (bb))
1495 {
1496 /* FORNOW: we currently don't support the case that these phis
1497 are not used in the outerloop (unless it is a double reduction,
1498 i.e., this phi is vect_reduction_def), because this case
1499 requires us to actually do something here. */
1500 if (STMT_VINFO_LIVE_P (stmt_info)
1501 && !vect_active_double_reduction_p (stmt_info))
1502 return opt_result::failure_at (phi,
1503 "Unsupported loop-closed phi"
1504 " in outer-loop.\n");
1505
1506 /* If PHI is used in the outer loop, we check that its operand
1507 is defined in the inner loop. */
1508 if (STMT_VINFO_RELEVANT_P (stmt_info))
1509 {
1510 tree phi_op;
1511
1512 if (gimple_phi_num_args (phi) != 1)
1513 return opt_result::failure_at (phi, "unsupported phi");
1514
1515 phi_op = PHI_ARG_DEF (phi, 0);
1516 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1517 if (!op_def_info)
1518 return opt_result::failure_at (phi, "unsupported phi");
1519
1520 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1521 && (STMT_VINFO_RELEVANT (op_def_info)
1522 != vect_used_in_outer_by_reduction))
1523 return opt_result::failure_at (phi, "unsupported phi");
1524 }
1525
1526 continue;
1527 }
1528
1529 gcc_assert (stmt_info);
1530
1531 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1532 || STMT_VINFO_LIVE_P (stmt_info))
1533 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1534 /* A scalar-dependence cycle that we don't support. */
1535 return opt_result::failure_at (phi,
1536 "not vectorized:"
1537 " scalar dependence cycle.\n");
1538
1539 if (STMT_VINFO_RELEVANT_P (stmt_info))
1540 {
1541 need_to_vectorize = true;
1542 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1543 && ! PURE_SLP_STMT (stmt_info))
1544 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1545 &cost_vec);
1546 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1547 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1548 && ! PURE_SLP_STMT (stmt_info))
1549 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1550 &cost_vec);
1551 }
1552
1553 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1554 if (ok
1555 && STMT_VINFO_LIVE_P (stmt_info)
1556 && !PURE_SLP_STMT (stmt_info))
1557 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1558 &cost_vec);
1559
1560 if (!ok)
1561 return opt_result::failure_at (phi,
1562 "not vectorized: relevant phi not "
1563 "supported: %G",
1564 static_cast <gimple *> (phi));
1565 }
1566
1567 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1568 gsi_next (&si))
1569 {
1570 gimple *stmt = gsi_stmt (si);
1571 if (!gimple_clobber_p (stmt))
1572 {
1573 opt_result res
1574 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1575 &need_to_vectorize,
1576 NULL, NULL, &cost_vec);
1577 if (!res)
1578 return res;
1579 }
1580 }
1581 } /* bbs */
1582
1583 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1584 cost_vec.release ();
1585
1586 /* All operations in the loop are either irrelevant (deal with loop
1587 control, or dead), or only used outside the loop and can be moved
1588 out of the loop (e.g. invariants, inductions). The loop can be
1589 optimized away by scalar optimizations. We're better off not
1590 touching this loop. */
1591 if (!need_to_vectorize)
1592 {
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_NOTE, vect_location,
1595 "All the computation can be taken out of the loop.\n");
1596 return opt_result::failure_at
1597 (vect_location,
1598 "not vectorized: redundant loop. no profit to vectorize.\n");
1599 }
1600
1601 return opt_result::success ();
1602 }
1603
1604 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1605 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1606 definitely no, or -1 if it's worth retrying. */
1607
1608 static int
1609 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1610 {
1611 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1612 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1613
1614 /* Only fully-masked loops can have iteration counts less than the
1615 vectorization factor. */
1616 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1617 {
1618 HOST_WIDE_INT max_niter;
1619
1620 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1621 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1622 else
1623 max_niter = max_stmt_executions_int (loop);
1624
1625 if (max_niter != -1
1626 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1627 {
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "not vectorized: iteration count smaller than "
1631 "vectorization factor.\n");
1632 return 0;
1633 }
1634 }
1635
1636 int min_profitable_iters, min_profitable_estimate;
1637 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1638 &min_profitable_estimate);
1639
1640 if (min_profitable_iters < 0)
1641 {
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: vectorization not profitable.\n");
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "not vectorized: vector version will never be "
1648 "profitable.\n");
1649 return -1;
1650 }
1651
1652 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1653 * assumed_vf);
1654
1655 /* Use the cost model only if it is more conservative than user specified
1656 threshold. */
1657 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1658 min_profitable_iters);
1659
1660 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1661
1662 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1663 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1664 {
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "not vectorized: vectorization not profitable.\n");
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "not vectorized: iteration count smaller than user "
1671 "specified loop bound parameter or minimum profitable "
1672 "iterations (whichever is more conservative).\n");
1673 return 0;
1674 }
1675
1676 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1677 if (estimated_niter == -1)
1678 estimated_niter = likely_max_stmt_executions_int (loop);
1679 if (estimated_niter != -1
1680 && ((unsigned HOST_WIDE_INT) estimated_niter
1681 < MAX (th, (unsigned) min_profitable_estimate)))
1682 {
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685 "not vectorized: estimated iteration count too "
1686 "small.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "not vectorized: estimated iteration count smaller "
1690 "than specified loop bound parameter or minimum "
1691 "profitable iterations (whichever is more "
1692 "conservative).\n");
1693 return -1;
1694 }
1695
1696 return 1;
1697 }
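
/* A worked example of the threshold computation above (illustrative only;
   the numbers depend on the target cost model): with
   --param min-vect-loop-bound=2 and an assumed vectorization factor of 4,
   min_scalar_loop_bound is 2 * 4 = 8.  If the cost model reports that at
   least 11 iterations are needed for the vector code to pay off, the
   runtime threshold becomes MAX (8, 11) = 11, and a known iteration count
   below that makes the function return 0.  */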
1698
1699 static opt_result
1700 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1701 vec<data_reference_p> *datarefs,
1702 unsigned int *n_stmts)
1703 {
1704 *n_stmts = 0;
1705 for (unsigned i = 0; i < loop->num_nodes; i++)
1706 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1707 !gsi_end_p (gsi); gsi_next (&gsi))
1708 {
1709 gimple *stmt = gsi_stmt (gsi);
1710 if (is_gimple_debug (stmt))
1711 continue;
1712 ++(*n_stmts);
1713 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1714 if (!res)
1715 {
1716 if (is_gimple_call (stmt) && loop->safelen)
1717 {
1718 tree fndecl = gimple_call_fndecl (stmt), op;
1719 if (fndecl != NULL_TREE)
1720 {
1721 cgraph_node *node = cgraph_node::get (fndecl);
1722 if (node != NULL && node->simd_clones != NULL)
1723 {
1724 unsigned int j, n = gimple_call_num_args (stmt);
1725 for (j = 0; j < n; j++)
1726 {
1727 op = gimple_call_arg (stmt, j);
1728 if (DECL_P (op)
1729 || (REFERENCE_CLASS_P (op)
1730 && get_base_address (op)))
1731 break;
1732 }
1733 op = gimple_call_lhs (stmt);
1734 /* Ignore #pragma omp declare simd functions
1735 if they don't have data references in the
1736 call stmt itself. */
1737 if (j == n
1738 && !(op
1739 && (DECL_P (op)
1740 || (REFERENCE_CLASS_P (op)
1741 && get_base_address (op)))))
1742 continue;
1743 }
1744 }
1745 }
1746 return res;
1747 }
1748 /* If dependence analysis will give up due to the limit on the
1749 number of datarefs, stop here and fail fatally. */
1750 if (datarefs->length ()
1751 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1752 return opt_result::failure_at (stmt, "exceeded param "
1753 "loop-max-datarefs-for-datadeps\n");
1754 }
1755 return opt_result::success ();
1756 }
1757
1758 /* Function vect_analyze_loop_2.
1759
1760 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1761 for it. The different analyses will record information in the
1762 loop_vec_info struct. */
1763 static opt_result
1764 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1765 {
1766 opt_result ok = opt_result::success ();
1767 int res;
1768 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1769 poly_uint64 min_vf = 2;
1770
1771 /* The first group of checks is independent of the vector size. */
1772 fatal = true;
1773
1774 /* Find all data references in the loop (which correspond to vdefs/vuses)
1775 and analyze their evolution in the loop. */
1776
1777 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1778
1779 /* Gather the data references and count stmts in the loop. */
1780 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1781 {
1782 opt_result res
1783 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1784 &LOOP_VINFO_DATAREFS (loop_vinfo),
1785 n_stmts);
1786 if (!res)
1787 {
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790 "not vectorized: loop contains function "
1791 "calls or data references that cannot "
1792 "be analyzed\n");
1793 return res;
1794 }
1795 loop_vinfo->shared->save_datarefs ();
1796 }
1797 else
1798 loop_vinfo->shared->check_datarefs ();
1799
1800 /* Analyze the data references and also adjust the minimal
1801 vectorization factor according to the loads and stores. */
1802
1803 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1804 if (!ok)
1805 {
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 "bad data references.\n");
1809 return ok;
1810 }
1811
1812 /* Classify all cross-iteration scalar data-flow cycles.
1813 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1814 vect_analyze_scalar_cycles (loop_vinfo);
1815
1816 vect_pattern_recog (loop_vinfo);
1817
1818 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1819
1820 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1821 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1822
1823 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1824 if (!ok)
1825 {
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "bad data access.\n");
1829 return ok;
1830 }
1831
1832 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1833
1834 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1835 if (!ok)
1836 {
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "unexpected pattern.\n");
1840 return ok;
1841 }
1842
1843 /* The rest of the analysis below depends on the vector size in some way. */
1844 fatal = false;
1845
1846 /* Analyze data dependences between the data-refs in the loop
1847 and adjust the maximum vectorization factor according to
1848 the dependences.
1849 FORNOW: fail at the first data dependence that we encounter. */
1850
1851 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1852 if (!ok)
1853 {
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "bad data dependence.\n");
1857 return ok;
1858 }
1859 if (max_vf != MAX_VECTORIZATION_FACTOR
1860 && maybe_lt (max_vf, min_vf))
1861 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1862 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1863
1864 ok = vect_determine_vectorization_factor (loop_vinfo);
1865 if (!ok)
1866 {
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869 "can't determine vectorization factor.\n");
1870 return ok;
1871 }
1872 if (max_vf != MAX_VECTORIZATION_FACTOR
1873 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1874 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1875
1876 /* Compute the scalar iteration cost. */
1877 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1878
1879 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1880 unsigned th;
1881
1882 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1883 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1884 if (!ok)
1885 return ok;
1886
1887 /* If there are any SLP instances mark them as pure_slp. */
1888 bool slp = vect_make_slp_decision (loop_vinfo);
1889 if (slp)
1890 {
1891 /* Find stmts that need to be both vectorized and SLPed. */
1892 vect_detect_hybrid_slp (loop_vinfo);
1893
1894 /* Update the vectorization factor based on the SLP decision. */
1895 vect_update_vf_for_slp (loop_vinfo);
1896 }
1897
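/* Remember whether full masking was a possibility at this point so that
   the flag can be restored if we later retry the analysis with SLP
   disabled (see the rollback code after the "again" label below).  */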
1898 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1899
1900 /* We don't expect to have to roll back to anything other than an empty
1901 set of rgroups. */
1902 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1903
1904 /* This is the point where we can re-start analysis with SLP forced off. */
1905 start_over:
1906
1907 /* Now the vectorization factor is final. */
1908 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1909 gcc_assert (known_ne (vectorization_factor, 0U));
1910
1911 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1912 {
1913 dump_printf_loc (MSG_NOTE, vect_location,
1914 "vectorization_factor = ");
1915 dump_dec (MSG_NOTE, vectorization_factor);
1916 dump_printf (MSG_NOTE, ", niters = %wd\n",
1917 LOOP_VINFO_INT_NITERS (loop_vinfo));
1918 }
1919
1920 HOST_WIDE_INT max_niter
1921 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1922
1923 /* Analyze the alignment of the data-refs in the loop.
1924 Fail if a data reference is found that cannot be vectorized. */
1925
1926 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1927 if (!ok)
1928 {
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "bad data alignment.\n");
1932 return ok;
1933 }
1934
1935 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1936 It is important to call pruning after vect_analyze_data_ref_accesses,
1937 since we use grouping information gathered by interleaving analysis. */
1938 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1939 if (!ok)
1940 return ok;
1941
1942 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1943 vectorization, since we do not want to add extra peeling or
1944 add versioning for alignment. */
1945 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946 /* This pass will decide on using loop versioning and/or loop peeling in
1947 order to enhance the alignment of data references in the loop. */
1948 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1949 else
1950 ok = vect_verify_datarefs_alignment (loop_vinfo);
1951 if (!ok)
1952 return ok;
1953
1954 if (slp)
1955 {
1956 /* Analyze operations in the SLP instances. Note this may
1957 remove unsupported SLP instances which makes the above
1958 SLP kind detection invalid. */
1959 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1960 vect_slp_analyze_operations (loop_vinfo);
1961 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1962 {
1963 ok = opt_result::failure_at (vect_location,
1964 "unsupported SLP instances\n");
1965 goto again;
1966 }
1967 }
1968
1969 /* Scan all the remaining operations in the loop that are not subject
1970 to SLP and make sure they are vectorizable. */
1971 ok = vect_analyze_loop_operations (loop_vinfo);
1972 if (!ok)
1973 {
1974 if (dump_enabled_p ())
1975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1976 "bad operation or unsupported loop bound.\n");
1977 return ok;
1978 }
1979
1980 /* Decide whether to use a fully-masked loop for this vectorization
1981 factor. */
1982 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1983 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1984 && vect_verify_full_masking (loop_vinfo));
1985 if (dump_enabled_p ())
1986 {
1987 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988 dump_printf_loc (MSG_NOTE, vect_location,
1989 "using a fully-masked loop.\n");
1990 else
1991 dump_printf_loc (MSG_NOTE, vect_location,
1992 "not using a fully-masked loop.\n");
1993 }
1994
1995 /* If epilog loop is required because of data accesses with gaps,
1996 one additional iteration needs to be peeled. Check if there are
1997 enough iterations for vectorization. */
1998 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1999 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2000 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2001 {
2002 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2003 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2004
2005 if (known_lt (wi::to_widest (scalar_niters), vf))
2006 return opt_result::failure_at (vect_location,
2007 "loop has no enough iterations to"
2008 " support peeling for gaps.\n");
2009 }
2010
2011 /* Check that the costings of the loop make vectorizing worthwhile. */
2012 res = vect_analyze_loop_costing (loop_vinfo);
2013 if (res < 0)
2014 {
2015 ok = opt_result::failure_at (vect_location,
2016 "Loop costings may not be worthwhile.\n");
2017 goto again;
2018 }
2019 if (!res)
2020 return opt_result::failure_at (vect_location,
2021 "Loop costings not worthwhile.\n");
2022
2023 /* Decide whether we need to create an epilogue loop to handle
2024 remaining scalar iterations. */
2025 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2026
2027 unsigned HOST_WIDE_INT const_vf;
2028 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2029 /* The main loop handles all iterations. */
2030 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2031 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2033 {
2034 /* Work out the (constant) number of iterations that need to be
2035 peeled for reasons other than niters. */
2036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2038 peel_niter += 1;
2039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2041 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2042 }
2043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2044 /* ??? When peeling for gaps but not alignment, we could
2045 try to check whether the (variable) niters is known to be
2046 VF * N + 1. That's something of a niche case though. */
2047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2050 < (unsigned) exact_log2 (const_vf))
2051 /* In case of versioning, check if the maximum number of
2052 iterations is greater than th. If they are identical,
2053 the epilogue is unnecessary. */
2054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2055 || ((unsigned HOST_WIDE_INT) max_niter
2056 > (th / const_vf) * const_vf))))
2057 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2058
2059 /* If an epilogue loop is required make sure we can create one. */
2060 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2061 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2062 {
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2065 if (!vect_can_advance_ivs_p (loop_vinfo)
2066 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2067 single_exit (LOOP_VINFO_LOOP
2068 (loop_vinfo))))
2069 {
2070 ok = opt_result::failure_at (vect_location,
2071 "not vectorized: can't create required "
2072 "epilog loop\n");
2073 goto again;
2074 }
2075 }
2076
2077 /* During peeling, we need to check if number of loop iterations is
2078 enough for both peeled prolog loop and vector loop. This check
2079 can be merged along with threshold check of loop versioning, so
2080 increase threshold for this case if necessary. */
2081 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2082 {
2083 poly_uint64 niters_th = 0;
2084
2085 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2086 {
2087 /* Niters for peeled prolog loop. */
2088 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2089 {
2090 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2091 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2092 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2093 }
2094 else
2095 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2096 }
2097
2098 /* Niters for at least one iteration of vectorized loop. */
2099 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2100 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2101 /* One additional iteration because of peeling for gap. */
2102 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2103 niters_th += 1;
2104 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2105 }
2106
2107 gcc_assert (known_eq (vectorization_factor,
2108 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2109
2110 /* Ok to vectorize! */
2111 return opt_result::success ();
2112
2113 again:
2114 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2115 gcc_assert (!ok);
2116
2117 /* Try again with SLP forced off, but if we didn't do any SLP there is
2118 no point in re-trying. */
2119 if (!slp)
2120 return ok;
2121
2122 /* If there are reduction chains re-trying will fail anyway. */
2123 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2124 return ok;
2125
2126 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2127 via interleaving or lane instructions. */
2128 slp_instance instance;
2129 slp_tree node;
2130 unsigned i, j;
2131 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2132 {
2133 stmt_vec_info vinfo;
2134 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2135 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2136 continue;
2137 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2138 unsigned int size = DR_GROUP_SIZE (vinfo);
2139 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2140 if (! vect_store_lanes_supported (vectype, size, false)
2141 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2142 && ! vect_grouped_store_supported (vectype, size))
2143 return opt_result::failure_at (vinfo->stmt,
2144 "unsupported grouped store\n");
2145 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2146 {
2147 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2148 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2149 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2150 size = DR_GROUP_SIZE (vinfo);
2151 vectype = STMT_VINFO_VECTYPE (vinfo);
2152 if (! vect_load_lanes_supported (vectype, size, false)
2153 && ! vect_grouped_load_supported (vectype, single_element_p,
2154 size))
2155 return opt_result::failure_at (vinfo->stmt,
2156 "unsupported grouped load\n");
2157 }
2158 }
2159
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_NOTE, vect_location,
2162 "re-trying with SLP disabled\n");
2163
2164 /* Roll back state appropriately. No SLP this time. */
2165 slp = false;
2166 /* Restore the vectorization factor as it was without SLP. */
2167 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2168 /* Free the SLP instances. */
2169 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2170 vect_free_slp_instance (instance, false);
2171 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2172 /* Reset SLP type to loop_vect on all stmts. */
2173 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2174 {
2175 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2176 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2177 !gsi_end_p (si); gsi_next (&si))
2178 {
2179 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2180 STMT_SLP_TYPE (stmt_info) = loop_vect;
2181 }
2182 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2183 !gsi_end_p (si); gsi_next (&si))
2184 {
2185 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2186 STMT_SLP_TYPE (stmt_info) = loop_vect;
2187 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2188 {
2189 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2190 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2191 STMT_SLP_TYPE (stmt_info) = loop_vect;
2192 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2193 !gsi_end_p (pi); gsi_next (&pi))
2194 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2195 = loop_vect;
2196 }
2197 }
2198 }
2199 /* Free optimized alias test DDRS. */
2200 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2201 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2202 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2203 /* Reset target cost data. */
2204 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2205 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2206 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2207 /* Reset accumulated rgroup information. */
2208 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2209 /* Reset assorted flags. */
2210 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2211 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2212 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2213 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2214 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2215
2216 goto start_over;
2217 }
2218
2219 /* Function vect_analyze_loop.
2220
2221 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2222 for it. The different analyses will record information in the
2223 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, an epilogue
2224 loop must be vectorized. */
2225 opt_loop_vec_info
2226 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2227 vec_info_shared *shared)
2228 {
2229 auto_vector_sizes vector_sizes;
2230
2231 /* Autodetect first vector size we try. */
2232 current_vector_size = 0;
2233 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2234 unsigned int next_size = 0;
2235
2236 DUMP_VECT_SCOPE ("analyze_loop_nest");
2237
2238 if (loop_outer (loop)
2239 && loop_vec_info_for_loop (loop_outer (loop))
2240 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2241 return opt_loop_vec_info::failure_at (vect_location,
2242 "outer-loop already vectorized.\n");
2243
2244 if (!find_loop_nest (loop, &shared->loop_nest))
2245 return opt_loop_vec_info::failure_at
2246 (vect_location,
2247 "not vectorized: loop nest containing two or more consecutive inner"
2248 " loops cannot be vectorized\n");
2249
2250 unsigned n_stmts = 0;
2251 poly_uint64 autodetected_vector_size = 0;
2252 while (1)
2253 {
2254 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2255 opt_loop_vec_info loop_vinfo
2256 = vect_analyze_loop_form (loop, shared);
2257 if (!loop_vinfo)
2258 {
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261 "bad loop form.\n");
2262 return loop_vinfo;
2263 }
2264
2265 bool fatal = false;
2266
2267 if (orig_loop_vinfo)
2268 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2269
2270 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2271 if (res)
2272 {
2273 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2274
2275 return loop_vinfo;
2276 }
2277
2278 delete loop_vinfo;
2279
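/* Remember the vector size autodetected during the first attempt and
   skip that entry when picking the next size to try.  */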
2280 if (next_size == 0)
2281 autodetected_vector_size = current_vector_size;
2282
2283 if (next_size < vector_sizes.length ()
2284 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2285 next_size += 1;
2286
2287 if (fatal
2288 || next_size == vector_sizes.length ()
2289 || known_eq (current_vector_size, 0U))
2290 return opt_loop_vec_info::propagate_failure (res);
2291
2292 /* Try the next biggest vector size. */
2293 current_vector_size = vector_sizes[next_size++];
2294 if (dump_enabled_p ())
2295 {
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "***** Re-trying analysis with "
2298 "vector size ");
2299 dump_dec (MSG_NOTE, current_vector_size);
2300 dump_printf (MSG_NOTE, "\n");
2301 }
2302 }
2303 }
2304
2305 /* Return true if there is an in-order reduction function for CODE, storing
2306 it in *REDUC_FN if so. */
2307
2308 static bool
2309 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2310 {
2311 switch (code)
2312 {
2313 case PLUS_EXPR:
2314 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2315 return true;
2316
2317 default:
2318 return false;
2319 }
2320 }
2321
2322 /* Function reduction_fn_for_scalar_code
2323
2324 Input:
2325 CODE - tree_code of a reduction operation.
2326
2327 Output:
2328 REDUC_FN - the corresponding internal function to be used to reduce the
2329 vector of partial results into a single scalar result, or IFN_LAST
2330 if the operation is a supported reduction operation, but does not have
2331 such an internal function.
2332
2333 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2334
2335 static bool
2336 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2337 {
2338 switch (code)
2339 {
2340 case MAX_EXPR:
2341 *reduc_fn = IFN_REDUC_MAX;
2342 return true;
2343
2344 case MIN_EXPR:
2345 *reduc_fn = IFN_REDUC_MIN;
2346 return true;
2347
2348 case PLUS_EXPR:
2349 *reduc_fn = IFN_REDUC_PLUS;
2350 return true;
2351
2352 case BIT_AND_EXPR:
2353 *reduc_fn = IFN_REDUC_AND;
2354 return true;
2355
2356 case BIT_IOR_EXPR:
2357 *reduc_fn = IFN_REDUC_IOR;
2358 return true;
2359
2360 case BIT_XOR_EXPR:
2361 *reduc_fn = IFN_REDUC_XOR;
2362 return true;
2363
2364 case MULT_EXPR:
2365 case MINUS_EXPR:
2366 *reduc_fn = IFN_LAST;
2367 return true;
2368
2369 default:
2370 return false;
2371 }
2372 }
2373
2374 /* If there is a neutral value X such that SLP reduction NODE would not
2375 be affected by the introduction of additional X elements, return that X,
2376 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2377 is true if the SLP statements perform a single reduction, false if each
2378 statement performs an independent reduction. */
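/* For example, zero is neutral for a PLUS_EXPR reduction and one for a
   MULT_EXPR reduction: adding elements with these values to the vectors
   does not change the final result.  */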
2379
2380 static tree
2381 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2382 bool reduc_chain)
2383 {
2384 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2385 stmt_vec_info stmt_vinfo = stmts[0];
2386 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2387 tree scalar_type = TREE_TYPE (vector_type);
2388 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2389 gcc_assert (loop);
2390
2391 switch (code)
2392 {
2393 case WIDEN_SUM_EXPR:
2394 case DOT_PROD_EXPR:
2395 case SAD_EXPR:
2396 case PLUS_EXPR:
2397 case MINUS_EXPR:
2398 case BIT_IOR_EXPR:
2399 case BIT_XOR_EXPR:
2400 return build_zero_cst (scalar_type);
2401
2402 case MULT_EXPR:
2403 return build_one_cst (scalar_type);
2404
2405 case BIT_AND_EXPR:
2406 return build_all_ones_cst (scalar_type);
2407
2408 case MAX_EXPR:
2409 case MIN_EXPR:
2410 /* For MIN/MAX the initial values are neutral. A reduction chain
2411 has only a single initial value, so that value is neutral for
2412 all statements. */
2413 if (reduc_chain)
2414 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2415 loop_preheader_edge (loop));
2416 return NULL_TREE;
2417
2418 default:
2419 return NULL_TREE;
2420 }
2421 }
2422
2423 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2424 STMT is printed with a message MSG. */
2425
2426 static void
2427 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2428 {
2429 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2430 }
2431
2432 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2433 operation. Return true if the results of DEF_STMT_INFO are something
2434 that can be accumulated by such a reduction. */
2435
2436 static bool
2437 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2438 {
2439 return (is_gimple_assign (def_stmt_info->stmt)
2440 || is_gimple_call (def_stmt_info->stmt)
2441 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2442 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2443 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2444 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2445 }
2446
2447 /* Detect SLP reduction of the form:
2448
2449 #a1 = phi <a5, a0>
2450 a2 = operation (a1)
2451 a3 = operation (a2)
2452 a4 = operation (a3)
2453 a5 = operation (a4)
2454
2455 #a = phi <a5>
2456
2457 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2458 FIRST_STMT is the first reduction stmt in the chain
2459 (a2 = operation (a1)).
2460
2461 Return TRUE if a reduction chain was detected. */
2462
2463 static bool
2464 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2465 gimple *first_stmt)
2466 {
2467 struct loop *loop = (gimple_bb (phi))->loop_father;
2468 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2469 enum tree_code code;
2470 gimple *loop_use_stmt = NULL;
2471 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2472 tree lhs;
2473 imm_use_iterator imm_iter;
2474 use_operand_p use_p;
2475 int nloop_uses, size = 0, n_out_of_loop_uses;
2476 bool found = false;
2477
2478 if (loop != vect_loop)
2479 return false;
2480
2481 lhs = PHI_RESULT (phi);
2482 code = gimple_assign_rhs_code (first_stmt);
2483 while (1)
2484 {
2485 nloop_uses = 0;
2486 n_out_of_loop_uses = 0;
2487 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2488 {
2489 gimple *use_stmt = USE_STMT (use_p);
2490 if (is_gimple_debug (use_stmt))
2491 continue;
2492
2493 /* Check if we got back to the reduction phi. */
2494 if (use_stmt == phi)
2495 {
2496 loop_use_stmt = use_stmt;
2497 found = true;
2498 break;
2499 }
2500
2501 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2502 {
2503 loop_use_stmt = use_stmt;
2504 nloop_uses++;
2505 }
2506 else
2507 n_out_of_loop_uses++;
2508
2509 /* There can be either a single use in the loop or two uses in
2510 phi nodes. */
2511 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2512 return false;
2513 }
2514
2515 if (found)
2516 break;
2517
2518 /* We reached a statement with no loop uses. */
2519 if (nloop_uses == 0)
2520 return false;
2521
2522 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2523 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2524 return false;
2525
2526 if (!is_gimple_assign (loop_use_stmt)
2527 || code != gimple_assign_rhs_code (loop_use_stmt)
2528 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2529 return false;
2530
2531 /* Insert USE_STMT into reduction chain. */
2532 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2533 if (current_stmt_info)
2534 {
2535 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2536 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2537 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2538 }
2539 else
2540 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2541
2542 lhs = gimple_assign_lhs (loop_use_stmt);
2543 current_stmt_info = use_stmt_info;
2544 size++;
2545 }
2546
2547 if (!found || loop_use_stmt != phi || size < 2)
2548 return false;
2549
2550 /* Swap the operands, if needed, to make the reduction operand be the second
2551 operand. */
2552 lhs = PHI_RESULT (phi);
2553 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2554 while (next_stmt_info)
2555 {
2556 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2557 if (gimple_assign_rhs2 (next_stmt) == lhs)
2558 {
2559 tree op = gimple_assign_rhs1 (next_stmt);
2560 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2561
2562 /* Check that the other def is either defined in the loop
2563 ("vect_internal_def"), or it's an induction (defined by a
2564 loop-header phi-node). */
2565 if (def_stmt_info
2566 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2567 && vect_valid_reduction_input_p (def_stmt_info))
2568 {
2569 lhs = gimple_assign_lhs (next_stmt);
2570 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2571 continue;
2572 }
2573
2574 return false;
2575 }
2576 else
2577 {
2578 tree op = gimple_assign_rhs2 (next_stmt);
2579 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2580
2581 /* Check that the other def is either defined in the loop
2582 ("vect_internal_def"), or it's an induction (defined by a
2583 loop-header phi-node). */
2584 if (def_stmt_info
2585 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2586 && vect_valid_reduction_input_p (def_stmt_info))
2587 {
2588 if (dump_enabled_p ())
2589 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2590 next_stmt);
2591
2592 swap_ssa_operands (next_stmt,
2593 gimple_assign_rhs1_ptr (next_stmt),
2594 gimple_assign_rhs2_ptr (next_stmt));
2595 update_stmt (next_stmt);
2596
2597 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2598 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2599 }
2600 else
2601 return false;
2602 }
2603
2604 lhs = gimple_assign_lhs (next_stmt);
2605 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2606 }
2607
2608 /* Save the chain for further analysis in SLP detection. */
2609 stmt_vec_info first_stmt_info
2610 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2611 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2612 REDUC_GROUP_SIZE (first_stmt_info) = size;
2613
2614 return true;
2615 }
2616
2617 /* Return true if we need an in-order reduction for operation CODE
2618 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2619 overflow must wrap. */
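/* For example, a floating-point summation normally has to be done
   in order (fold-left) unless -fassociative-math allows reassociation,
   whereas floating-point MIN and MAX reductions never do.  */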
2620
2621 static bool
2622 needs_fold_left_reduction_p (tree type, tree_code code,
2623 bool need_wrapping_integral_overflow)
2624 {
2625 /* CHECKME: check for !flag_finite_math_only too? */
2626 if (SCALAR_FLOAT_TYPE_P (type))
2627 switch (code)
2628 {
2629 case MIN_EXPR:
2630 case MAX_EXPR:
2631 return false;
2632
2633 default:
2634 return !flag_associative_math;
2635 }
2636
2637 if (INTEGRAL_TYPE_P (type))
2638 {
2639 if (!operation_no_trapping_overflow (type, code))
2640 return true;
2641 if (need_wrapping_integral_overflow
2642 && !TYPE_OVERFLOW_WRAPS (type)
2643 && operation_can_overflow (code))
2644 return true;
2645 return false;
2646 }
2647
2648 if (SAT_FIXED_POINT_TYPE_P (type))
2649 return true;
2650
2651 return false;
2652 }
2653
2654 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2655 reduction operation CODE has a handled computation expression. */
2656
2657 bool
2658 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2659 tree loop_arg, enum tree_code code)
2660 {
2661 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2662 auto_bitmap visited;
2663 tree lookfor = PHI_RESULT (phi);
2664 ssa_op_iter curri;
2665 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2666 while (USE_FROM_PTR (curr) != loop_arg)
2667 curr = op_iter_next_use (&curri);
2668 curri.i = curri.numops;
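/* Walk the use-def chain from the latch argument towards the PHI result,
   using PATH as an explicit stack of (iterator, use) pairs and VISITED
   to avoid revisiting SSA names.  */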
2669 do
2670 {
2671 path.safe_push (std::make_pair (curri, curr));
2672 tree use = USE_FROM_PTR (curr);
2673 if (use == lookfor)
2674 break;
2675 gimple *def = SSA_NAME_DEF_STMT (use);
2676 if (gimple_nop_p (def)
2677 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2678 {
2679 pop:
2680 do
2681 {
2682 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2683 curri = x.first;
2684 curr = x.second;
2685 do
2686 curr = op_iter_next_use (&curri);
2687 /* Skip already visited or non-SSA operands (from iterating
2688 over PHI args). */
2689 while (curr != NULL_USE_OPERAND_P
2690 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2691 || ! bitmap_set_bit (visited,
2692 SSA_NAME_VERSION
2693 (USE_FROM_PTR (curr)))));
2694 }
2695 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2696 if (curr == NULL_USE_OPERAND_P)
2697 break;
2698 }
2699 else
2700 {
2701 if (gimple_code (def) == GIMPLE_PHI)
2702 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2703 else
2704 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2705 while (curr != NULL_USE_OPERAND_P
2706 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2707 || ! bitmap_set_bit (visited,
2708 SSA_NAME_VERSION
2709 (USE_FROM_PTR (curr)))))
2710 curr = op_iter_next_use (&curri);
2711 if (curr == NULL_USE_OPERAND_P)
2712 goto pop;
2713 }
2714 }
2715 while (1);
2716 if (dump_file && (dump_flags & TDF_DETAILS))
2717 {
2718 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2719 unsigned i;
2720 std::pair<ssa_op_iter, use_operand_p> *x;
2721 FOR_EACH_VEC_ELT (path, i, x)
2722 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2723 dump_printf (MSG_NOTE, "\n");
2724 }
2725
2726 /* Check whether the reduction path detected is valid. */
2727 bool fail = path.length () == 0;
2728 bool neg = false;
2729 for (unsigned i = 1; i < path.length (); ++i)
2730 {
2731 gimple *use_stmt = USE_STMT (path[i].second);
2732 tree op = USE_FROM_PTR (path[i].second);
2733 if (! has_single_use (op)
2734 || ! is_gimple_assign (use_stmt))
2735 {
2736 fail = true;
2737 break;
2738 }
2739 if (gimple_assign_rhs_code (use_stmt) != code)
2740 {
2741 if (code == PLUS_EXPR
2742 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2743 {
2744 /* Track whether we negate the reduction value each iteration. */
2745 if (gimple_assign_rhs2 (use_stmt) == op)
2746 neg = ! neg;
2747 }
2748 else
2749 {
2750 fail = true;
2751 break;
2752 }
2753 }
2754 }
2755 return ! fail && ! neg;
2756 }
2757
2758
2759 /* Function vect_is_simple_reduction
2760
2761 (1) Detect a cross-iteration def-use cycle that represents a simple
2762 reduction computation. We look for the following pattern:
2763
2764 loop_header:
2765 a1 = phi < a0, a2 >
2766 a3 = ...
2767 a2 = operation (a3, a1)
2768
2769 or
2770
2771 a3 = ...
2772 loop_header:
2773 a1 = phi < a0, a2 >
2774 a2 = operation (a3, a1)
2775
2776 such that:
2777 1. operation is commutative and associative and it is safe to
2778 change the order of the computation
2779 2. no uses for a2 in the loop (a2 is used out of the loop)
2780 3. no uses of a1 in the loop besides the reduction operation
2781 4. no uses of a1 outside the loop.
2782
2783 Conditions 1,4 are tested here.
2784 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2785
2786 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2787 nested cycles.
2788
2789 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2790 reductions:
2791
2792 a1 = phi < a0, a2 >
2793 inner loop (def of a3)
2794 a2 = phi < a3 >
2795
2796 (4) Detect condition expressions, i.e.:
2797 for (int i = 0; i < N; i++)
2798 if (a[i] < val)
2799 ret_val = a[i];
2800
2801 */
2802
2803 static stmt_vec_info
2804 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2805 bool *double_reduc,
2806 bool need_wrapping_integral_overflow,
2807 enum vect_reduction_type *v_reduc_type)
2808 {
2809 gphi *phi = as_a <gphi *> (phi_info->stmt);
2810 struct loop *loop = (gimple_bb (phi))->loop_father;
2811 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2812 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2813 gimple *phi_use_stmt = NULL;
2814 enum tree_code orig_code, code;
2815 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2816 tree type;
2817 tree name;
2818 imm_use_iterator imm_iter;
2819 use_operand_p use_p;
2820 bool phi_def;
2821
2822 *double_reduc = false;
2823 *v_reduc_type = TREE_CODE_REDUCTION;
2824
2825 tree phi_name = PHI_RESULT (phi);
2826 /* ??? If there are no uses of the PHI result the inner loop reduction
2827 won't be detected as possibly double-reduction by vectorizable_reduction
2828 because that tries to walk the PHI arg from the preheader edge which
2829 can be constant. See PR60382. */
2830 if (has_zero_uses (phi_name))
2831 return NULL;
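/* Count the non-debug uses of the PHI result inside the loop and
   remember the last such use; a use outside the loop is not handled.  */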
2832 unsigned nphi_def_loop_uses = 0;
2833 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2834 {
2835 gimple *use_stmt = USE_STMT (use_p);
2836 if (is_gimple_debug (use_stmt))
2837 continue;
2838
2839 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2840 {
2841 if (dump_enabled_p ())
2842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2843 "intermediate value used outside loop.\n");
2844
2845 return NULL;
2846 }
2847
2848 nphi_def_loop_uses++;
2849 phi_use_stmt = use_stmt;
2850 }
2851
2852 edge latch_e = loop_latch_edge (loop);
2853 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2854 if (TREE_CODE (loop_arg) != SSA_NAME)
2855 {
2856 if (dump_enabled_p ())
2857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2858 "reduction: not ssa_name: %T\n", loop_arg);
2859 return NULL;
2860 }
2861
2862 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2863 if (!def_stmt_info
2864 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2865 return NULL;
2866
2867 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2868 {
2869 name = gimple_assign_lhs (def_stmt);
2870 phi_def = false;
2871 }
2872 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2873 {
2874 name = PHI_RESULT (def_stmt);
2875 phi_def = true;
2876 }
2877 else
2878 {
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881 "reduction: unhandled reduction operation: %G",
2882 def_stmt_info->stmt);
2883 return NULL;
2884 }
2885
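/* Count the in-loop uses of the latch definition and collect the
   loop-closed PHIs that use it outside the loop.  */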
2886 unsigned nlatch_def_loop_uses = 0;
2887 auto_vec<gphi *, 3> lcphis;
2888 bool inner_loop_of_double_reduc = false;
2889 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2890 {
2891 gimple *use_stmt = USE_STMT (use_p);
2892 if (is_gimple_debug (use_stmt))
2893 continue;
2894 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2895 nlatch_def_loop_uses++;
2896 else
2897 {
2898 /* We can have more than one loop-closed PHI. */
2899 lcphis.safe_push (as_a <gphi *> (use_stmt));
2900 if (nested_in_vect_loop
2901 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2902 == vect_double_reduction_def))
2903 inner_loop_of_double_reduc = true;
2904 }
2905 }
2906
2907 /* If this isn't a nested cycle or if the nested cycle reduction value
2908 is used outside of the inner loop, we cannot handle uses of the reduction
2909 value. */
2910 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2911 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2912 {
2913 if (dump_enabled_p ())
2914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2915 "reduction used in loop.\n");
2916 return NULL;
2917 }
2918
2919 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2920 defined in the inner loop. */
2921 if (phi_def)
2922 {
2923 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2924 op1 = PHI_ARG_DEF (def_stmt, 0);
2925
2926 if (gimple_phi_num_args (def_stmt) != 1
2927 || TREE_CODE (op1) != SSA_NAME)
2928 {
2929 if (dump_enabled_p ())
2930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2931 "unsupported phi node definition.\n");
2932
2933 return NULL;
2934 }
2935
2936 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2937 if (gimple_bb (def1)
2938 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2939 && loop->inner
2940 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2941 && is_gimple_assign (def1)
2942 && is_a <gphi *> (phi_use_stmt)
2943 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2944 {
2945 if (dump_enabled_p ())
2946 report_vect_op (MSG_NOTE, def_stmt,
2947 "detected double reduction: ");
2948
2949 *double_reduc = true;
2950 return def_stmt_info;
2951 }
2952
2953 return NULL;
2954 }
2955
2956 /* If we are vectorizing an inner reduction, we are executing it
2957 in the original order only in case we are not dealing with a
2958 double reduction. */
2959 bool check_reduction = true;
2960 if (flow_loop_nested_p (vect_loop, loop))
2961 {
2962 gphi *lcphi;
2963 unsigned i;
2964 check_reduction = false;
2965 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2966 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2967 {
2968 gimple *use_stmt = USE_STMT (use_p);
2969 if (is_gimple_debug (use_stmt))
2970 continue;
2971 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2972 check_reduction = true;
2973 }
2974 }
2975
2976 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2977 code = orig_code = gimple_assign_rhs_code (def_stmt);
2978
2979 if (nested_in_vect_loop && !check_reduction)
2980 {
2981 /* FIXME: Even for non-reductions code generation is funneled
2982 through vectorizable_reduction for the stmt defining the
2983 PHI latch value. So we have to artificially restrict ourselves
2984 for the supported operations. */
2985 switch (get_gimple_rhs_class (code))
2986 {
2987 case GIMPLE_BINARY_RHS:
2988 case GIMPLE_TERNARY_RHS:
2989 break;
2990 default:
2991 /* Not supported by vectorizable_reduction. */
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2994 "nested cycle: not handled operation: ");
2995 return NULL;
2996 }
2997 if (dump_enabled_p ())
2998 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2999 return def_stmt_info;
3000 }
3001
3002 /* We can handle "res -= x[i]", which is non-associative by
3003 simply rewriting this into "res += -x[i]". Avoid changing
3004 gimple instruction for the first simple tests and only do this
3005 if we're allowed to change code at all. */
3006 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3007 code = PLUS_EXPR;
3008
3009 if (code == COND_EXPR)
3010 {
3011 if (! nested_in_vect_loop)
3012 *v_reduc_type = COND_REDUCTION;
3013
3014 op3 = gimple_assign_rhs1 (def_stmt);
3015 if (COMPARISON_CLASS_P (op3))
3016 {
3017 op4 = TREE_OPERAND (op3, 1);
3018 op3 = TREE_OPERAND (op3, 0);
3019 }
3020 if (op3 == phi_name || op4 == phi_name)
3021 {
3022 if (dump_enabled_p ())
3023 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3024 "reduction: condition depends on previous"
3025 " iteration: ");
3026 return NULL;
3027 }
3028
3029 op1 = gimple_assign_rhs2 (def_stmt);
3030 op2 = gimple_assign_rhs3 (def_stmt);
3031 }
3032 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3033 {
3034 if (dump_enabled_p ())
3035 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3036 "reduction: not commutative/associative: ");
3037 return NULL;
3038 }
3039 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3040 {
3041 op1 = gimple_assign_rhs1 (def_stmt);
3042 op2 = gimple_assign_rhs2 (def_stmt);
3043 }
3044 else
3045 {
3046 if (dump_enabled_p ())
3047 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3048 "reduction: not handled operation: ");
3049 return NULL;
3050 }
3051
3052 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3053 {
3054 if (dump_enabled_p ())
3055 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3056 "reduction: both uses not ssa_names: ");
3057
3058 return NULL;
3059 }
3060
3061 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3062 if ((TREE_CODE (op1) == SSA_NAME
3063 && !types_compatible_p (type,TREE_TYPE (op1)))
3064 || (TREE_CODE (op2) == SSA_NAME
3065 && !types_compatible_p (type, TREE_TYPE (op2)))
3066 || (op3 && TREE_CODE (op3) == SSA_NAME
3067 && !types_compatible_p (type, TREE_TYPE (op3)))
3068 || (op4 && TREE_CODE (op4) == SSA_NAME
3069 && !types_compatible_p (type, TREE_TYPE (op4))))
3070 {
3071 if (dump_enabled_p ())
3072 {
3073 dump_printf_loc (MSG_NOTE, vect_location,
3074 "reduction: multiple types: operation type: "
3075 "%T, operands types: %T,%T",
3076 type, TREE_TYPE (op1), TREE_TYPE (op2));
3077 if (op3)
3078 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3079
3080 if (op4)
3081 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3082 dump_printf (MSG_NOTE, "\n");
3083 }
3084
3085 return NULL;
3086 }
3087
3088 /* Check whether it's ok to change the order of the computation.
3089 Generally, when vectorizing a reduction we change the order of the
3090 computation. This may change the behavior of the program in some
3091 cases, so we need to check that this is ok. One exception is when
3092 vectorizing an outer-loop: the inner-loop is executed sequentially,
3093 and therefore vectorizing reductions in the inner-loop during
3094 outer-loop vectorization is safe. */
3095 if (check_reduction
3096 && *v_reduc_type == TREE_CODE_REDUCTION
3097 && needs_fold_left_reduction_p (type, code,
3098 need_wrapping_integral_overflow))
3099 *v_reduc_type = FOLD_LEFT_REDUCTION;
3100
3101 /* Reduction is safe. We're dealing with one of the following:
3102 1) integer arithmetic and no trapv
3103 2) floating point arithmetic, and special flags permit this optimization
3104 3) nested cycle (i.e., outer loop vectorization). */
3105 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3106 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3107 if (code != COND_EXPR && !def1_info && !def2_info)
3108 {
3109 if (dump_enabled_p ())
3110 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3111 return NULL;
3112 }
3113
3114 /* Check that one def is the reduction def, defined by PHI,
3115 the other def is either defined in the loop ("vect_internal_def"),
3116 or it's an induction (defined by a loop-header phi-node). */
3117
3118 if (def2_info
3119 && def2_info->stmt == phi
3120 && (code == COND_EXPR
3121 || !def1_info
3122 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3123 || vect_valid_reduction_input_p (def1_info)))
3124 {
3125 if (dump_enabled_p ())
3126 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3127 return def_stmt_info;
3128 }
3129
3130 if (def1_info
3131 && def1_info->stmt == phi
3132 && (code == COND_EXPR
3133 || !def2_info
3134 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3135 || vect_valid_reduction_input_p (def2_info)))
3136 {
3137 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3138 {
3139 /* Check if we can swap operands (just for simplicity - so that
3140 the rest of the code can assume that the reduction variable
3141 is always the last (second) argument). */
3142 if (code == COND_EXPR)
3143 {
3144 /* Swap cond_expr by inverting the condition. */
3145 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3146 enum tree_code invert_code = ERROR_MARK;
3147 enum tree_code cond_code = TREE_CODE (cond_expr);
3148
3149 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3150 {
3151 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3152 invert_code = invert_tree_comparison (cond_code, honor_nans);
3153 }
3154 if (invert_code != ERROR_MARK)
3155 {
3156 TREE_SET_CODE (cond_expr, invert_code);
3157 swap_ssa_operands (def_stmt,
3158 gimple_assign_rhs2_ptr (def_stmt),
3159 gimple_assign_rhs3_ptr (def_stmt));
3160 }
3161 else
3162 {
3163 if (dump_enabled_p ())
3164 report_vect_op (MSG_NOTE, def_stmt,
3165 "detected reduction: cannot swap operands "
3166 "for cond_expr");
3167 return NULL;
3168 }
3169 }
3170 else
3171 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3172 gimple_assign_rhs2_ptr (def_stmt));
3173
3174 if (dump_enabled_p ())
3175 report_vect_op (MSG_NOTE, def_stmt,
3176 "detected reduction: need to swap operands: ");
3177
3178 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3179 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3180 }
3181 else
3182 {
3183 if (dump_enabled_p ())
3184 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3185 }
3186
3187 return def_stmt_info;
3188 }
3189
3190 /* Try to find SLP reduction chain. */
3191 if (! nested_in_vect_loop
3192 && code != COND_EXPR
3193 && orig_code != MINUS_EXPR
3194 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3195 {
3196 if (dump_enabled_p ())
3197 report_vect_op (MSG_NOTE, def_stmt,
3198 "reduction: detected reduction chain: ");
3199
3200 return def_stmt_info;
3201 }
3202
3203 /* Dissolve any group half-built by vect_is_slp_reduction. */
3204 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3205 while (first)
3206 {
3207 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3208 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3209 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3210 first = next;
3211 }
3212
3213 /* Look for the expression computing loop_arg from loop PHI result. */
3214 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3215 return def_stmt_info;
3216
3217 if (dump_enabled_p ())
3218 {
3219 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3220 "reduction: unknown pattern: ");
3221 }
3222
3223 return NULL;
3224 }
3225
3226 /* Wrapper around vect_is_simple_reduction, which will modify code
3227 in-place if it enables detection of more reductions. Arguments
3228 as there. */
3229
3230 stmt_vec_info
3231 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3232 bool *double_reduc,
3233 bool need_wrapping_integral_overflow)
3234 {
3235 enum vect_reduction_type v_reduc_type;
3236 stmt_vec_info def_info
3237 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3238 need_wrapping_integral_overflow,
3239 &v_reduc_type);
3240 if (def_info)
3241 {
3242 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3243 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3244 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3245 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3246 }
3247 return def_info;
3248 }
3249
3250 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
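/* In addition, compute *PEEL_ITERS_EPILOGUE and record the scalar costs
   from SCALAR_COST_VEC, scaled by the respective peel counts, into
   PROLOGUE_COST_VEC and EPILOGUE_COST_VEC.  */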
3251 int
3252 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3253 int *peel_iters_epilogue,
3254 stmt_vector_for_cost *scalar_cost_vec,
3255 stmt_vector_for_cost *prologue_cost_vec,
3256 stmt_vector_for_cost *epilogue_cost_vec)
3257 {
3258 int retval = 0;
3259 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3260
3261 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3262 {
3263 *peel_iters_epilogue = assumed_vf / 2;
3264 if (dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location,
3266 "cost model: epilogue peel iters set to vf/2 "
3267 "because loop iterations are unknown .\n");
3268
3269 /* If peeled iterations are known but the number of scalar loop
3270 iterations is unknown, count a taken branch per peeled loop. */
3271 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3272 NULL, 0, vect_prologue);
3273 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3274 NULL, 0, vect_epilogue);
3275 }
3276 else
3277 {
3278 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3279 peel_iters_prologue = niters < peel_iters_prologue ?
3280 niters : peel_iters_prologue;
3281 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3282 /* If we need to peel for gaps, but no peeling is required, we have to
3283 peel VF iterations. */
3284 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3285 *peel_iters_epilogue = assumed_vf;
3286 }
3287
3288 stmt_info_for_cost *si;
3289 int j;
3290 if (peel_iters_prologue)
3291 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3292 retval += record_stmt_cost (prologue_cost_vec,
3293 si->count * peel_iters_prologue,
3294 si->kind, si->stmt_info, si->misalign,
3295 vect_prologue);
3296 if (*peel_iters_epilogue)
3297 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3298 retval += record_stmt_cost (epilogue_cost_vec,
3299 si->count * *peel_iters_epilogue,
3300 si->kind, si->stmt_info, si->misalign,
3301 vect_epilogue);
3302
3303 return retval;
3304 }
3305
3306 /* Function vect_estimate_min_profitable_iters
3307
3308 Return the number of iterations required for the vector version of the
3309 loop to be profitable relative to the cost of the scalar version of the
3310 loop.
3311
3312 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3313 of iterations for vectorization. -1 value means loop vectorization
3314 is not profitable. This returned value may be used for dynamic
3315 profitability check.
3316
3317 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3318 for static check against estimated number of iterations. */
3319
3320 static void
3321 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3322 int *ret_min_profitable_niters,
3323 int *ret_min_profitable_estimate)
3324 {
3325 int min_profitable_iters;
3326 int min_profitable_estimate;
3327 int peel_iters_prologue;
3328 int peel_iters_epilogue;
3329 unsigned vec_inside_cost = 0;
3330 int vec_outside_cost = 0;
3331 unsigned vec_prologue_cost = 0;
3332 unsigned vec_epilogue_cost = 0;
3333 int scalar_single_iter_cost = 0;
3334 int scalar_outside_cost = 0;
3335 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3336 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3337 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3338
3339 /* Cost model disabled. */
3340 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3341 {
3342 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3344 *ret_min_profitable_niters = 0;
3345 *ret_min_profitable_estimate = 0;
3346 return;
3347 }
3348
3349 /* Requires loop versioning tests to handle misalignment. */
3350 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3351 {
3352 /* FIXME: Make cost depend on complexity of individual check. */
3353 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3354 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3355 vect_prologue);
3356 if (dump_enabled_p ())
3357 dump_printf (MSG_NOTE,
3358 "cost model: Adding cost of checks for loop "
3359 "versioning to treat misalignment.\n");
3360 }
3361
3362 /* Requires loop versioning with alias checks. */
3363 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3364 {
3365 /* FIXME: Make cost depend on complexity of individual check. */
3366 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3367 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3368 vect_prologue);
3369 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3370 if (len)
3371 /* Count LEN - 1 ANDs and LEN comparisons. */
3372 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3373 NULL, 0, vect_prologue);
3374 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3375 if (len)
3376 {
3377 /* Count LEN - 1 ANDs and LEN comparisons. */
3378 unsigned int nstmts = len * 2 - 1;
3379 /* +1 for each bias that needs adding. */
3380 for (unsigned int i = 0; i < len; ++i)
3381 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3382 nstmts += 1;
3383 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3384 NULL, 0, vect_prologue);
3385 }
3386 if (dump_enabled_p ())
3387 dump_printf (MSG_NOTE,
3388 "cost model: Adding cost of checks for loop "
3389 "versioning aliasing.\n");
3390 }
3391
3392 /* Requires loop versioning with niter checks. */
3393 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3394 {
3395 /* FIXME: Make cost depend on complexity of individual check. */
3396 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3397 vect_prologue);
3398 if (dump_enabled_p ())
3399 dump_printf (MSG_NOTE,
3400 "cost model: Adding cost of checks for loop "
3401 "versioning niters.\n");
3402 }
3403
3404 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3405 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3406 vect_prologue);
3407
3408 /* Count statements in scalar loop. Using this as scalar cost for a single
3409 iteration for now.
3410
3411 TODO: Add outer loop support.
3412
3413 TODO: Consider assigning different costs to different scalar
3414 statements. */
3415
3416 scalar_single_iter_cost
3417 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3418
3419 /* Add additional cost for the peeled instructions in prologue and epilogue
3420 loop. (For fully-masked loops there will be no peeling.)
3421
3422 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3423 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3424
3425 TODO: Build an expression that represents peel_iters for prologue and
3426 epilogue to be used in a run-time test. */
3427
3428 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3429 {
3430 peel_iters_prologue = 0;
3431 peel_iters_epilogue = 0;
3432
3433 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3434 {
3435 /* We need to peel exactly one iteration. */
3436 peel_iters_epilogue += 1;
3437 stmt_info_for_cost *si;
3438 int j;
3439 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3440 j, si)
3441 (void) add_stmt_cost (target_cost_data, si->count,
3442 si->kind, si->stmt_info, si->misalign,
3443 vect_epilogue);
3444 }
3445 }
3446 else if (npeel < 0)
3447 {
3448 peel_iters_prologue = assumed_vf / 2;
3449 if (dump_enabled_p ())
3450 dump_printf (MSG_NOTE, "cost model: "
3451 "prologue peel iters set to vf/2.\n");
3452
3453 /* If peeling for alignment is unknown, the loop bound of the main
3454 loop becomes unknown. */
3455 peel_iters_epilogue = assumed_vf / 2;
3456 if (dump_enabled_p ())
3457 dump_printf (MSG_NOTE, "cost model: "
3458 "epilogue peel iters set to vf/2 because "
3459 "peeling for alignment is unknown.\n");
3460
3461 /* If peeled iterations are unknown, count a taken branch and a not taken
3462 branch per peeled loop. Even if scalar loop iterations are known,
3463 vector iterations are not known since peeled prologue iterations are
3464 not known. Hence guards remain the same. */
3465 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3466 NULL, 0, vect_prologue);
3467 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3468 NULL, 0, vect_prologue);
3469 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3470 NULL, 0, vect_epilogue);
3471 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3472 NULL, 0, vect_epilogue);
3473 stmt_info_for_cost *si;
3474 int j;
3475 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3476 {
3477 (void) add_stmt_cost (target_cost_data,
3478 si->count * peel_iters_prologue,
3479 si->kind, si->stmt_info, si->misalign,
3480 vect_prologue);
3481 (void) add_stmt_cost (target_cost_data,
3482 si->count * peel_iters_epilogue,
3483 si->kind, si->stmt_info, si->misalign,
3484 vect_epilogue);
3485 }
3486 }
3487 else
3488 {
3489 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3490 stmt_info_for_cost *si;
3491 int j;
3492 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3493
3494 prologue_cost_vec.create (2);
3495 epilogue_cost_vec.create (2);
3496 peel_iters_prologue = npeel;
3497
3498 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3499 &peel_iters_epilogue,
3500 &LOOP_VINFO_SCALAR_ITERATION_COST
3501 (loop_vinfo),
3502 &prologue_cost_vec,
3503 &epilogue_cost_vec);
3504
3505 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3506 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3507 si->misalign, vect_prologue);
3508
3509 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3510 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3511 si->misalign, vect_epilogue);
3512
3513 prologue_cost_vec.release ();
3514 epilogue_cost_vec.release ();
3515 }
3516
3517 /* FORNOW: The scalar outside cost is incremented in one of the
3518 following ways:
3519
3520 1. The vectorizer checks for alignment and aliasing and generates
3521 a condition that allows dynamic vectorization. A cost model
3522 check is ANDED with the versioning condition. Hence scalar code
3523 path now has the added cost of the versioning check.
3524
3525 if (cost > th & versioning_check)
3526 jmp to vector code
3527
3528 Hence run-time scalar is incremented by not-taken branch cost.
3529
3530 2. The vectorizer then checks if a prologue is required. If the
3531 cost model check was not done before during versioning, it has to
3532 be done before the prologue check.
3533
3534 if (cost <= th)
3535 prologue = scalar_iters
3536 if (prologue == 0)
3537 jmp to vector code
3538 else
3539 execute prologue
3540 if (prologue == num_iters)
3541 go to exit
3542
3543 Hence the run-time scalar cost is incremented by a taken branch,
3544 plus a not-taken branch, plus a taken branch cost.
3545
3546 3. The vectorizer then checks if an epilogue is required. If the
3547 cost model check was not done before during prologue check, it
3548 has to be done with the epilogue check.
3549
3550 if (prologue == 0)
3551 jmp to vector code
3552 else
3553 execute prologue
3554 if (prologue == num_iters)
3555 go to exit
3556 vector code:
3557 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3558 jmp to epilogue
3559
3560 Hence the run-time scalar cost should be incremented by 2 taken
3561 branches.
3562
3563     TODO: The back end may reorder the BBs differently and reverse
3564 conditions/branch directions. Change the estimates below to
3565 something more reasonable. */
3566
3567 /* If the number of iterations is known and we do not do versioning, we can
3568 decide whether to vectorize at compile time. Hence the scalar version
3569     does not carry cost model guard costs.  */
3570 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3571 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3572 {
3573 /* Cost model check occurs at versioning. */
3574 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3575 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3576 else
3577 {
3578 /* Cost model check occurs at prologue generation. */
3579 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3580 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3581 + vect_get_stmt_cost (cond_branch_not_taken);
3582 /* Cost model check occurs at epilogue generation. */
3583 else
3584 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3585 }
3586 }
3587
3588 /* Complete the target-specific cost calculations. */
3589 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3590 &vec_inside_cost, &vec_epilogue_cost);
3591
3592 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3593
3594 if (dump_enabled_p ())
3595 {
3596 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3597 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3598 vec_inside_cost);
3599 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3600 vec_prologue_cost);
3601 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3602 vec_epilogue_cost);
3603 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3604 scalar_single_iter_cost);
3605 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3606 scalar_outside_cost);
3607 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3608 vec_outside_cost);
3609 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3610 peel_iters_prologue);
3611 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3612 peel_iters_epilogue);
3613 }
3614
3615 /* Calculate number of iterations required to make the vector version
3616 profitable, relative to the loop bodies only. The following condition
3617 must hold true:
3618 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3619 where
3620 SIC = scalar iteration cost, VIC = vector iteration cost,
3621 VOC = vector outside cost, VF = vectorization factor,
3622 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3623 SOC = scalar outside cost for run time cost model check. */
3624
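  /* Worked example with purely illustrative costs: for SIC = 4, VIC = 12,
     VOC = 28, SOC = 6, VF = 4 and PL_ITERS = EP_ITERS = 2, the division
     below gives
       ((28 - 6) * 4 - 12 * 2 - 12 * 2) / (4 * 4 - 12) = 40 / 4 = 10,
     and the rounding check that follows bumps this to 11, because
     4 * 4 * 10 <= 12 * 10 + (28 - 6) * 4.  */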
3625 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3626 {
3627 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3628 * assumed_vf
3629 - vec_inside_cost * peel_iters_prologue
3630 - vec_inside_cost * peel_iters_epilogue);
3631 if (min_profitable_iters <= 0)
3632 min_profitable_iters = 0;
3633 else
3634 {
3635 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3636 - vec_inside_cost);
3637
3638 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3639 <= (((int) vec_inside_cost * min_profitable_iters)
3640 + (((int) vec_outside_cost - scalar_outside_cost)
3641 * assumed_vf)))
3642 min_profitable_iters++;
3643 }
3644 }
3645   /* The vector version will never be profitable.  */
3646 else
3647 {
3648 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3649 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3650 "vectorization did not happen for a simd loop");
3651
3652 if (dump_enabled_p ())
3653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3654 "cost model: the vector iteration cost = %d "
3655 "divided by the scalar iteration cost = %d "
3656 "is greater or equal to the vectorization factor = %d"
3657 ".\n",
3658 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3659 *ret_min_profitable_niters = -1;
3660 *ret_min_profitable_estimate = -1;
3661 return;
3662 }
3663
3664 if (dump_enabled_p ())
3665 dump_printf (MSG_NOTE,
3666 " Calculated minimum iters for profitability: %d\n",
3667 min_profitable_iters);
3668
3669 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3670 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3671 /* We want the vectorized loop to execute at least once. */
3672 min_profitable_iters = assumed_vf + peel_iters_prologue;
3673
3674 if (dump_enabled_p ())
3675 dump_printf_loc (MSG_NOTE, vect_location,
3676 " Runtime profitability threshold = %d\n",
3677 min_profitable_iters);
3678
3679 *ret_min_profitable_niters = min_profitable_iters;
3680
3681 /* Calculate number of iterations required to make the vector version
3682 profitable, relative to the loop bodies only.
3683
3684     The non-vectorized variant costs SIC * niters and it must win over the
3685     vector variant on the expected loop trip count.  The following condition must hold true:
3686 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3687
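  /* Rearranging the inequality above (multiply through by VF and collect
     the niters terms) gives
       niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
                / (SIC * VF - VIC),
     which is the expression computed below.  */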
3688 if (vec_outside_cost <= 0)
3689 min_profitable_estimate = 0;
3690 else
3691 {
3692 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3693 * assumed_vf
3694 - vec_inside_cost * peel_iters_prologue
3695 - vec_inside_cost * peel_iters_epilogue)
3696 / ((scalar_single_iter_cost * assumed_vf)
3697 - vec_inside_cost);
3698 }
3699 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3700 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_NOTE, vect_location,
3702 " Static estimate profitability threshold = %d\n",
3703 min_profitable_estimate);
3704
3705 *ret_min_profitable_estimate = min_profitable_estimate;
3706 }
3707
3708 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3709 vector elements (not bits) for a vector with NELT elements. */
3710 static void
3711 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3712 vec_perm_builder *sel)
3713 {
3714 /* The encoding is a single stepped pattern. Any wrap-around is handled
3715 by vec_perm_indices. */
3716 sel->new_vector (nelt, 1, 3);
3717 for (unsigned int i = 0; i < 3; i++)
3718 sel->quick_push (i + offset);
3719 }
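
/* As an illustration (with hypothetical arguments, not taken from any
   particular caller): calc_vec_perm_mask_for_shift (2, 8, &sel) pushes the
   three elements {2, 3, 4}; vec_perm_indices then extends this single
   stepped pattern to all eight lanes, giving the selector
   {2, 3, 4, 5, 6, 7, 8, 9}, where indices 8 and 9 select from the second
   operand of the permutation (so with a zero second operand the effect is
   a shift right by two whole elements).  */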
3720
3721 /* Checks whether the target supports whole-vector shifts for vectors of mode
3722 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3723 it supports vec_perm_const with masks for all necessary shift amounts. */
3724 static bool
3725 have_whole_vector_shift (machine_mode mode)
3726 {
3727 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3728 return true;
3729
3730 /* Variable-length vectors should be handled via the optab. */
3731 unsigned int nelt;
3732 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3733 return false;
3734
3735 vec_perm_builder sel;
3736 vec_perm_indices indices;
3737 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3738 {
3739 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3740 indices.new_vector (sel, 2, nelt);
3741 if (!can_vec_perm_const_p (mode, indices, false))
3742 return false;
3743 }
3744 return true;
3745 }
3746
3747 /* TODO: There is a close dependency between the vect_model_*_cost and
3748    vectorizable_* functions.  Design this better to avoid maintenance issues.  */
3749
3750 /* Function vect_model_reduction_cost.
3751
3752 Models cost for a reduction operation, including the vector ops
3753 generated within the strip-mine loop, the initial definition before
3754 the loop, and the epilogue code that must be generated. */
3755
3756 static void
3757 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3758 int ncopies, stmt_vector_for_cost *cost_vec)
3759 {
3760 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3761 enum tree_code code;
3762 optab optab;
3763 tree vectype;
3764 machine_mode mode;
3765 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3766 struct loop *loop = NULL;
3767
3768 if (loop_vinfo)
3769 loop = LOOP_VINFO_LOOP (loop_vinfo);
3770
3771 /* Condition reductions generate two reductions in the loop. */
3772 vect_reduction_type reduction_type
3773 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3774 if (reduction_type == COND_REDUCTION)
3775 ncopies *= 2;
3776
3777 vectype = STMT_VINFO_VECTYPE (stmt_info);
3778 mode = TYPE_MODE (vectype);
3779 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3780
3781 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3782
3783 if (reduction_type == EXTRACT_LAST_REDUCTION
3784 || reduction_type == FOLD_LEFT_REDUCTION)
3785 {
3786 /* No extra instructions needed in the prologue. */
3787 prologue_cost = 0;
3788
3789 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3790 /* Count one reduction-like operation per vector. */
3791 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3792 stmt_info, 0, vect_body);
3793 else
3794 {
3795 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3796 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3797 inside_cost = record_stmt_cost (cost_vec, nelements,
3798 vec_to_scalar, stmt_info, 0,
3799 vect_body);
3800 inside_cost += record_stmt_cost (cost_vec, nelements,
3801 scalar_stmt, stmt_info, 0,
3802 vect_body);
3803 }
3804 }
3805 else
3806 {
3807 /* Add in cost for initial definition.
3808 For cond reduction we have four vectors: initial index, step,
3809 initial result of the data reduction, initial value of the index
3810 reduction. */
3811 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3812 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3813 scalar_to_vec, stmt_info, 0,
3814 vect_prologue);
3815
3816 /* Cost of reduction op inside loop. */
3817 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3818 stmt_info, 0, vect_body);
3819 }
3820
3821 /* Determine cost of epilogue code.
3822
3823 We have a reduction operator that will reduce the vector in one statement.
3824 Also requires scalar extract. */
3825
3826 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3827 {
3828 if (reduc_fn != IFN_LAST)
3829 {
3830 if (reduction_type == COND_REDUCTION)
3831 {
3832 	      /* An EQ stmt and a COND_EXPR stmt.  */
3833 epilogue_cost += record_stmt_cost (cost_vec, 2,
3834 vector_stmt, stmt_info, 0,
3835 vect_epilogue);
3836 /* Reduction of the max index and a reduction of the found
3837 values. */
3838 epilogue_cost += record_stmt_cost (cost_vec, 2,
3839 vec_to_scalar, stmt_info, 0,
3840 vect_epilogue);
3841 /* A broadcast of the max value. */
3842 epilogue_cost += record_stmt_cost (cost_vec, 1,
3843 scalar_to_vec, stmt_info, 0,
3844 vect_epilogue);
3845 }
3846 else
3847 {
3848 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3849 stmt_info, 0, vect_epilogue);
3850 epilogue_cost += record_stmt_cost (cost_vec, 1,
3851 vec_to_scalar, stmt_info, 0,
3852 vect_epilogue);
3853 }
3854 }
3855 else if (reduction_type == COND_REDUCTION)
3856 {
3857 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3858 /* Extraction of scalar elements. */
3859 epilogue_cost += record_stmt_cost (cost_vec,
3860 2 * estimated_nunits,
3861 vec_to_scalar, stmt_info, 0,
3862 vect_epilogue);
3863 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3864 epilogue_cost += record_stmt_cost (cost_vec,
3865 2 * estimated_nunits - 3,
3866 scalar_stmt, stmt_info, 0,
3867 vect_epilogue);
3868 }
3869 else if (reduction_type == EXTRACT_LAST_REDUCTION
3870 || reduction_type == FOLD_LEFT_REDUCTION)
3871 	/* No extra instructions are needed in the epilogue.  */
3872 ;
3873 else
3874 {
3875 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3876 tree bitsize =
3877 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3878 int element_bitsize = tree_to_uhwi (bitsize);
3879 int nelements = vec_size_in_bits / element_bitsize;
3880
3881 if (code == COND_EXPR)
3882 code = MAX_EXPR;
3883
3884 optab = optab_for_tree_code (code, vectype, optab_default);
3885
3886 /* We have a whole vector shift available. */
3887 if (optab != unknown_optab
3888 && VECTOR_MODE_P (mode)
3889 && optab_handler (optab, mode) != CODE_FOR_nothing
3890 && have_whole_vector_shift (mode))
3891 {
3892 /* Final reduction via vector shifts and the reduction operator.
3893 Also requires scalar extract. */
3894 epilogue_cost += record_stmt_cost (cost_vec,
3895 exact_log2 (nelements) * 2,
3896 vector_stmt, stmt_info, 0,
3897 vect_epilogue);
3898 epilogue_cost += record_stmt_cost (cost_vec, 1,
3899 vec_to_scalar, stmt_info, 0,
3900 vect_epilogue);
3901 }
3902 else
3903 /* Use extracts and reduction op for final reduction. For N
3904 elements, we have N extracts and N-1 reduction ops. */
3905 epilogue_cost += record_stmt_cost (cost_vec,
3906 nelements + nelements - 1,
3907 vector_stmt, stmt_info, 0,
3908 vect_epilogue);
3909 }
3910 }
3911
3912 if (dump_enabled_p ())
3913 dump_printf (MSG_NOTE,
3914 "vect_model_reduction_cost: inside_cost = %d, "
3915 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3916 prologue_cost, epilogue_cost);
3917 }
3918
3919
3920 /* Function vect_model_induction_cost.
3921
3922 Models cost for induction operations. */
3923
3924 static void
3925 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3926 stmt_vector_for_cost *cost_vec)
3927 {
3928 unsigned inside_cost, prologue_cost;
3929
3930 if (PURE_SLP_STMT (stmt_info))
3931 return;
3932
3933 /* loop cost for vec_loop. */
3934 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3935 stmt_info, 0, vect_body);
3936
3937 /* prologue cost for vec_init and vec_step. */
3938 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3939 stmt_info, 0, vect_prologue);
3940
3941 if (dump_enabled_p ())
3942 dump_printf_loc (MSG_NOTE, vect_location,
3943 "vect_model_induction_cost: inside_cost = %d, "
3944 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3945 }
3946
3947
3948
3949 /* Function get_initial_def_for_reduction
3950
3951 Input:
3952 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3953 INIT_VAL - the initial value of the reduction variable
3954
3955 Output:
3956 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3957 of the reduction (used for adjusting the epilog - see below).
3958 Return a vector variable, initialized according to the operation that
3959 STMT_VINFO performs. This vector will be used as the initial value
3960 of the vector of partial results.
3961
3962 Option1 (adjust in epilog): Initialize the vector as follows:
3963 add/bit or/xor: [0,0,...,0,0]
3964 mult/bit and: [1,1,...,1,1]
3965 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3966 and when necessary (e.g. add/mult case) let the caller know
3967 that it needs to adjust the result by init_val.
3968
3969 Option2: Initialize the vector as follows:
3970 add/bit or/xor: [init_val,0,0,...,0]
3971 mult/bit and: [init_val,1,1,...,1]
3972 min/max/cond_expr: [init_val,init_val,...,init_val]
3973 and no adjustments are needed.
3974
3975 For example, for the following code:
3976
3977 s = init_val;
3978 for (i=0;i<n;i++)
3979 s = s + a[i];
3980
3981 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3982 For a vector of 4 units, we want to return either [0,0,0,init_val],
3983 or [0,0,0,0] and let the caller know that it needs to adjust
3984 the result at the end by 'init_val'.
3985
3986    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3987    is not NULL, because its initialization vector is simpler (the same
3988    element in all entries), and Option2 otherwise.
3989
3990 A cost model should help decide between these two schemes. */
3991
3992 tree
3993 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3994 tree *adjustment_def)
3995 {
3996 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3997 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3998 tree scalar_type = TREE_TYPE (init_val);
3999 tree vectype = get_vectype_for_scalar_type (scalar_type);
4000 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4001 tree def_for_init;
4002 tree init_def;
4003 REAL_VALUE_TYPE real_init_val = dconst0;
4004 int int_init_val = 0;
4005 gimple_seq stmts = NULL;
4006
4007 gcc_assert (vectype);
4008
4009 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4010 || SCALAR_FLOAT_TYPE_P (scalar_type));
4011
4012 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4013 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4014
4015 vect_reduction_type reduction_type
4016 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4017
4018 switch (code)
4019 {
4020 case WIDEN_SUM_EXPR:
4021 case DOT_PROD_EXPR:
4022 case SAD_EXPR:
4023 case PLUS_EXPR:
4024 case MINUS_EXPR:
4025 case BIT_IOR_EXPR:
4026 case BIT_XOR_EXPR:
4027 case MULT_EXPR:
4028 case BIT_AND_EXPR:
4029 {
4030 /* ADJUSTMENT_DEF is NULL when called from
4031 vect_create_epilog_for_reduction to vectorize double reduction. */
4032 if (adjustment_def)
4033 *adjustment_def = init_val;
4034
4035 if (code == MULT_EXPR)
4036 {
4037 real_init_val = dconst1;
4038 int_init_val = 1;
4039 }
4040
4041 if (code == BIT_AND_EXPR)
4042 int_init_val = -1;
4043
4044 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4045 def_for_init = build_real (scalar_type, real_init_val);
4046 else
4047 def_for_init = build_int_cst (scalar_type, int_init_val);
4048
4049 if (adjustment_def)
4050 /* Option1: the first element is '0' or '1' as well. */
4051 init_def = gimple_build_vector_from_val (&stmts, vectype,
4052 def_for_init);
4053 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4054 {
4055 /* Option2 (variable length): the first element is INIT_VAL. */
4056 init_def = gimple_build_vector_from_val (&stmts, vectype,
4057 def_for_init);
4058 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4059 vectype, init_def, init_val);
4060 }
4061 else
4062 {
4063 /* Option2: the first element is INIT_VAL. */
4064 tree_vector_builder elts (vectype, 1, 2);
4065 elts.quick_push (init_val);
4066 elts.quick_push (def_for_init);
4067 init_def = gimple_build_vector (&stmts, &elts);
4068 }
4069 }
4070 break;
4071
4072 case MIN_EXPR:
4073 case MAX_EXPR:
4074 case COND_EXPR:
4075 {
4076 if (adjustment_def)
4077 {
4078 *adjustment_def = NULL_TREE;
4079 if (reduction_type != COND_REDUCTION
4080 && reduction_type != EXTRACT_LAST_REDUCTION)
4081 {
4082 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4083 break;
4084 }
4085 }
4086 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4087 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4088 }
4089 break;
4090
4091 default:
4092 gcc_unreachable ();
4093 }
4094
4095 if (stmts)
4096 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4097 return init_def;
4098 }
4099
4100 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4101 NUMBER_OF_VECTORS is the number of vector defs to create.
4102 If NEUTRAL_OP is nonnull, introducing extra elements of that
4103 value will not change the result. */
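
/* For example, following the values used by get_initial_def_for_reduction
   above: 0 is neutral for PLUS_EXPR, BIT_IOR_EXPR and BIT_XOR_EXPR, 1 for
   MULT_EXPR and all-ones for BIT_AND_EXPR, while MIN_EXPR and MAX_EXPR
   have no universal neutral value and fall back to the initial scalar
   value instead.  */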
4104
4105 static void
4106 get_initial_defs_for_reduction (slp_tree slp_node,
4107 vec<tree> *vec_oprnds,
4108 unsigned int number_of_vectors,
4109 bool reduc_chain, tree neutral_op)
4110 {
4111 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4112 stmt_vec_info stmt_vinfo = stmts[0];
4113 unsigned HOST_WIDE_INT nunits;
4114 unsigned j, number_of_places_left_in_vector;
4115 tree vector_type;
4116 tree vop;
4117 int group_size = stmts.length ();
4118 unsigned int vec_num, i;
4119 unsigned number_of_copies = 1;
4120 vec<tree> voprnds;
4121 voprnds.create (number_of_vectors);
4122 struct loop *loop;
4123 auto_vec<tree, 16> permute_results;
4124
4125 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4126
4127 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4128
4129 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4130 gcc_assert (loop);
4131 edge pe = loop_preheader_edge (loop);
4132
4133 gcc_assert (!reduc_chain || neutral_op);
4134
4135 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4136 created vectors. It is greater than 1 if unrolling is performed.
4137
4138 For example, we have two scalar operands, s1 and s2 (e.g., group of
4139 strided accesses of size two), while NUNITS is four (i.e., four scalars
4140 of this type can be packed in a vector). The output vector will contain
4141 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4142 will be 2).
4143
4144 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4145 vectors containing the operands.
4146
4147 For example, NUNITS is four as before, and the group size is 8
4148 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4149 {s5, s6, s7, s8}. */
4150
4151 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4152 nunits = group_size;
4153
4154 number_of_copies = nunits * number_of_vectors / group_size;
4155
4156 number_of_places_left_in_vector = nunits;
4157 bool constant_p = true;
4158 tree_vector_builder elts (vector_type, nunits, 1);
4159 elts.quick_grow (nunits);
4160 for (j = 0; j < number_of_copies; j++)
4161 {
4162 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4163 {
4164 tree op;
4165 /* Get the def before the loop. In reduction chain we have only
4166 one initial value. */
4167 if ((j != (number_of_copies - 1)
4168 || (reduc_chain && i != 0))
4169 && neutral_op)
4170 op = neutral_op;
4171 else
4172 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4173
4174 /* Create 'vect_ = {op0,op1,...,opn}'. */
4175 number_of_places_left_in_vector--;
4176 elts[number_of_places_left_in_vector] = op;
4177 if (!CONSTANT_CLASS_P (op))
4178 constant_p = false;
4179
4180 if (number_of_places_left_in_vector == 0)
4181 {
4182 gimple_seq ctor_seq = NULL;
4183 tree init;
4184 if (constant_p && !neutral_op
4185 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4186 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4187 /* Build the vector directly from ELTS. */
4188 init = gimple_build_vector (&ctor_seq, &elts);
4189 else if (neutral_op)
4190 {
4191 /* Build a vector of the neutral value and shift the
4192 other elements into place. */
4193 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4194 neutral_op);
4195 int k = nunits;
4196 while (k > 0 && elts[k - 1] == neutral_op)
4197 k -= 1;
4198 while (k > 0)
4199 {
4200 k -= 1;
4201 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4202 vector_type, init, elts[k]);
4203 }
4204 }
4205 else
4206 {
4207 /* First time round, duplicate ELTS to fill the
4208 required number of vectors, then cherry pick the
4209 appropriate result for each iteration. */
4210 if (vec_oprnds->is_empty ())
4211 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4212 number_of_vectors,
4213 permute_results);
4214 init = permute_results[number_of_vectors - j - 1];
4215 }
4216 if (ctor_seq != NULL)
4217 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4218 voprnds.quick_push (init);
4219
4220 number_of_places_left_in_vector = nunits;
4221 elts.new_vector (vector_type, nunits, 1);
4222 elts.quick_grow (nunits);
4223 constant_p = true;
4224 }
4225 }
4226 }
4227
4228   /* Since the vectors are created in the reverse order, we should reverse
4229 them. */
4230 vec_num = voprnds.length ();
4231 for (j = vec_num; j != 0; j--)
4232 {
4233 vop = voprnds[j - 1];
4234 vec_oprnds->quick_push (vop);
4235 }
4236
4237 voprnds.release ();
4238
4239 /* In case that VF is greater than the unrolling factor needed for the SLP
4240 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4241 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4242 to replicate the vectors. */
4243 tree neutral_vec = NULL;
4244 while (number_of_vectors > vec_oprnds->length ())
4245 {
4246 if (neutral_op)
4247 {
4248 if (!neutral_vec)
4249 {
4250 gimple_seq ctor_seq = NULL;
4251 neutral_vec = gimple_build_vector_from_val
4252 (&ctor_seq, vector_type, neutral_op);
4253 if (ctor_seq != NULL)
4254 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4255 }
4256 vec_oprnds->quick_push (neutral_vec);
4257 }
4258 else
4259 {
4260 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4261 vec_oprnds->quick_push (vop);
4262 }
4263 }
4264 }
4265
4266
4267 /* Function vect_create_epilog_for_reduction
4268
4269 Create code at the loop-epilog to finalize the result of a reduction
4270 computation.
4271
4272 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4273 reduction statements.
4274 STMT_INFO is the scalar reduction stmt that is being vectorized.
4275 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4276 number of elements that we can fit in a vectype (nunits). In this case
4277 we have to generate more than one vector stmt - i.e - we need to "unroll"
4278 the vector stmt by a factor VF/nunits. For more details see documentation
4279 in vectorizable_operation.
4280 REDUC_FN is the internal function for the epilog reduction.
4281 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4282 computation.
4283 REDUC_INDEX is the index of the operand in the right hand side of the
4284 statement that is defined by REDUCTION_PHI.
4285 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4286 SLP_NODE is an SLP node containing a group of reduction statements. The
4287 first one in this group is STMT_INFO.
4288 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4289 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4290 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4291 any value of the IV in the loop.
4292 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4293 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4294    null if this is not an SLP reduction.
4295
4296 This function:
4297 1. Creates the reduction def-use cycles: sets the arguments for
4298 REDUCTION_PHIS:
4299 The loop-entry argument is the vectorized initial-value of the reduction.
4300 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4301 sums.
4302 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4303 by calling the function specified by REDUC_FN if available, or by
4304 other means (whole-vector shifts or a scalar loop).
4305 The function also creates a new phi node at the loop exit to preserve
4306 loop-closed form, as illustrated below.
4307
4308 The flow at the entry to this function:
4309
4310 loop:
4311 vec_def = phi <null, null> # REDUCTION_PHI
4312 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4313 s_loop = scalar_stmt # (scalar) STMT_INFO
4314 loop_exit:
4315 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4316 use <s_out0>
4317 use <s_out0>
4318
4319 The above is transformed by this function into:
4320
4321 loop:
4322 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4323 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4324 s_loop = scalar_stmt # (scalar) STMT_INFO
4325 loop_exit:
4326 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4327 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4328 v_out2 = reduce <v_out1>
4329 s_out3 = extract_field <v_out2, 0>
4330 s_out4 = adjust_result <s_out3>
4331 use <s_out4>
4332 use <s_out4>
4333 */
4334
4335 static void
4336 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4337 stmt_vec_info stmt_info,
4338 gimple *reduc_def_stmt,
4339 int ncopies, internal_fn reduc_fn,
4340 vec<stmt_vec_info> reduction_phis,
4341 bool double_reduc,
4342 slp_tree slp_node,
4343 slp_instance slp_node_instance,
4344 tree induc_val, enum tree_code induc_code,
4345 tree neutral_op)
4346 {
4347 stmt_vec_info prev_phi_info;
4348 tree vectype;
4349 machine_mode mode;
4350 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4351 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4352 basic_block exit_bb;
4353 tree scalar_dest;
4354 tree scalar_type;
4355 gimple *new_phi = NULL, *phi;
4356 stmt_vec_info phi_info;
4357 gimple_stmt_iterator exit_gsi;
4358 tree vec_dest;
4359 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4360 gimple *epilog_stmt = NULL;
4361 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4362 gimple *exit_phi;
4363 tree bitsize;
4364 tree adjustment_def = NULL;
4365 tree vec_initial_def = NULL;
4366 tree expr, def, initial_def = NULL;
4367 tree orig_name, scalar_result;
4368 imm_use_iterator imm_iter, phi_imm_iter;
4369 use_operand_p use_p, phi_use_p;
4370 gimple *use_stmt;
4371 stmt_vec_info reduction_phi_info = NULL;
4372 bool nested_in_vect_loop = false;
4373 auto_vec<gimple *> new_phis;
4374 auto_vec<stmt_vec_info> inner_phis;
4375 int j, i;
4376 auto_vec<tree> scalar_results;
4377 unsigned int group_size = 1, k, ratio;
4378 auto_vec<tree> vec_initial_defs;
4379 auto_vec<gimple *> phis;
4380 bool slp_reduc = false;
4381 bool direct_slp_reduc;
4382 tree new_phi_result;
4383 stmt_vec_info inner_phi = NULL;
4384 tree induction_index = NULL_TREE;
4385
4386 if (slp_node)
4387 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4388
4389 if (nested_in_vect_loop_p (loop, stmt_info))
4390 {
4391 outer_loop = loop;
4392 loop = loop->inner;
4393 nested_in_vect_loop = true;
4394 gcc_assert (!slp_node);
4395 }
4396
4397 vectype = STMT_VINFO_VECTYPE (stmt_info);
4398 gcc_assert (vectype);
4399 mode = TYPE_MODE (vectype);
4400
4401 /* 1. Create the reduction def-use cycle:
4402 Set the arguments of REDUCTION_PHIS, i.e., transform
4403
4404 loop:
4405 vec_def = phi <null, null> # REDUCTION_PHI
4406 VECT_DEF = vector_stmt # vectorized form of STMT
4407 ...
4408
4409 into:
4410
4411 loop:
4412 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4413 VECT_DEF = vector_stmt # vectorized form of STMT
4414 ...
4415
4416 (in case of SLP, do it for all the phis). */
4417
4418 /* Get the loop-entry arguments. */
4419 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4420 if (slp_node)
4421 {
4422 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4423 vec_initial_defs.reserve (vec_num);
4424 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4425 &vec_initial_defs, vec_num,
4426 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4427 neutral_op);
4428 }
4429 else
4430 {
4431 /* Get at the scalar def before the loop, that defines the initial value
4432 of the reduction variable. */
4433 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4434 loop_preheader_edge (loop));
4435 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4436 and we can't use zero for induc_val, use initial_def. Similarly
4437 for REDUC_MIN and initial_def larger than the base. */
4438 if (TREE_CODE (initial_def) == INTEGER_CST
4439 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4440 == INTEGER_INDUC_COND_REDUCTION)
4441 && !integer_zerop (induc_val)
4442 && ((induc_code == MAX_EXPR
4443 && tree_int_cst_lt (initial_def, induc_val))
4444 || (induc_code == MIN_EXPR
4445 && tree_int_cst_lt (induc_val, initial_def))))
4446 induc_val = initial_def;
4447
4448 if (double_reduc)
4449 /* In case of double reduction we only create a vector variable
4450 to be put in the reduction phi node. The actual statement
4451 creation is done later in this function. */
4452 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4453 else if (nested_in_vect_loop)
4454 {
4455 /* Do not use an adjustment def as that case is not supported
4456 correctly if ncopies is not one. */
4457 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4458 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4459 stmt_info);
4460 }
4461 else
4462 vec_initial_def
4463 = get_initial_def_for_reduction (stmt_info, initial_def,
4464 &adjustment_def);
4465 vec_initial_defs.create (1);
4466 vec_initial_defs.quick_push (vec_initial_def);
4467 }
4468
4469 /* Set phi nodes arguments. */
4470 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4471 {
4472 tree vec_init_def = vec_initial_defs[i];
4473 tree def = vect_defs[i];
4474 for (j = 0; j < ncopies; j++)
4475 {
4476 if (j != 0)
4477 {
4478 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4479 if (nested_in_vect_loop)
4480 vec_init_def
4481 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4482 }
4483
4484 /* Set the loop-entry arg of the reduction-phi. */
4485
4486 gphi *phi = as_a <gphi *> (phi_info->stmt);
4487 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4488 == INTEGER_INDUC_COND_REDUCTION)
4489 {
4490 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4491 		 initial values from interfering with the reduction op.  */
4492 gcc_assert (ncopies == 1);
4493 gcc_assert (i == 0);
4494
4495 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4496 tree induc_val_vec
4497 = build_vector_from_val (vec_init_def_type, induc_val);
4498
4499 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4500 UNKNOWN_LOCATION);
4501 }
4502 else
4503 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4504 UNKNOWN_LOCATION);
4505
4506 /* Set the loop-latch arg for the reduction-phi. */
4507 if (j > 0)
4508 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4509
4510 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4511
4512 if (dump_enabled_p ())
4513 dump_printf_loc (MSG_NOTE, vect_location,
4514 "transform reduction: created def-use cycle: %G%G",
4515 phi, SSA_NAME_DEF_STMT (def));
4516 }
4517 }
4518
4519 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4520 which is updated with the current index of the loop for every match of
4521 the original loop's cond_expr (VEC_STMT). This results in a vector
4522 containing the last time the condition passed for that vector lane.
4523 The first match will be a 1 to allow 0 to be used for non-matching
4524 indexes. If there are no matches at all then the vector will be all
4525 zeroes. */
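  /* Purely illustrative example: with four lanes, the induction variable
     created below starts at {1, 2, 3, 4} and steps by 4; if lane 0 last
     matched in the first vector iteration and lane 2 last matched in the
     second, the final index vector is {1, 0, 7, 0}, lanes that never
     matched keeping the value 0.  */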
4526 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4527 {
4528 tree indx_before_incr, indx_after_incr;
4529 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4530
4531 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4532 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4533
4534 int scalar_precision
4535 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4536 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4537 tree cr_index_vector_type = build_vector_type
4538 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4539
4540 /* First we create a simple vector induction variable which starts
4541 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4542 vector size (STEP). */
4543
4544 /* Create a {1,2,3,...} vector. */
4545 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4546
4547 /* Create a vector of the step value. */
4548 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4549 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4550
4551 /* Create an induction variable. */
4552 gimple_stmt_iterator incr_gsi;
4553 bool insert_after;
4554 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4555 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4556 insert_after, &indx_before_incr, &indx_after_incr);
4557
4558 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4559 filled with zeros (VEC_ZERO). */
4560
4561 /* Create a vector of 0s. */
4562 tree zero = build_zero_cst (cr_index_scalar_type);
4563 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4564
4565 /* Create a vector phi node. */
4566 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4567 new_phi = create_phi_node (new_phi_tree, loop->header);
4568 loop_vinfo->add_stmt (new_phi);
4569 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4570 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4571
4572       /* Now take the condition from the loop's original cond_expr
4573 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4574 every match uses values from the induction variable
4575 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4576 (NEW_PHI_TREE).
4577 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4578 the new cond_expr (INDEX_COND_EXPR). */
4579
4580 /* Duplicate the condition from vec_stmt. */
4581 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4582
4583       /* Create a conditional, where the condition is taken from vec_stmt
4584 	 (CCOMPARE), the 'then' value is the induction index (INDEX_BEFORE_INCR)
4585 	 and the 'else' value is the phi (NEW_PHI_TREE).  */
4586 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4587 ccompare, indx_before_incr,
4588 new_phi_tree);
4589 induction_index = make_ssa_name (cr_index_vector_type);
4590 gimple *index_condition = gimple_build_assign (induction_index,
4591 index_cond_expr);
4592 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4593 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4594 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4595
4596 /* Update the phi with the vec cond. */
4597 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4598 loop_latch_edge (loop), UNKNOWN_LOCATION);
4599 }
4600
4601 /* 2. Create epilog code.
4602 The reduction epilog code operates across the elements of the vector
4603 of partial results computed by the vectorized loop.
4604 The reduction epilog code consists of:
4605
4606 step 1: compute the scalar result in a vector (v_out2)
4607 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4608 step 3: adjust the scalar result (s_out3) if needed.
4609
4610      Step 1 can be accomplished using one of the following three schemes:
4611 (scheme 1) using reduc_fn, if available.
4612 (scheme 2) using whole-vector shifts, if available.
4613 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4614 combined.
4615
4616 The overall epilog code looks like this:
4617
4618 s_out0 = phi <s_loop> # original EXIT_PHI
4619 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4620 v_out2 = reduce <v_out1> # step 1
4621 s_out3 = extract_field <v_out2, 0> # step 2
4622 s_out4 = adjust_result <s_out3> # step 3
4623
4624 (step 3 is optional, and steps 1 and 2 may be combined).
4625 Lastly, the uses of s_out0 are replaced by s_out4. */
4626
4627
4628 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4629 v_out1 = phi <VECT_DEF>
4630 Store them in NEW_PHIS. */
4631
4632 exit_bb = single_exit (loop)->dest;
4633 prev_phi_info = NULL;
4634 new_phis.create (vect_defs.length ());
4635 FOR_EACH_VEC_ELT (vect_defs, i, def)
4636 {
4637 for (j = 0; j < ncopies; j++)
4638 {
4639 tree new_def = copy_ssa_name (def);
4640 phi = create_phi_node (new_def, exit_bb);
4641 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4642 if (j == 0)
4643 new_phis.quick_push (phi);
4644 else
4645 {
4646 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4647 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4648 }
4649
4650 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4651 prev_phi_info = phi_info;
4652 }
4653 }
4654
4655 /* The epilogue is created for the outer-loop, i.e., for the loop being
4656 vectorized. Create exit phis for the outer loop. */
4657 if (double_reduc)
4658 {
4659 loop = outer_loop;
4660 exit_bb = single_exit (loop)->dest;
4661 inner_phis.create (vect_defs.length ());
4662 FOR_EACH_VEC_ELT (new_phis, i, phi)
4663 {
4664 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4665 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4666 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4667 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4668 PHI_RESULT (phi));
4669 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4670 inner_phis.quick_push (phi_info);
4671 new_phis[i] = outer_phi;
4672 while (STMT_VINFO_RELATED_STMT (phi_info))
4673 {
4674 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4675 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4676 outer_phi = create_phi_node (new_result, exit_bb);
4677 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4678 PHI_RESULT (phi_info->stmt));
4679 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4680 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4681 prev_phi_info = outer_phi_info;
4682 }
4683 }
4684 }
4685
4686 exit_gsi = gsi_after_labels (exit_bb);
4687
4688 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4689 (i.e. when reduc_fn is not available) and in the final adjustment
4690 code (if needed). Also get the original scalar reduction variable as
4691 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4692 represents a reduction pattern), the tree-code and scalar-def are
4693 taken from the original stmt that the pattern-stmt (STMT) replaces.
4694 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4695 are taken from STMT. */
4696
4697 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4698 if (orig_stmt_info != stmt_info)
4699 {
4700 /* Reduction pattern */
4701 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4702 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4703 }
4704
4705 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4706 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4707 partial results are added and not subtracted. */
4708 if (code == MINUS_EXPR)
4709 code = PLUS_EXPR;
4710
4711 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4712 scalar_type = TREE_TYPE (scalar_dest);
4713 scalar_results.create (group_size);
4714 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4715 bitsize = TYPE_SIZE (scalar_type);
4716
4717 /* In case this is a reduction in an inner-loop while vectorizing an outer
4718 loop - we don't need to extract a single scalar result at the end of the
4719 inner-loop (unless it is double reduction, i.e., the use of reduction is
4720 outside the outer-loop). The final vector of partial results will be used
4721 in the vectorized outer-loop, or reduced to a scalar result at the end of
4722 the outer-loop. */
4723 if (nested_in_vect_loop && !double_reduc)
4724 goto vect_finalize_reduction;
4725
4726 /* SLP reduction without reduction chain, e.g.,
4727 # a1 = phi <a2, a0>
4728 # b1 = phi <b2, b0>
4729 a2 = operation (a1)
4730 b2 = operation (b1) */
4731 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4732
4733 /* True if we should implement SLP_REDUC using native reduction operations
4734 instead of scalar operations. */
4735 direct_slp_reduc = (reduc_fn != IFN_LAST
4736 && slp_reduc
4737 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4738
4739 /* In case of reduction chain, e.g.,
4740 # a1 = phi <a3, a0>
4741 a2 = operation (a1)
4742 a3 = operation (a2),
4743
4744 we may end up with more than one vector result. Here we reduce them to
4745 one vector. */
4746 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4747 {
4748 tree first_vect = PHI_RESULT (new_phis[0]);
4749 gassign *new_vec_stmt = NULL;
4750 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4751 for (k = 1; k < new_phis.length (); k++)
4752 {
4753 gimple *next_phi = new_phis[k];
4754 tree second_vect = PHI_RESULT (next_phi);
4755 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4756 new_vec_stmt = gimple_build_assign (tem, code,
4757 first_vect, second_vect);
4758 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4759 first_vect = tem;
4760 }
4761
4762 new_phi_result = first_vect;
4763 if (new_vec_stmt)
4764 {
4765 new_phis.truncate (0);
4766 new_phis.safe_push (new_vec_stmt);
4767 }
4768 }
4769   /* Likewise if we couldn't use a single def-use cycle.  */
4770 else if (ncopies > 1)
4771 {
4772 gcc_assert (new_phis.length () == 1);
4773 tree first_vect = PHI_RESULT (new_phis[0]);
4774 gassign *new_vec_stmt = NULL;
4775 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4776 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4777 for (int k = 1; k < ncopies; ++k)
4778 {
4779 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4780 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4781 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4782 new_vec_stmt = gimple_build_assign (tem, code,
4783 first_vect, second_vect);
4784 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4785 first_vect = tem;
4786 }
4787 new_phi_result = first_vect;
4788 new_phis.truncate (0);
4789 new_phis.safe_push (new_vec_stmt);
4790 }
4791 else
4792 new_phi_result = PHI_RESULT (new_phis[0]);
4793
4794 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4795 && reduc_fn != IFN_LAST)
4796 {
4797 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4798 various data values where the condition matched and another vector
4799 (INDUCTION_INDEX) containing all the indexes of those matches. We
4800 need to extract the last matching index (which will be the index with
4801 highest value) and use this to index into the data vector.
4802 For the case where there were no matches, the data vector will contain
4803 all default values and the index vector will be all zeros. */
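      /* Illustrative example: with data vector {a, b, c, d} and index
	 vector {1, 0, 7, 0}, the IFN_REDUC_MAX of the indexes is 7, the
	 EQ_EXPR mask is {0, 0, 1, 0}, the VEC_COND_EXPR selects
	 {0, 0, c, 0}, and the final IFN_REDUC_MAX over the unsigned view
	 leaves the single non-zero lane, i.e. c.  */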
4804
4805 /* Get various versions of the type of the vector of indexes. */
4806 tree index_vec_type = TREE_TYPE (induction_index);
4807 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4808 tree index_scalar_type = TREE_TYPE (index_vec_type);
4809 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4810 (index_vec_type);
4811
4812 /* Get an unsigned integer version of the type of the data vector. */
4813 int scalar_precision
4814 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4815 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4816 tree vectype_unsigned = build_vector_type
4817 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4818
4819 /* First we need to create a vector (ZERO_VEC) of zeros and another
4820 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4821 can create using a MAX reduction and then expanding.
4822 In the case where the loop never made any matches, the max index will
4823 be zero. */
4824
4825 /* Vector of {0, 0, 0,...}. */
4826 tree zero_vec = make_ssa_name (vectype);
4827 tree zero_vec_rhs = build_zero_cst (vectype);
4828 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4829 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4830
4831 /* Find maximum value from the vector of found indexes. */
4832 tree max_index = make_ssa_name (index_scalar_type);
4833 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4834 1, induction_index);
4835 gimple_call_set_lhs (max_index_stmt, max_index);
4836 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4837
4838 /* Vector of {max_index, max_index, max_index,...}. */
4839 tree max_index_vec = make_ssa_name (index_vec_type);
4840 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4841 max_index);
4842 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4843 max_index_vec_rhs);
4844 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4845
4846 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4847 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4848 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4849 otherwise. Only one value should match, resulting in a vector
4850 (VEC_COND) with one data value and the rest zeros.
4851 In the case where the loop never made any matches, every index will
4852 match, resulting in a vector with all data values (which will all be
4853 the default value). */
4854
4855 /* Compare the max index vector to the vector of found indexes to find
4856 the position of the max value. */
4857 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4858 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4859 induction_index,
4860 max_index_vec);
4861 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4862
4863 /* Use the compare to choose either values from the data vector or
4864 zero. */
4865 tree vec_cond = make_ssa_name (vectype);
4866 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4867 vec_compare, new_phi_result,
4868 zero_vec);
4869 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4870
4871 /* Finally we need to extract the data value from the vector (VEC_COND)
4872 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4873 reduction, but because this doesn't exist, we can use a MAX reduction
4874 instead. The data value might be signed or a float so we need to cast
4875 it first.
4876 In the case where the loop never made any matches, the data values are
4877 all identical, and so will reduce down correctly. */
4878
4879 /* Make the matched data values unsigned. */
4880 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4881 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4882 vec_cond);
4883 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4884 VIEW_CONVERT_EXPR,
4885 vec_cond_cast_rhs);
4886 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4887
4888 /* Reduce down to a scalar value. */
4889 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4890 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4891 1, vec_cond_cast);
4892 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4893 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4894
4895 /* Convert the reduced value back to the result type and set as the
4896 result. */
4897 gimple_seq stmts = NULL;
4898 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4899 data_reduc);
4900 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4901 scalar_results.safe_push (new_temp);
4902 }
4903 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4904 && reduc_fn == IFN_LAST)
4905 {
4906 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4907 idx = 0;
4908 idx_val = induction_index[0];
4909 val = data_reduc[0];
4910 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4911 if (induction_index[i] > idx_val)
4912 val = data_reduc[i], idx_val = induction_index[i];
4913 return val; */
4914
4915 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4916 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4917 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4918 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4919 /* Enforced by vectorizable_reduction, which ensures we have target
4920 support before allowing a conditional reduction on variable-length
4921 vectors. */
4922 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4923 tree idx_val = NULL_TREE, val = NULL_TREE;
4924 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4925 {
4926 tree old_idx_val = idx_val;
4927 tree old_val = val;
4928 idx_val = make_ssa_name (idx_eltype);
4929 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4930 build3 (BIT_FIELD_REF, idx_eltype,
4931 induction_index,
4932 bitsize_int (el_size),
4933 bitsize_int (off)));
4934 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4935 val = make_ssa_name (data_eltype);
4936 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4937 build3 (BIT_FIELD_REF,
4938 data_eltype,
4939 new_phi_result,
4940 bitsize_int (el_size),
4941 bitsize_int (off)));
4942 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4943 if (off != 0)
4944 {
4945 tree new_idx_val = idx_val;
4946 tree new_val = val;
4947 if (off != v_size - el_size)
4948 {
4949 new_idx_val = make_ssa_name (idx_eltype);
4950 epilog_stmt = gimple_build_assign (new_idx_val,
4951 MAX_EXPR, idx_val,
4952 old_idx_val);
4953 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4954 }
4955 new_val = make_ssa_name (data_eltype);
4956 epilog_stmt = gimple_build_assign (new_val,
4957 COND_EXPR,
4958 build2 (GT_EXPR,
4959 boolean_type_node,
4960 idx_val,
4961 old_idx_val),
4962 val, old_val);
4963 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4964 idx_val = new_idx_val;
4965 val = new_val;
4966 }
4967 }
4968 /* Convert the reduced value back to the result type and set as the
4969 result. */
4970 gimple_seq stmts = NULL;
4971 val = gimple_convert (&stmts, scalar_type, val);
4972 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4973 scalar_results.safe_push (val);
4974 }
4975
4976 /* 2.3 Create the reduction code, using one of the three schemes described
4977 above. In SLP we simply need to extract all the elements from the
4978 vector (without reducing them), so we use scalar shifts. */
4979 else if (reduc_fn != IFN_LAST && !slp_reduc)
4980 {
4981 tree tmp;
4982 tree vec_elem_type;
4983
4984 /* Case 1: Create:
4985 v_out2 = reduc_expr <v_out1> */
4986
4987 if (dump_enabled_p ())
4988 dump_printf_loc (MSG_NOTE, vect_location,
4989 "Reduce using direct vector reduction.\n");
4990
4991 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4992 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4993 {
4994 tree tmp_dest
4995 = vect_create_destination_var (scalar_dest, vec_elem_type);
4996 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4997 new_phi_result);
4998 gimple_set_lhs (epilog_stmt, tmp_dest);
4999 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5000 gimple_set_lhs (epilog_stmt, new_temp);
5001 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5002
5003 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5004 new_temp);
5005 }
5006 else
5007 {
5008 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5009 new_phi_result);
5010 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5011 }
5012
5013 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5014 gimple_set_lhs (epilog_stmt, new_temp);
5015 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5016
5017 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5018 == INTEGER_INDUC_COND_REDUCTION)
5019 && !operand_equal_p (initial_def, induc_val, 0))
5020 {
5021 /* Earlier we set the initial value to be a vector of induc_val
5022 values. Check the result and if it is induc_val then replace
5023 it with the original initial value, unless induc_val is
5024 the same as initial_def already. */
5025 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5026 induc_val);
5027
5028 tmp = make_ssa_name (new_scalar_dest);
5029 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5030 initial_def, new_temp);
5031 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5032 new_temp = tmp;
5033 }
5034
5035 scalar_results.safe_push (new_temp);
5036 }
5037 else if (direct_slp_reduc)
5038 {
5039 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5040 with the elements for other SLP statements replaced with the
5041 neutral value. We can then do a normal reduction on each vector. */
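      /* For example, with a group of two sums whose lanes are interleaved in
	 one accumulator v = { a0, b0, a1, b1 } and 0 as the neutral value,
	 the code below builds { a0, 0, a1, 0 } and { 0, b0, 0, b1 } and
	 reduces each with the target's reduction ifn (e.g. IFN_REDUC_PLUS).
	 This is only a sketch; the actual lane layout follows the SLP group
	 order established earlier.  */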
5042
5043 /* Enforced by vectorizable_reduction. */
5044 gcc_assert (new_phis.length () == 1);
5045 gcc_assert (pow2p_hwi (group_size));
5046
5047 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5048 vec<stmt_vec_info> orig_phis
5049 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5050 gimple_seq seq = NULL;
5051
5052 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5053 and the same element size as VECTYPE. */
5054 tree index = build_index_vector (vectype, 0, 1);
5055 tree index_type = TREE_TYPE (index);
5056 tree index_elt_type = TREE_TYPE (index_type);
5057 tree mask_type = build_same_sized_truth_vector_type (index_type);
5058
5059 /* Create a vector that, for each element, identifies which of
5060 the REDUC_GROUP_SIZE results should use it. */
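      /* For instance, with a group size of 2 and an 8-element vector the
	 masked index below becomes { 0, 1, 0, 1, 0, 1, 0, 1 }, i.e. lane J
	 feeds SLP result J & (REDUC_GROUP_SIZE - 1).  */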
5061 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5062 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5063 build_vector_from_val (index_type, index_mask));
5064
5065 /* Get a neutral vector value. This is simply a splat of the neutral
5066 scalar value if we have one, otherwise the initial scalar value
5067 is itself a neutral value. */
5068 tree vector_identity = NULL_TREE;
5069 if (neutral_op)
5070 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5071 neutral_op);
5072 for (unsigned int i = 0; i < group_size; ++i)
5073 {
5074 /* If there's no universal neutral value, we can use the
5075 initial scalar value from the original PHI. This is used
5076 for MIN and MAX reductions, for example. */
5077 if (!neutral_op)
5078 {
5079 tree scalar_value
5080 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5081 loop_preheader_edge (loop));
5082 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5083 scalar_value);
5084 }
5085
5086 /* Calculate the equivalent of:
5087
5088 sel[j] = (index[j] == i);
5089
5090 which selects the elements of NEW_PHI_RESULT that should
5091 be included in the result. */
5092 tree compare_val = build_int_cst (index_elt_type, i);
5093 compare_val = build_vector_from_val (index_type, compare_val);
5094 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5095 index, compare_val);
5096
5097 /* Calculate the equivalent of:
5098
5099 vec = sel ? new_phi_result : vector_identity;
5100
5101 VEC is now suitable for a full vector reduction. */
5102 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5103 sel, new_phi_result, vector_identity);
5104
5105 /* Do the reduction and convert it to the appropriate type. */
5106 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5107 TREE_TYPE (vectype), vec);
5108 scalar = gimple_convert (&seq, scalar_type, scalar);
5109 scalar_results.safe_push (scalar);
5110 }
5111 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5112 }
5113 else
5114 {
5115 bool reduce_with_shift;
5116 tree vec_temp;
5117
5118 /* COND reductions all do the final reduction with MAX_EXPR
5119 or MIN_EXPR. */
5120 if (code == COND_EXPR)
5121 {
5122 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5123 == INTEGER_INDUC_COND_REDUCTION)
5124 code = induc_code;
5125 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5126 == CONST_COND_REDUCTION)
5127 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5128 else
5129 code = MAX_EXPR;
5130 }
5131
5132 /* See if the target wants to do the final (shift) reduction
5133 in a vector mode of smaller size and first reduce upper/lower
5134 halves against each other. */
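      /* For example, a target might prefer to split a 256-bit accumulator
	 in two: MODE1 is then the corresponding 128-bit vector mode and the
	 loop further below first combines the upper and lower halves with
	 CODE before the shift-based or scalar reduction takes over.  */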
5135 enum machine_mode mode1 = mode;
5136 tree vectype1 = vectype;
5137 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5138 unsigned sz1 = sz;
5139 if (!slp_reduc
5140 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5141 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5142
5143 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5144 reduce_with_shift = have_whole_vector_shift (mode1);
5145 if (!VECTOR_MODE_P (mode1))
5146 reduce_with_shift = false;
5147 else
5148 {
5149 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5150 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5151 reduce_with_shift = false;
5152 }
5153
5154 /* First reduce the vector to the vector size we should do the
5155 shift reduction on, by combining the upper and lower halves. */
5156 new_temp = new_phi_result;
5157 while (sz > sz1)
5158 {
5159 gcc_assert (!slp_reduc);
5160 sz /= 2;
5161 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5162
5163 /* The target has to make sure we support lowpart/highpart
5164 extraction, either via direct vector extract or through
5165 integer mode punning. */
5166 tree dst1, dst2;
5167 if (convert_optab_handler (vec_extract_optab,
5168 TYPE_MODE (TREE_TYPE (new_temp)),
5169 TYPE_MODE (vectype1))
5170 != CODE_FOR_nothing)
5171 {
5172 /* Extract sub-vectors directly once vec_extract becomes
5173 a conversion optab. */
5174 dst1 = make_ssa_name (vectype1);
5175 epilog_stmt
5176 = gimple_build_assign (dst1, BIT_FIELD_REF,
5177 build3 (BIT_FIELD_REF, vectype1,
5178 new_temp, TYPE_SIZE (vectype1),
5179 bitsize_int (0)));
5180 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5181 dst2 = make_ssa_name (vectype1);
5182 epilog_stmt
5183 = gimple_build_assign (dst2, BIT_FIELD_REF,
5184 build3 (BIT_FIELD_REF, vectype1,
5185 new_temp, TYPE_SIZE (vectype1),
5186 bitsize_int (sz * BITS_PER_UNIT)));
5187 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5188 }
5189 else
5190 {
5191 /* Extract via punning to an appropriately sized integer mode
5192 vector. */
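	      /* E.g. when halving a 16-byte vector, NEW_TEMP is view-converted
		 to a vector of two 64-bit integers, each half is pulled out
		 with a BIT_FIELD_REF and then view-converted back to
		 VECTYPE1.  */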
5193 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5194 1);
5195 tree etype = build_vector_type (eltype, 2);
5196 gcc_assert (convert_optab_handler (vec_extract_optab,
5197 TYPE_MODE (etype),
5198 TYPE_MODE (eltype))
5199 != CODE_FOR_nothing);
5200 tree tem = make_ssa_name (etype);
5201 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5202 build1 (VIEW_CONVERT_EXPR,
5203 etype, new_temp));
5204 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5205 new_temp = tem;
5206 tem = make_ssa_name (eltype);
5207 epilog_stmt
5208 = gimple_build_assign (tem, BIT_FIELD_REF,
5209 build3 (BIT_FIELD_REF, eltype,
5210 new_temp, TYPE_SIZE (eltype),
5211 bitsize_int (0)));
5212 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5213 dst1 = make_ssa_name (vectype1);
5214 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5215 build1 (VIEW_CONVERT_EXPR,
5216 vectype1, tem));
5217 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5218 tem = make_ssa_name (eltype);
5219 epilog_stmt
5220 = gimple_build_assign (tem, BIT_FIELD_REF,
5221 build3 (BIT_FIELD_REF, eltype,
5222 new_temp, TYPE_SIZE (eltype),
5223 bitsize_int (sz * BITS_PER_UNIT)));
5224 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5225 dst2 = make_ssa_name (vectype1);
5226 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5227 build1 (VIEW_CONVERT_EXPR,
5228 vectype1, tem));
5229 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5230 }
5231
5232 new_temp = make_ssa_name (vectype1);
5233 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5234 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5235 }
5236
5237 if (reduce_with_shift && !slp_reduc)
5238 {
5239 int element_bitsize = tree_to_uhwi (bitsize);
5240 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5241 for variable-length vectors and also requires direct target support
5242 for loop reductions. */
5243 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5244 int nelements = vec_size_in_bits / element_bitsize;
5245 vec_perm_builder sel;
5246 vec_perm_indices indices;
5247
5248 int elt_offset;
5249
5250 tree zero_vec = build_zero_cst (vectype1);
5251 /* Case 2: Create:
5252 for (offset = nelements/2; offset >= 1; offset/=2)
5253 {
5254 Create: va' = vec_shift <va, offset>
5255 Create: va = vop <va, va'>
5256 } */
5257
5258 tree rhs;
5259
5260 if (dump_enabled_p ())
5261 dump_printf_loc (MSG_NOTE, vect_location,
5262 "Reduce using vector shifts\n");
5263
5264 mode1 = TYPE_MODE (vectype1);
5265 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5266 for (elt_offset = nelements / 2;
5267 elt_offset >= 1;
5268 elt_offset /= 2)
5269 {
5270 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5271 indices.new_vector (sel, 2, nelements);
5272 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5273 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5274 new_temp, zero_vec, mask);
5275 new_name = make_ssa_name (vec_dest, epilog_stmt);
5276 gimple_assign_set_lhs (epilog_stmt, new_name);
5277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278
5279 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5280 new_temp);
5281 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5282 gimple_assign_set_lhs (epilog_stmt, new_temp);
5283 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5284 }
5285
5286 /* 2.4 Extract the final scalar result. Create:
5287 s_out3 = extract_field <v_out2, bitpos> */
5288
5289 if (dump_enabled_p ())
5290 dump_printf_loc (MSG_NOTE, vect_location,
5291 "extract scalar result\n");
5292
5293 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5294 bitsize, bitsize_zero_node);
5295 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5296 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5297 gimple_assign_set_lhs (epilog_stmt, new_temp);
5298 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5299 scalar_results.safe_push (new_temp);
5300 }
5301 else
5302 {
5303 /* Case 3: Create:
5304 s = extract_field <v_out2, 0>
5305 for (offset = element_size;
5306 offset < vector_size;
5307 offset += element_size;)
5308 {
5309 Create: s' = extract_field <v_out2, offset>
5310 Create: s = op <s, s'> // For non SLP cases
5311 } */
5312
5313 if (dump_enabled_p ())
5314 dump_printf_loc (MSG_NOTE, vect_location,
5315 "Reduce using scalar code.\n");
5316
5317 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5318 int element_bitsize = tree_to_uhwi (bitsize);
5319 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5320 {
5321 int bit_offset;
5322 if (gimple_code (new_phi) == GIMPLE_PHI)
5323 vec_temp = PHI_RESULT (new_phi);
5324 else
5325 vec_temp = gimple_assign_lhs (new_phi);
5326 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5327 bitsize_zero_node);
5328 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5329 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5330 gimple_assign_set_lhs (epilog_stmt, new_temp);
5331 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5332
5333 /* In SLP we don't need to apply the reduction operation, so we just
5334 collect s' values in SCALAR_RESULTS. */
5335 if (slp_reduc)
5336 scalar_results.safe_push (new_temp);
5337
5338 for (bit_offset = element_bitsize;
5339 bit_offset < vec_size_in_bits;
5340 bit_offset += element_bitsize)
5341 {
5342 tree bitpos = bitsize_int (bit_offset);
5343 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5344 bitsize, bitpos);
5345
5346 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5347 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5348 gimple_assign_set_lhs (epilog_stmt, new_name);
5349 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5350
5351 if (slp_reduc)
5352 {
5353 /* In SLP we don't need to apply the reduction operation, so
5354 we just collect s' values in SCALAR_RESULTS. */
5355 new_temp = new_name;
5356 scalar_results.safe_push (new_name);
5357 }
5358 else
5359 {
5360 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5361 new_name, new_temp);
5362 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5363 gimple_assign_set_lhs (epilog_stmt, new_temp);
5364 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365 }
5366 }
5367 }
5368
5369 /* The only case in which we need to reduce scalar results in SLP is
5370 unrolling. If the size of SCALAR_RESULTS is greater than
5371 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5372 REDUC_GROUP_SIZE. */
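	  /* For example, with REDUC_GROUP_SIZE == 2 and four collected scalars
	     s0, s1, s2, s3 the loop below computes s0 = s0 CODE s2 and
	     s1 = s1 CODE s3, leaving one scalar result per SLP statement.  */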
5373 if (slp_reduc)
5374 {
5375 tree res, first_res, new_res;
5376 gimple *new_stmt;
5377
5378 /* Reduce multiple scalar results in case of SLP unrolling. */
5379 for (j = group_size; scalar_results.iterate (j, &res);
5380 j++)
5381 {
5382 first_res = scalar_results[j % group_size];
5383 new_stmt = gimple_build_assign (new_scalar_dest, code,
5384 first_res, res);
5385 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5386 gimple_assign_set_lhs (new_stmt, new_res);
5387 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5388 scalar_results[j % group_size] = new_res;
5389 }
5390 }
5391 else
5392 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5393 scalar_results.safe_push (new_temp);
5394 }
5395
5396 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5397 == INTEGER_INDUC_COND_REDUCTION)
5398 && !operand_equal_p (initial_def, induc_val, 0))
5399 {
5400 /* Earlier we set the initial value to be a vector of induc_val
5401 values. Check the result and if it is induc_val then replace
5402 it with the original initial value, unless induc_val is
5403 the same as initial_def already. */
5404 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5405 induc_val);
5406
5407 tree tmp = make_ssa_name (new_scalar_dest);
5408 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5409 initial_def, new_temp);
5410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411 scalar_results[0] = tmp;
5412 }
5413 }
5414
5415 vect_finalize_reduction:
5416
5417 if (double_reduc)
5418 loop = loop->inner;
5419
5420 /* 2.5 Adjust the final result by the initial value of the reduction
5421 variable. (When such adjustment is not needed, then
5422 'adjustment_def' is zero). For example, if code is PLUS we create:
5423 new_temp = loop_exit_def + adjustment_def */
5424
5425 if (adjustment_def)
5426 {
5427 gcc_assert (!slp_reduc);
5428 if (nested_in_vect_loop)
5429 {
5430 new_phi = new_phis[0];
5431 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5432 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5433 new_dest = vect_create_destination_var (scalar_dest, vectype);
5434 }
5435 else
5436 {
5437 new_temp = scalar_results[0];
5438 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5439 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5440 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5441 }
5442
5443 epilog_stmt = gimple_build_assign (new_dest, expr);
5444 new_temp = make_ssa_name (new_dest, epilog_stmt);
5445 gimple_assign_set_lhs (epilog_stmt, new_temp);
5446 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5447 if (nested_in_vect_loop)
5448 {
5449 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5450 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5451 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5452
5453 if (!double_reduc)
5454 scalar_results.quick_push (new_temp);
5455 else
5456 scalar_results[0] = new_temp;
5457 }
5458 else
5459 scalar_results[0] = new_temp;
5460
5461 new_phis[0] = epilog_stmt;
5462 }
5463
5464 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5465 phis with new adjusted scalar results, i.e., replace use <s_out0>
5466 with use <s_out4>.
5467
5468 Transform:
5469 loop_exit:
5470 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5471 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5472 v_out2 = reduce <v_out1>
5473 s_out3 = extract_field <v_out2, 0>
5474 s_out4 = adjust_result <s_out3>
5475 use <s_out0>
5476 use <s_out0>
5477
5478 into:
5479
5480 loop_exit:
5481 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5482 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5483 v_out2 = reduce <v_out1>
5484 s_out3 = extract_field <v_out2, 0>
5485 s_out4 = adjust_result <s_out3>
5486 use <s_out4>
5487 use <s_out4> */
5488
5489
5490 /* In an SLP reduction chain we reduce vector results into one vector if
5491 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5492 LHS of the last stmt in the reduction chain, since we are looking for
5493 the loop exit phi node. */
5494 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5495 {
5496 stmt_vec_info dest_stmt_info
5497 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5498 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5499 group_size = 1;
5500 }
5501
5502 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5503 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5504 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5505 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5506 correspond to the first vector stmt, etc.
5507 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
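  /* For example, with REDUC_GROUP_SIZE == 4 and two new vector stmts, RATIO
     is 2: scalar results 0 and 1 belong to the first vector stmt, results 2
     and 3 to the second.  */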
5508 if (group_size > new_phis.length ())
5509 {
5510 ratio = group_size / new_phis.length ();
5511 gcc_assert (!(group_size % new_phis.length ()));
5512 }
5513 else
5514 ratio = 1;
5515
5516 stmt_vec_info epilog_stmt_info = NULL;
5517 for (k = 0; k < group_size; k++)
5518 {
5519 if (k % ratio == 0)
5520 {
5521 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5522 reduction_phi_info = reduction_phis[k / ratio];
5523 if (double_reduc)
5524 inner_phi = inner_phis[k / ratio];
5525 }
5526
5527 if (slp_reduc)
5528 {
5529 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5530
5531 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5532 /* SLP statements can't participate in patterns. */
5533 gcc_assert (!orig_stmt_info);
5534 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5535 }
5536
5537 phis.create (3);
5538 /* Find the loop-closed-use at the loop exit of the original scalar
5539 result. (The reduction result is expected to have two immediate uses -
5540 one at the latch block, and one at the loop exit). */
5541 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5542 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5543 && !is_gimple_debug (USE_STMT (use_p)))
5544 phis.safe_push (USE_STMT (use_p));
5545
5546 /* While we expect to have found an exit_phi because of loop-closed-ssa
5547 form, we can end up without one if the scalar cycle is dead. */
5548
5549 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5550 {
5551 if (outer_loop)
5552 {
5553 stmt_vec_info exit_phi_vinfo
5554 = loop_vinfo->lookup_stmt (exit_phi);
5555 gphi *vect_phi;
5556
5557 /* FORNOW. Currently not supporting the case that an inner-loop
5558 reduction is not used in the outer-loop (but only outside the
5559 outer-loop), unless it is a double reduction. */
5560 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5561 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5562 || double_reduc);
5563
5564 if (double_reduc)
5565 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5566 else
5567 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5568 if (!double_reduc
5569 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5570 != vect_double_reduction_def)
5571 continue;
5572
5573 /* Handle double reduction:
5574
5575 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5576 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5577 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5578 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5579
5580 At that point the regular reduction (stmt2 and stmt3) is
5581 already vectorized, as well as the exit phi node, stmt4.
5582 Here we vectorize the phi node of double reduction, stmt1, and
5583 update all relevant statements. */
5584
5585 /* Go through all the uses of s2 to find the double reduction phi
5586 node, i.e., stmt1 above. */
5587 orig_name = PHI_RESULT (exit_phi);
5588 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5589 {
5590 stmt_vec_info use_stmt_vinfo;
5591 tree vect_phi_init, preheader_arg, vect_phi_res;
5592 basic_block bb = gimple_bb (use_stmt);
5593
5594 /* Check that USE_STMT is really a double reduction phi
5595 node. */
5596 if (gimple_code (use_stmt) != GIMPLE_PHI
5597 || gimple_phi_num_args (use_stmt) != 2
5598 || bb->loop_father != outer_loop)
5599 continue;
5600 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5601 if (!use_stmt_vinfo
5602 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5603 != vect_double_reduction_def)
5604 continue;
5605
5606 /* Create vector phi node for double reduction:
5607 vs1 = phi <vs0, vs2>
5608 vs1 was created previously in this function by a call to
5609 vect_get_vec_def_for_operand and is stored in
5610 vec_initial_def;
5611 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5612 vs0 is created here. */
5613
5614 /* Create vector phi node. */
5615 vect_phi = create_phi_node (vec_initial_def, bb);
5616 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5617
5618 /* Create vs0 - initial def of the double reduction phi. */
5619 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5620 loop_preheader_edge (outer_loop));
5621 vect_phi_init = get_initial_def_for_reduction
5622 (stmt_info, preheader_arg, NULL);
5623
5624 /* Update phi node arguments with vs0 and vs2. */
5625 add_phi_arg (vect_phi, vect_phi_init,
5626 loop_preheader_edge (outer_loop),
5627 UNKNOWN_LOCATION);
5628 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5629 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5630 if (dump_enabled_p ())
5631 dump_printf_loc (MSG_NOTE, vect_location,
5632 "created double reduction phi node: %G",
5633 vect_phi);
5634
5635 vect_phi_res = PHI_RESULT (vect_phi);
5636
5637 /* Replace the use, i.e., set the correct vs1 in the regular
5638 reduction phi node. FORNOW, NCOPIES is always 1, so the
5639 loop is redundant. */
5640 stmt_vec_info use_info = reduction_phi_info;
5641 for (j = 0; j < ncopies; j++)
5642 {
5643 edge pr_edge = loop_preheader_edge (loop);
5644 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5645 pr_edge->dest_idx, vect_phi_res);
5646 use_info = STMT_VINFO_RELATED_STMT (use_info);
5647 }
5648 }
5649 }
5650 }
5651
5652 phis.release ();
5653 if (nested_in_vect_loop)
5654 {
5655 if (double_reduc)
5656 loop = outer_loop;
5657 else
5658 continue;
5659 }
5660
5661 phis.create (3);
5662 /* Find the loop-closed-use at the loop exit of the original scalar
5663 result. (The reduction result is expected to have two immediate uses,
5664 one at the latch block, and one at the loop exit). For double
5665 reductions we are looking for exit phis of the outer loop. */
5666 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5667 {
5668 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5669 {
5670 if (!is_gimple_debug (USE_STMT (use_p)))
5671 phis.safe_push (USE_STMT (use_p));
5672 }
5673 else
5674 {
5675 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5676 {
5677 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5678
5679 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5680 {
5681 if (!flow_bb_inside_loop_p (loop,
5682 gimple_bb (USE_STMT (phi_use_p)))
5683 && !is_gimple_debug (USE_STMT (phi_use_p)))
5684 phis.safe_push (USE_STMT (phi_use_p));
5685 }
5686 }
5687 }
5688 }
5689
5690 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5691 {
5692 /* Replace the uses: */
5693 orig_name = PHI_RESULT (exit_phi);
5694 scalar_result = scalar_results[k];
5695 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5696 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5697 SET_USE (use_p, scalar_result);
5698 }
5699
5700 phis.release ();
5701 }
5702 }
5703
5704 /* Return a vector of type VECTYPE that is equal to the vector select
5705 operation "MASK ? VEC : IDENTITY". Insert the select statements
5706 before GSI. */
5707
5708 static tree
5709 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5710 tree vec, tree identity)
5711 {
5712 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5713 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5714 mask, vec, identity);
5715 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5716 return cond;
5717 }
5718
5719 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5720 order, starting with LHS. Insert the extraction statements before GSI and
5721 associate the new scalar SSA names with variable SCALAR_DEST.
5722 Return the SSA name for the result. */
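
/* For instance, for a four-element VECTOR_RHS v and CODE equal to PLUS_EXPR
   this conceptually expands to

     s0 = lhs + v[0];
     s1 = s0 + v[1];
     s2 = s1 + v[2];
     s3 = s2 + v[3];

   with each v[i] extracted via a BIT_FIELD_REF and s3 returned.  */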
5723
5724 static tree
5725 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5726 tree_code code, tree lhs, tree vector_rhs)
5727 {
5728 tree vectype = TREE_TYPE (vector_rhs);
5729 tree scalar_type = TREE_TYPE (vectype);
5730 tree bitsize = TYPE_SIZE (scalar_type);
5731 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5732 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5733
5734 for (unsigned HOST_WIDE_INT bit_offset = 0;
5735 bit_offset < vec_size_in_bits;
5736 bit_offset += element_bitsize)
5737 {
5738 tree bitpos = bitsize_int (bit_offset);
5739 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5740 bitsize, bitpos);
5741
5742 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5743 rhs = make_ssa_name (scalar_dest, stmt);
5744 gimple_assign_set_lhs (stmt, rhs);
5745 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5746
5747 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5748 tree new_name = make_ssa_name (scalar_dest, stmt);
5749 gimple_assign_set_lhs (stmt, new_name);
5750 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5751 lhs = new_name;
5752 }
5753 return lhs;
5754 }
5755
5756 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5757 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5758 statement. CODE is the operation performed by STMT_INFO and OPS are
5759 its scalar operands. REDUC_INDEX is the index of the operand in
5760 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5761 implements in-order reduction, or IFN_LAST if we should open-code it.
5762 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5763 that should be used to control the operation in a fully-masked loop. */
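
/* For example, a floating-point accumulation

     for (i = 0; i < n; i++)
       res += a[i];

   compiled without permission to reassociate must preserve the original
   evaluation order, so each vector of A is folded into RES strictly from
   left to right, either via an in-order reduction ifn such as
   IFN_FOLD_LEFT_PLUS or via the open-coded expansion in
   vect_expand_fold_left.  */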
5764
5765 static bool
5766 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5767 gimple_stmt_iterator *gsi,
5768 stmt_vec_info *vec_stmt, slp_tree slp_node,
5769 gimple *reduc_def_stmt,
5770 tree_code code, internal_fn reduc_fn,
5771 tree ops[3], tree vectype_in,
5772 int reduc_index, vec_loop_masks *masks)
5773 {
5774 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5775 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5776 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5777 stmt_vec_info new_stmt_info = NULL;
5778
5779 int ncopies;
5780 if (slp_node)
5781 ncopies = 1;
5782 else
5783 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5784
5785 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5786 gcc_assert (ncopies == 1);
5787 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5788 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5789 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5790 == FOLD_LEFT_REDUCTION);
5791
5792 if (slp_node)
5793 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5794 TYPE_VECTOR_SUBPARTS (vectype_in)));
5795
5796 tree op0 = ops[1 - reduc_index];
5797
5798 int group_size = 1;
5799 stmt_vec_info scalar_dest_def_info;
5800 auto_vec<tree> vec_oprnds0;
5801 if (slp_node)
5802 {
5803 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5804 slp_node);
5805 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5806 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5807 }
5808 else
5809 {
5810 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5811 vec_oprnds0.create (1);
5812 vec_oprnds0.quick_push (loop_vec_def0);
5813 scalar_dest_def_info = stmt_info;
5814 }
5815
5816 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5817 tree scalar_type = TREE_TYPE (scalar_dest);
5818 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5819
5820 int vec_num = vec_oprnds0.length ();
5821 gcc_assert (vec_num == 1 || slp_node);
5822 tree vec_elem_type = TREE_TYPE (vectype_out);
5823 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5824
5825 tree vector_identity = NULL_TREE;
5826 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5827 vector_identity = build_zero_cst (vectype_out);
5828
5829 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5830 int i;
5831 tree def0;
5832 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5833 {
5834 gimple *new_stmt;
5835 tree mask = NULL_TREE;
5836 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5837 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5838
5839 /* Handle MINUS by adding the negative. */
5840 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5841 {
5842 tree negated = make_ssa_name (vectype_out);
5843 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5844 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5845 def0 = negated;
5846 }
5847
5848 if (mask)
5849 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5850 vector_identity);
5851
5852 /* On the first iteration the input is simply the scalar phi
5853 result, and for subsequent iterations it is the output of
5854 the preceding operation. */
5855 if (reduc_fn != IFN_LAST)
5856 {
5857 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5858 /* For chained SLP reductions the output of the previous reduction
5859 operation serves as the input of the next. For the final statement
5860 the output cannot be a temporary - we reuse the original
5861 scalar destination of the last statement. */
5862 if (i != vec_num - 1)
5863 {
5864 gimple_set_lhs (new_stmt, scalar_dest_var);
5865 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5866 gimple_set_lhs (new_stmt, reduc_var);
5867 }
5868 }
5869 else
5870 {
5871 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5872 reduc_var, def0);
5873 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5874 /* Remove the statement, so that we can use the same code paths
5875 as for statements that we've just created. */
5876 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5877 gsi_remove (&tmp_gsi, false);
5878 }
5879
5880 if (i == vec_num - 1)
5881 {
5882 gimple_set_lhs (new_stmt, scalar_dest);
5883 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5884 new_stmt);
5885 }
5886 else
5887 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5888 new_stmt, gsi);
5889
5890 if (slp_node)
5891 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5892 }
5893
5894 if (!slp_node)
5895 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5896
5897 return true;
5898 }
5899
5900 /* Function is_nonwrapping_integer_induction.
5901
5902 Check that STMT_VINFO (which is part of loop LOOP) describes an integer
5903 induction that increments without causing overflow. */
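
/* Roughly, the check below computes

     max_value = base + step * max_stmt_executions (loop)

   in infinite precision and accepts the induction if that value still fits
   in the precision of the PHI result (or trivially if signed overflow is
   undefined for its type).  */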
5904
5905 static bool
5906 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5907 {
5908 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5909 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5910 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5911 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5912 widest_int ni, max_loop_value, lhs_max;
5913 wi::overflow_type overflow = wi::OVF_NONE;
5914
5915 /* Make sure the induction is integer based. */
5916 if (TREE_CODE (base) != INTEGER_CST
5917 || TREE_CODE (step) != INTEGER_CST)
5918 return false;
5919
5920 /* Check that the max size of the loop will not wrap. */
5921
5922 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5923 return true;
5924
5925 if (! max_stmt_executions (loop, &ni))
5926 return false;
5927
5928 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5929 &overflow);
5930 if (overflow)
5931 return false;
5932
5933 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5934 TYPE_SIGN (lhs_type), &overflow);
5935 if (overflow)
5936 return false;
5937
5938 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5939 <= TYPE_PRECISION (lhs_type));
5940 }
5941
5942 /* Function vectorizable_reduction.
5943
5944 Check if STMT_INFO performs a reduction operation that can be vectorized.
5945 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5946 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5947 Return true if STMT_INFO is vectorizable in this way.
5948
5949 This function also handles reduction idioms (patterns) that have been
5950 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5951 may be of this form:
5952 X = pattern_expr (arg0, arg1, ..., X)
5953 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5954 sequence that had been detected and replaced by the pattern-stmt
5955 (STMT_INFO).
5956
5957 This function also handles reduction of condition expressions, for example:
5958 for (int i = 0; i < N; i++)
5959 if (a[i] < value)
5960 last = a[i];
5961 This is handled by vectorising the loop and creating an additional vector
5962 containing the loop indexes for which "a[i] < value" was true. In the
5963 function epilogue this is reduced to a single max value and then used to
5964 index into the vector of results.
5965
5966 In some cases of reduction patterns, the type of the reduction variable X is
5967 different than the type of the other arguments of STMT_INFO.
5968 In such cases, the vectype that is used when transforming STMT_INFO into
5969 a vector stmt is different than the vectype that is used to determine the
5970 vectorization factor, because it consists of a different number of elements
5971 than the actual number of elements that are being operated upon in parallel.
5972
5973 For example, consider an accumulation of shorts into an int accumulator.
5974 On some targets it's possible to vectorize this pattern operating on 8
5975 shorts at a time (hence, the vectype for purposes of determining the
5976 vectorization factor should be V8HI); on the other hand, the vectype that
5977 is used to create the vector form is actually V4SI (the type of the result).
5978
5979 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5980 indicates what is the actual level of parallelism (V8HI in the example), so
5981 that the right vectorization factor would be derived. This vectype
5982 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5983 be used to create the vectorized stmt. The right vectype for the vectorized
5984 stmt is obtained from the type of the result X:
5985 get_vectype_for_scalar_type (TREE_TYPE (X))
5986
5987 This means that, contrary to "regular" reductions (or "regular" stmts in
5988 general), the following equation:
5989 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5990 does *NOT* necessarily hold for reduction patterns. */
5991
5992 bool
5993 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5994 stmt_vec_info *vec_stmt, slp_tree slp_node,
5995 slp_instance slp_node_instance,
5996 stmt_vector_for_cost *cost_vec)
5997 {
5998 tree vec_dest;
5999 tree scalar_dest;
6000 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6001 tree vectype_in = NULL_TREE;
6002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6003 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6004 enum tree_code code, orig_code;
6005 internal_fn reduc_fn;
6006 machine_mode vec_mode;
6007 int op_type;
6008 optab optab;
6009 tree new_temp = NULL_TREE;
6010 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6011 stmt_vec_info cond_stmt_vinfo = NULL;
6012 enum tree_code cond_reduc_op_code = ERROR_MARK;
6013 tree scalar_type;
6014 bool is_simple_use;
6015 int i;
6016 int ncopies;
6017 int epilog_copies;
6018 stmt_vec_info prev_stmt_info, prev_phi_info;
6019 bool single_defuse_cycle = false;
6020 stmt_vec_info new_stmt_info = NULL;
6021 int j;
6022 tree ops[3];
6023 enum vect_def_type dts[3];
6024 bool nested_cycle = false, found_nested_cycle_def = false;
6025 bool double_reduc = false;
6026 basic_block def_bb;
6027 struct loop * def_stmt_loop;
6028 tree def_arg;
6029 auto_vec<tree> vec_oprnds0;
6030 auto_vec<tree> vec_oprnds1;
6031 auto_vec<tree> vec_oprnds2;
6032 auto_vec<tree> vect_defs;
6033 auto_vec<stmt_vec_info> phis;
6034 int vec_num;
6035 tree def0, tem;
6036 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6037 tree cond_reduc_val = NULL_TREE;
6038
6039 /* Make sure it was already recognized as a reduction computation. */
6040 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6041 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6042 return false;
6043
6044 if (nested_in_vect_loop_p (loop, stmt_info))
6045 {
6046 loop = loop->inner;
6047 nested_cycle = true;
6048 }
6049
6050 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6051 gcc_assert (slp_node
6052 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6053
6054 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6055 {
6056 tree phi_result = gimple_phi_result (phi);
6057 /* Analysis is fully done on the reduction stmt invocation. */
6058 if (! vec_stmt)
6059 {
6060 if (slp_node)
6061 slp_node_instance->reduc_phis = slp_node;
6062
6063 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6064 return true;
6065 }
6066
6067 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6068 /* Leave the scalar phi in place. Note that checking
6069 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6070 for reductions involving a single statement. */
6071 return true;
6072
6073 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6074 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6075
6076 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6077 == EXTRACT_LAST_REDUCTION)
6078 /* Leave the scalar phi in place. */
6079 return true;
6080
6081 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6082 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6083 {
6084 tree op = gimple_op (reduc_stmt, k);
6085 if (op == phi_result)
6086 continue;
6087 if (k == 1
6088 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6089 continue;
6090 if (!vectype_in
6091 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6092 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6093 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6094 break;
6095 }
6096 /* For a nested cycle we might end up with an operation like
6097 phi_result * phi_result. */
6098 if (!vectype_in)
6099 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6100 gcc_assert (vectype_in);
6101
6102 if (slp_node)
6103 ncopies = 1;
6104 else
6105 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6106
6107 stmt_vec_info use_stmt_info;
6108 if (ncopies > 1
6109 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6110 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6111 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6112 single_defuse_cycle = true;
6113
6114 /* Create the destination vector */
6115 scalar_dest = gimple_assign_lhs (reduc_stmt);
6116 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6117
6118 if (slp_node)
6119 /* The size vect_schedule_slp_instance computes is off for us. */
6120 vec_num = vect_get_num_vectors
6121 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6122 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6123 vectype_in);
6124 else
6125 vec_num = 1;
6126
6127 /* Generate the reduction PHIs upfront. */
6128 prev_phi_info = NULL;
6129 for (j = 0; j < ncopies; j++)
6130 {
6131 if (j == 0 || !single_defuse_cycle)
6132 {
6133 for (i = 0; i < vec_num; i++)
6134 {
6135 /* Create the reduction-phi that defines the reduction
6136 operand. */
6137 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6138 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6139
6140 if (slp_node)
6141 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6142 else
6143 {
6144 if (j == 0)
6145 STMT_VINFO_VEC_STMT (stmt_info)
6146 = *vec_stmt = new_phi_info;
6147 else
6148 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6149 prev_phi_info = new_phi_info;
6150 }
6151 }
6152 }
6153 }
6154
6155 return true;
6156 }
6157
6158 /* 1. Is vectorizable reduction? */
6159 /* Not supportable if the reduction variable is used in the loop, unless
6160 it's a reduction chain. */
6161 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6162 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6163 return false;
6164
6165 /* Reductions that are not used even in an enclosing outer-loop
6166 are expected to be "live" (used out of the loop). */
6167 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6168 && !STMT_VINFO_LIVE_P (stmt_info))
6169 return false;
6170
6171 /* 2. Has this been recognized as a reduction pattern?
6172
6173 Check if STMT represents a pattern that has been recognized
6174 in earlier analysis stages. For stmts that represent a pattern,
6175 the STMT_VINFO_RELATED_STMT field records the last stmt in
6176 the original sequence that constitutes the pattern. */
6177
6178 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6179 if (orig_stmt_info)
6180 {
6181 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6182 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6183 }
6184
6185 /* 3. Check the operands of the operation. The first operands are defined
6186 inside the loop body. The last operand is the reduction variable,
6187 which is defined by the loop-header-phi. */
6188
6189 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6190
6191 /* Flatten RHS. */
6192 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6193 {
6194 case GIMPLE_BINARY_RHS:
6195 code = gimple_assign_rhs_code (stmt);
6196 op_type = TREE_CODE_LENGTH (code);
6197 gcc_assert (op_type == binary_op);
6198 ops[0] = gimple_assign_rhs1 (stmt);
6199 ops[1] = gimple_assign_rhs2 (stmt);
6200 break;
6201
6202 case GIMPLE_TERNARY_RHS:
6203 code = gimple_assign_rhs_code (stmt);
6204 op_type = TREE_CODE_LENGTH (code);
6205 gcc_assert (op_type == ternary_op);
6206 ops[0] = gimple_assign_rhs1 (stmt);
6207 ops[1] = gimple_assign_rhs2 (stmt);
6208 ops[2] = gimple_assign_rhs3 (stmt);
6209 break;
6210
6211 case GIMPLE_UNARY_RHS:
6212 return false;
6213
6214 default:
6215 gcc_unreachable ();
6216 }
6217
6218 if (code == COND_EXPR && slp_node)
6219 return false;
6220
6221 scalar_dest = gimple_assign_lhs (stmt);
6222 scalar_type = TREE_TYPE (scalar_dest);
6223 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6224 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6225 return false;
6226
6227 /* Do not try to vectorize bit-precision reductions. */
6228 if (!type_has_mode_precision_p (scalar_type))
6229 return false;
6230
6231 /* All uses but the last are expected to be defined in the loop.
6232 The last use is the reduction variable. In case of nested cycle this
6233 assumption is not true: we use reduc_index to record the index of the
6234 reduction variable. */
6235 stmt_vec_info reduc_def_info = NULL;
6236 int reduc_index = -1;
6237 for (i = 0; i < op_type; i++)
6238 {
6239 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6240 if (i == 0 && code == COND_EXPR)
6241 continue;
6242
6243 stmt_vec_info def_stmt_info;
6244 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6245 &def_stmt_info);
6246 dt = dts[i];
6247 gcc_assert (is_simple_use);
6248 if (dt == vect_reduction_def)
6249 {
6250 reduc_def_info = def_stmt_info;
6251 reduc_index = i;
6252 continue;
6253 }
6254 else if (tem)
6255 {
6256 /* To properly compute ncopies we are interested in the widest
6257 input type in case we're looking at a widening accumulation. */
6258 if (!vectype_in
6259 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6260 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6261 vectype_in = tem;
6262 }
6263
6264 if (dt != vect_internal_def
6265 && dt != vect_external_def
6266 && dt != vect_constant_def
6267 && dt != vect_induction_def
6268 && !(dt == vect_nested_cycle && nested_cycle))
6269 return false;
6270
6271 if (dt == vect_nested_cycle)
6272 {
6273 found_nested_cycle_def = true;
6274 reduc_def_info = def_stmt_info;
6275 reduc_index = i;
6276 }
6277
6278 if (i == 1 && code == COND_EXPR)
6279 {
6280 /* Record how the value of COND_EXPR is defined. */
6281 if (dt == vect_constant_def)
6282 {
6283 cond_reduc_dt = dt;
6284 cond_reduc_val = ops[i];
6285 }
6286 if (dt == vect_induction_def
6287 && def_stmt_info
6288 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6289 {
6290 cond_reduc_dt = dt;
6291 cond_stmt_vinfo = def_stmt_info;
6292 }
6293 }
6294 }
6295
6296 if (!vectype_in)
6297 vectype_in = vectype_out;
6298
6299 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6300 directly used in the stmt. */
6301 if (reduc_index == -1)
6302 {
6303 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6304 {
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6307 "in-order reduction chain without SLP.\n");
6308 return false;
6309 }
6310
6311 if (orig_stmt_info)
6312 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6313 else
6314 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6315 }
6316
6317 if (! reduc_def_info)
6318 return false;
6319
6320 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6321 if (!reduc_def_phi)
6322 return false;
6323
6324 if (!(reduc_index == -1
6325 || dts[reduc_index] == vect_reduction_def
6326 || dts[reduc_index] == vect_nested_cycle
6327 || ((dts[reduc_index] == vect_internal_def
6328 || dts[reduc_index] == vect_external_def
6329 || dts[reduc_index] == vect_constant_def
6330 || dts[reduc_index] == vect_induction_def)
6331 && nested_cycle && found_nested_cycle_def)))
6332 {
6333 /* For pattern recognized stmts, orig_stmt might be a reduction,
6334 but some helper statements for the pattern might not, or
6335 might be COND_EXPRs with reduction uses in the condition. */
6336 gcc_assert (orig_stmt_info);
6337 return false;
6338 }
6339
6340 /* PHIs should not participate in patterns. */
6341 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6342 enum vect_reduction_type v_reduc_type
6343 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6344 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6345
6346 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6347 /* If we have a condition reduction, see if we can simplify it further. */
6348 if (v_reduc_type == COND_REDUCTION)
6349 {
6350 /* TODO: We can't yet handle reduction chains, since we need to treat
6351 each COND_EXPR in the chain specially, not just the last one.
6352 E.g. for:
6353
6354 x_1 = PHI <x_3, ...>
6355 x_2 = a_2 ? ... : x_1;
6356 x_3 = a_3 ? ... : x_2;
6357
6358 we're interested in the last element in x_3 for which a_2 || a_3
6359 is true, whereas the current reduction chain handling would
6360 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6361 as a reduction operation. */
6362 if (reduc_index == -1)
6363 {
6364 if (dump_enabled_p ())
6365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6366 "conditional reduction chains not supported\n");
6367 return false;
6368 }
6369
6370 /* vect_is_simple_reduction ensured that operand 2 is the
6371 loop-carried operand. */
6372 gcc_assert (reduc_index == 2);
6373
6374 /* Loop peeling modifies the initial value of the reduction PHI, which
6375 makes the reduction stmt to be transformed differ from the
6376 original stmt analyzed. We need to record the reduction code for a
6377 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6378 it can be used directly at the transform stage. */
6379 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6380 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6381 {
6382 /* Also set the reduction type to CONST_COND_REDUCTION. */
6383 gcc_assert (cond_reduc_dt == vect_constant_def);
6384 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6385 }
6386 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6387 vectype_in, OPTIMIZE_FOR_SPEED))
6388 {
6389 if (dump_enabled_p ())
6390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6391 "optimizing condition reduction with"
6392 " FOLD_EXTRACT_LAST.\n");
6393 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6394 }
6395 else if (cond_reduc_dt == vect_induction_def)
6396 {
6397 tree base
6398 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6399 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6400
6401 gcc_assert (TREE_CODE (base) == INTEGER_CST
6402 && TREE_CODE (step) == INTEGER_CST);
6403 cond_reduc_val = NULL_TREE;
6404 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6405 above base; punt if base is the minimum value of the type for
6406 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
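	  /* For example, for a decreasing induction with base 10 the code
	     below picks MIN_EXPR and 11 (one above the base) as the marker,
	     while for a decreasing induction with a negative base it simply
	     picks 0; in either case the marker is a value the induction can
	     never take.  */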
6407 if (tree_int_cst_sgn (step) == -1)
6408 {
6409 cond_reduc_op_code = MIN_EXPR;
6410 if (tree_int_cst_sgn (base) == -1)
6411 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6412 else if (tree_int_cst_lt (base,
6413 TYPE_MAX_VALUE (TREE_TYPE (base))))
6414 cond_reduc_val
6415 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6416 }
6417 else
6418 {
6419 cond_reduc_op_code = MAX_EXPR;
6420 if (tree_int_cst_sgn (base) == 1)
6421 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6422 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6423 base))
6424 cond_reduc_val
6425 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6426 }
6427 if (cond_reduc_val)
6428 {
6429 if (dump_enabled_p ())
6430 dump_printf_loc (MSG_NOTE, vect_location,
6431 "condition expression based on "
6432 "integer induction.\n");
6433 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6434 = INTEGER_INDUC_COND_REDUCTION;
6435 }
6436 }
6437 else if (cond_reduc_dt == vect_constant_def)
6438 {
6439 enum vect_def_type cond_initial_dt;
6440 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6441 tree cond_initial_val
6442 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6443
6444 gcc_assert (cond_reduc_val != NULL_TREE);
6445 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6446 if (cond_initial_dt == vect_constant_def
6447 && types_compatible_p (TREE_TYPE (cond_initial_val),
6448 TREE_TYPE (cond_reduc_val)))
6449 {
6450 tree e = fold_binary (LE_EXPR, boolean_type_node,
6451 cond_initial_val, cond_reduc_val);
6452 if (e && (integer_onep (e) || integer_zerop (e)))
6453 {
6454 if (dump_enabled_p ())
6455 dump_printf_loc (MSG_NOTE, vect_location,
6456 "condition expression based on "
6457 "compile time constant.\n");
6458 /* Record reduction code at analysis stage. */
6459 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6460 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6461 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6462 = CONST_COND_REDUCTION;
6463 }
6464 }
6465 }
6466 }
6467
6468 if (orig_stmt_info)
6469 gcc_assert (tmp == orig_stmt_info
6470 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6471 else
6472 /* We changed STMT to be the first stmt in the reduction chain, hence we
6473 check that in this case the first element in the chain is STMT. */
6474 gcc_assert (tmp == stmt_info
6475 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6476
6477 if (STMT_VINFO_LIVE_P (reduc_def_info))
6478 return false;
6479
6480 if (slp_node)
6481 ncopies = 1;
6482 else
6483 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6484
6485 gcc_assert (ncopies >= 1);
6486
6487 vec_mode = TYPE_MODE (vectype_in);
6488 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6489
6490 if (nested_cycle)
6491 {
6492 def_bb = gimple_bb (reduc_def_phi);
6493 def_stmt_loop = def_bb->loop_father;
6494 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6495 loop_preheader_edge (def_stmt_loop));
6496 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6497 if (def_arg_stmt_info
6498 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6499 == vect_double_reduction_def))
6500 double_reduc = true;
6501 }
6502
6503 if (code == COND_EXPR)
6504 {
6505 /* Only call during the analysis stage, otherwise we'll lose
6506 STMT_VINFO_TYPE. We'll pass ops[0] as reduc_op; it's only
6507 used as a flag during analysis. */
6508 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6509 ops[0], 0, NULL,
6510 cost_vec))
6511 {
6512 if (dump_enabled_p ())
6513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6514 "unsupported condition in reduction\n");
6515 return false;
6516 }
6517 }
6518 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6519 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6520 {
6521 /* Only call during the analysis stage, otherwise we'll lose
6522 STMT_VINFO_TYPE. We only support this for nested cycles
6523 without double reductions at the moment. */
6524 if (!nested_cycle
6525 || double_reduc
6526 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6527 NULL, cost_vec)))
6528 {
6529 if (dump_enabled_p ())
6530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6531 "unsupported shift or rotation in reduction\n");
6532 return false;
6533 }
6534 }
6535 else
6536 {
6537 /* 4. Supportable by target? */
6538
6539 /* 4.1. check support for the operation in the loop */
6540 optab = optab_for_tree_code (code, vectype_in, optab_default);
6541 if (!optab)
6542 {
6543 if (dump_enabled_p ())
6544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6545 "no optab.\n");
6546
6547 return false;
6548 }
6549
6550 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6551 {
6552 if (dump_enabled_p ())
6553 dump_printf (MSG_NOTE, "op not supported by target.\n");
6554
6555 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6556 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6557 return false;
6558
6559 if (dump_enabled_p ())
6560 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6561 }
6562
6563 /* Worthwhile without SIMD support? */
6564 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6565 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6566 {
6567 if (dump_enabled_p ())
6568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6569 "not worthwhile without SIMD support.\n");
6570
6571 return false;
6572 }
6573 }
6574
6575 /* 4.2. Check support for the epilog operation.
6576
6577 If STMT represents a reduction pattern, then the type of the
6578 reduction variable may be different than the type of the rest
6579 of the arguments. For example, consider the case of accumulation
6580 of shorts into an int accumulator. The original code:
6581 S1: int_a = (int) short_a;
6582 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6583
6584 was replaced with:
6585 STMT: int_acc = widen_sum <short_a, int_acc>
6586
6587 This means that:
6588 1. The tree-code that is used to create the vector operation in the
6589 epilog code (that reduces the partial results) is not the
6590 tree-code of STMT, but is rather the tree-code of the original
6591 stmt from the pattern that STMT is replacing. I.e, in the example
6592 above we want to use 'widen_sum' in the loop, but 'plus' in the
6593 epilog.
6594 2. The type (mode) we use to check available target support
6595 for the vector operation to be created in the *epilog*, is
6596 determined by the type of the reduction variable (in the example
6597 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6598 However the type (mode) we use to check available target support
6599 for the vector operation to be created *inside the loop*, is
6600 determined by the type of the other arguments to STMT (in the
6601 example we'd check this: optab_handler (widen_sum_optab,
6602 vect_short_mode)).
6603
6604 This is contrary to "regular" reductions, in which the types of all
6605 the arguments are the same as the type of the reduction variable.
6606 For "regular" reductions we can therefore use the same vector type
6607 (and also the same tree-code) when generating the epilog code and
6608 when generating the code inside the loop. */
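
 /* As a concrete sketch (not necessarily the exact source GCC sees), the
 widen_sum example above corresponds roughly to scalar code like:

 short b[N];
 int acc = 0;
 for (i = 0; i < N; i++)
 acc += b[i]; <-- S1/S2 above: b[i] widened to int, added to acc

 Inside the loop the vectorized statement operates on vectors of shorts,
 while the epilog code that reduces the partial sums operates on vectors
 of ints. */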
6609
6610 vect_reduction_type reduction_type
6611 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6612 if (orig_stmt_info
6613 && (reduction_type == TREE_CODE_REDUCTION
6614 || reduction_type == FOLD_LEFT_REDUCTION))
6615 {
6616 /* This is a reduction pattern: get the vectype from the type of the
6617 reduction variable, and get the tree-code from orig_stmt. */
6618 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6619 gcc_assert (vectype_out);
6620 vec_mode = TYPE_MODE (vectype_out);
6621 }
6622 else
6623 {
6624 /* Regular reduction: the same vectype and tree-code that are used for
6625 the vector code inside the loop can also be used for the epilog code. */
6626 orig_code = code;
6627
6628 if (code == MINUS_EXPR)
6629 orig_code = PLUS_EXPR;
6630
6631 /* For simple condition reductions, replace with the actual expression
6632 we want to base our reduction around. */
6633 if (reduction_type == CONST_COND_REDUCTION)
6634 {
6635 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6636 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6637 }
6638 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6639 orig_code = cond_reduc_op_code;
6640 }
6641
6642 reduc_fn = IFN_LAST;
6643
6644 if (reduction_type == TREE_CODE_REDUCTION
6645 || reduction_type == FOLD_LEFT_REDUCTION
6646 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6647 || reduction_type == CONST_COND_REDUCTION)
6648 {
6649 if (reduction_type == FOLD_LEFT_REDUCTION
6650 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6651 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6652 {
6653 if (reduc_fn != IFN_LAST
6654 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6655 OPTIMIZE_FOR_SPEED))
6656 {
6657 if (dump_enabled_p ())
6658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6659 "reduc op not supported by target.\n");
6660
6661 reduc_fn = IFN_LAST;
6662 }
6663 }
6664 else
6665 {
6666 if (!nested_cycle || double_reduc)
6667 {
6668 if (dump_enabled_p ())
6669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670 "no reduc code for scalar code.\n");
6671
6672 return false;
6673 }
6674 }
6675 }
6676 else if (reduction_type == COND_REDUCTION)
6677 {
6678 int scalar_precision
6679 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6680 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6681 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6682 nunits_out);
6683
6684 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6685 OPTIMIZE_FOR_SPEED))
6686 reduc_fn = IFN_REDUC_MAX;
6687 }
6688
6689 if (reduction_type != EXTRACT_LAST_REDUCTION
6690 && (!nested_cycle || double_reduc)
6691 && reduc_fn == IFN_LAST
6692 && !nunits_out.is_constant ())
6693 {
6694 if (dump_enabled_p ())
6695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6696 "missing target support for reduction on"
6697 " variable-length vectors.\n");
6698 return false;
6699 }
6700
6701 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6702 && ncopies > 1)
6703 {
6704 if (dump_enabled_p ())
6705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706 "multiple types in double reduction or condition "
6707 "reduction.\n");
6708 return false;
6709 }
6710
6711 /* For SLP reductions, see if there is a neutral value we can use. */
6712 tree neutral_op = NULL_TREE;
6713 if (slp_node)
6714 neutral_op = neutral_op_for_slp_reduction
6715 (slp_node_instance->reduc_phis, code,
6716 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6717
6718 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6719 {
6720 /* We can't support in-order reductions of code such as this:
6721
6722 for (int i = 0; i < n1; ++i)
6723 for (int j = 0; j < n2; ++j)
6724 l += a[j];
6725
6726 since GCC effectively transforms the loop when vectorizing:
6727
6728 for (int i = 0; i < n1 / VF; ++i)
6729 for (int j = 0; j < n2; ++j)
6730 for (int k = 0; k < VF; ++k)
6731 l += a[j];
6732
6733 which is a reassociation of the original operation. */
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "in-order double reduction not supported.\n");
6737
6738 return false;
6739 }
6740
6741 if (reduction_type == FOLD_LEFT_REDUCTION
6742 && slp_node
6743 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6744 {
6745 /* We cannot use in-order reductions in this case because there is
6746 an implicit reassociation of the operations involved. */
6747 if (dump_enabled_p ())
6748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6749 "in-order unchained SLP reductions not supported.\n");
6750 return false;
6751 }
6752
6753 /* For double reductions, and for SLP reductions with a neutral value,
6754 we construct a variable-length initial vector by loading a vector
6755 full of the neutral value and then shift-and-inserting the start
6756 values into the low-numbered elements. */
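 /* For example (a sketch, assuming a PLUS reduction with neutral value 0
 and a single initial value INIT), the initial vector is built roughly as

 vec = { 0, 0, 0, ... }; splat of the neutral value
 vec = VEC_SHL_INSERT (vec, INIT); now { INIT, 0, 0, ... }

 so that the low-numbered lane carries the start value and every other
 lane contributes nothing to the final result. */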
6757 if ((double_reduc || neutral_op)
6758 && !nunits_out.is_constant ()
6759 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6760 vectype_out, OPTIMIZE_FOR_SPEED))
6761 {
6762 if (dump_enabled_p ())
6763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6764 "reduction on variable-length vectors requires"
6765 " target support for a vector-shift-and-insert"
6766 " operation.\n");
6767 return false;
6768 }
6769
6770 /* Check extra constraints for variable-length unchained SLP reductions. */
6771 if (STMT_SLP_TYPE (stmt_info)
6772 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6773 && !nunits_out.is_constant ())
6774 {
6775 /* We checked above that we could build the initial vector when
6776 there's a neutral element value. Check here for the case in
6777 which each SLP statement has its own initial value and in which
6778 that value needs to be repeated for every instance of the
6779 statement within the initial vector. */
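 /* For instance (illustrative only), for a group of two reduction
 statements with initial values a0 and b0 and no neutral value, the
 initial vector has to look like

 { a0, b0, a0, b0, ... }

 i.e. the pair of start values repeated across the whole vector, which
 is what can_duplicate_and_interleave_p checks we can build. */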
6780 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6781 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6782 if (!neutral_op
6783 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6784 {
6785 if (dump_enabled_p ())
6786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6787 "unsupported form of SLP reduction for"
6788 " variable-length vectors: cannot build"
6789 " initial vector.\n");
6790 return false;
6791 }
6792 /* The epilogue code relies on the number of elements being a multiple
6793 of the group size. The duplicate-and-interleave approach to setting
6794 up the initial vector does too. */
6795 if (!multiple_p (nunits_out, group_size))
6796 {
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "unsupported form of SLP reduction for"
6800 " variable-length vectors: the vector size"
6801 " is not a multiple of the number of results.\n");
6802 return false;
6803 }
6804 }
6805
6806 /* In case of a widening multiplication by a constant, we update the type
6807 of the constant to be the type of the other operand. We check that the
6808 constant fits the type in the pattern recognition pass. */
6809 if (code == DOT_PROD_EXPR
6810 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6811 {
6812 if (TREE_CODE (ops[0]) == INTEGER_CST)
6813 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6814 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6815 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6816 else
6817 {
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820 "invalid types in dot-prod\n");
6821
6822 return false;
6823 }
6824 }
6825
6826 if (reduction_type == COND_REDUCTION)
6827 {
6828 widest_int ni;
6829
6830 if (! max_loop_iterations (loop, &ni))
6831 {
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_NOTE, vect_location,
6834 "loop count not known, cannot create cond "
6835 "reduction.\n");
6836 return false;
6837 }
6838 /* Convert backedges to iterations. */
6839 ni += 1;
6840
6841 /* The additional index will be the same type as the condition. Check
6842 that the loop iteration count fits into this type less one (we use up
6843 the zero slot for the case in which there are no matches). */
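 /* E.g. with an 8-bit unsigned index type (maximum value 255), loops of
 at most 254 iterations can be handled this way. */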
6844 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6845 if (wi::geu_p (ni, wi::to_widest (max_index)))
6846 {
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_NOTE, vect_location,
6849 "loop size is greater than data size.\n");
6850 return false;
6851 }
6852 }
6853
6854 /* In case the vectorization factor (VF) is bigger than the number
6855 of elements that we can fit in a vectype (nunits), we have to generate
6856 more than one vector stmt, i.e. we need to "unroll" the
6857 vector stmt by a factor VF/nunits. For more details see documentation
6858 in vectorizable_operation. */
6859
6860 /* If the reduction is used in an outer loop we need to generate
6861 VF intermediate results, like so (e.g. for ncopies=2):
6862 r0 = phi (init, r0)
6863 r1 = phi (init, r1)
6864 r0 = x0 + r0;
6865 r1 = x1 + r1;
6866 (i.e. we generate VF results in 2 registers).
6867 In this case we have a separate def-use cycle for each copy, and therefore
6868 for each copy we get the vector def for the reduction variable from the
6869 respective phi node created for this copy.
6870
6871 Otherwise (the reduction is unused in the loop nest), we can combine
6872 together intermediate results, like so (e.g. for ncopies=2):
6873 r = phi (init, r)
6874 r = x0 + r;
6875 r = x1 + r;
6876 (i.e. we generate VF/2 results in a single register).
6877 In this case for each copy we get the vector def for the reduction variable
6878 from the vectorized reduction operation generated in the previous iteration.
6879
6880 This only works when we see both the reduction PHI and its only consumer
6881 in vectorizable_reduction and there are no intermediate stmts
6882 participating. */
6883 stmt_vec_info use_stmt_info;
6884 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6885 if (ncopies > 1
6886 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6887 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6888 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6889 {
6890 single_defuse_cycle = true;
6891 epilog_copies = 1;
6892 }
6893 else
6894 epilog_copies = ncopies;
6895
6896 /* If the reduction stmt is one of the patterns that have a lane-reducing
6897 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6898 if ((ncopies > 1
6899 && ! single_defuse_cycle)
6900 && (code == DOT_PROD_EXPR
6901 || code == WIDEN_SUM_EXPR
6902 || code == SAD_EXPR))
6903 {
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "multi def-use cycle not possible for lane-reducing "
6907 "reduction operation\n");
6908 return false;
6909 }
6910
6911 if (slp_node)
6912 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6913 else
6914 vec_num = 1;
6915
6916 internal_fn cond_fn = get_conditional_internal_fn (code);
6917 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6918
6919 if (!vec_stmt) /* transformation not required. */
6920 {
6921 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6922 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6923 {
6924 if (reduction_type != FOLD_LEFT_REDUCTION
6925 && (cond_fn == IFN_LAST
6926 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6927 OPTIMIZE_FOR_SPEED)))
6928 {
6929 if (dump_enabled_p ())
6930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6931 "can't use a fully-masked loop because no"
6932 " conditional operation is available.\n");
6933 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6934 }
6935 else if (reduc_index == -1)
6936 {
6937 if (dump_enabled_p ())
6938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6939 "can't use a fully-masked loop for chained"
6940 " reductions.\n");
6941 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6942 }
6943 else
6944 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6945 vectype_in);
6946 }
6947 if (dump_enabled_p ()
6948 && reduction_type == FOLD_LEFT_REDUCTION)
6949 dump_printf_loc (MSG_NOTE, vect_location,
6950 "using an in-order (fold-left) reduction.\n");
6951 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6952 return true;
6953 }
6954
6955 /* Transform. */
6956
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6959
6960 /* FORNOW: Multiple types are not supported for condition. */
6961 if (code == COND_EXPR)
6962 gcc_assert (ncopies == 1);
6963
6964 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6965
6966 if (reduction_type == FOLD_LEFT_REDUCTION)
6967 return vectorize_fold_left_reduction
6968 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6969 reduc_fn, ops, vectype_in, reduc_index, masks);
6970
6971 if (reduction_type == EXTRACT_LAST_REDUCTION)
6972 {
6973 gcc_assert (!slp_node);
6974 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6975 NULL, reduc_index, NULL, NULL);
6976 }
6977
6978 /* Create the destination vector */
6979 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6980
6981 prev_stmt_info = NULL;
6982 prev_phi_info = NULL;
6983 if (!slp_node)
6984 {
6985 vec_oprnds0.create (1);
6986 vec_oprnds1.create (1);
6987 if (op_type == ternary_op)
6988 vec_oprnds2.create (1);
6989 }
6990
6991 phis.create (vec_num);
6992 vect_defs.create (vec_num);
6993 if (!slp_node)
6994 vect_defs.quick_push (NULL_TREE);
6995
6996 if (slp_node)
6997 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6998 else
6999 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7000
7001 for (j = 0; j < ncopies; j++)
7002 {
7003 if (code == COND_EXPR)
7004 {
7005 gcc_assert (!slp_node);
7006 vectorizable_condition (stmt_info, gsi, vec_stmt,
7007 PHI_RESULT (phis[0]->stmt),
7008 reduc_index, NULL, NULL);
7009 /* Multiple types are not supported for condition. */
7010 break;
7011 }
7012 if (code == LSHIFT_EXPR
7013 || code == RSHIFT_EXPR)
7014 {
7015 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7016 break;
7017 }
7018
7019 /* Handle uses. */
7020 if (j == 0)
7021 {
7022 if (slp_node)
7023 {
7024 /* Get vec defs for all the operands except the reduction index,
7025 ensuring the ordering of the ops in the vector is kept. */
7026 auto_vec<tree, 3> slp_ops;
7027 auto_vec<vec<tree>, 3> vec_defs;
7028
7029 slp_ops.quick_push (ops[0]);
7030 slp_ops.quick_push (ops[1]);
7031 if (op_type == ternary_op)
7032 slp_ops.quick_push (ops[2]);
7033
7034 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7035
7036 vec_oprnds0.safe_splice (vec_defs[0]);
7037 vec_defs[0].release ();
7038 vec_oprnds1.safe_splice (vec_defs[1]);
7039 vec_defs[1].release ();
7040 if (op_type == ternary_op)
7041 {
7042 vec_oprnds2.safe_splice (vec_defs[2]);
7043 vec_defs[2].release ();
7044 }
7045 }
7046 else
7047 {
7048 vec_oprnds0.quick_push
7049 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7050 vec_oprnds1.quick_push
7051 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7052 if (op_type == ternary_op)
7053 vec_oprnds2.quick_push
7054 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7055 }
7056 }
7057 else
7058 {
7059 if (!slp_node)
7060 {
7061 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7062
7063 if (single_defuse_cycle && reduc_index == 0)
7064 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7065 else
7066 vec_oprnds0[0]
7067 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7068 vec_oprnds0[0]);
7069 if (single_defuse_cycle && reduc_index == 1)
7070 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7071 else
7072 vec_oprnds1[0]
7073 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7074 vec_oprnds1[0]);
7075 if (op_type == ternary_op)
7076 {
7077 if (single_defuse_cycle && reduc_index == 2)
7078 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7079 else
7080 vec_oprnds2[0]
7081 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7082 vec_oprnds2[0]);
7083 }
7084 }
7085 }
7086
7087 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7088 {
7089 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7090 if (masked_loop_p)
7091 {
7092 /* Make sure that the reduction accumulator is vop[0]. */
7093 if (reduc_index == 1)
7094 {
7095 gcc_assert (commutative_tree_code (code));
7096 std::swap (vop[0], vop[1]);
7097 }
7098 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7099 vectype_in, i * ncopies + j);
7100 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7101 vop[0], vop[1],
7102 vop[0]);
7103 new_temp = make_ssa_name (vec_dest, call);
7104 gimple_call_set_lhs (call, new_temp);
7105 gimple_call_set_nothrow (call, true);
7106 new_stmt_info
7107 = vect_finish_stmt_generation (stmt_info, call, gsi);
7108 }
7109 else
7110 {
7111 if (op_type == ternary_op)
7112 vop[2] = vec_oprnds2[i];
7113
7114 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7115 vop[0], vop[1], vop[2]);
7116 new_temp = make_ssa_name (vec_dest, new_stmt);
7117 gimple_assign_set_lhs (new_stmt, new_temp);
7118 new_stmt_info
7119 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7120 }
7121
7122 if (slp_node)
7123 {
7124 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7125 vect_defs.quick_push (new_temp);
7126 }
7127 else
7128 vect_defs[0] = new_temp;
7129 }
7130
7131 if (slp_node)
7132 continue;
7133
7134 if (j == 0)
7135 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7136 else
7137 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7138
7139 prev_stmt_info = new_stmt_info;
7140 }
7141
7142 /* Finalize the reduction-phi (set its arguments) and create the
7143 epilog reduction code. */
7144 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7145 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7146
7147 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7148 epilog_copies, reduc_fn, phis,
7149 double_reduc, slp_node, slp_node_instance,
7150 cond_reduc_val, cond_reduc_op_code,
7151 neutral_op);
7152
7153 return true;
7154 }
7155
7156 /* Function vect_min_worthwhile_factor.
7157
7158 For a loop where we could vectorize the operation indicated by CODE,
7159 return the minimum vectorization factor that makes it worthwhile
7160 to use generic vectors. */
7161 static unsigned int
7162 vect_min_worthwhile_factor (enum tree_code code)
7163 {
7164 switch (code)
7165 {
7166 case PLUS_EXPR:
7167 case MINUS_EXPR:
7168 case NEGATE_EXPR:
7169 return 4;
7170
7171 case BIT_AND_EXPR:
7172 case BIT_IOR_EXPR:
7173 case BIT_XOR_EXPR:
7174 case BIT_NOT_EXPR:
7175 return 2;
7176
7177 default:
7178 return INT_MAX;
7179 }
7180 }
7181
7182 /* Return true if VINFO indicates we are doing loop vectorization and if
7183 it is worth decomposing CODE operations into scalar operations for
7184 that loop's vectorization factor. */
7185
7186 bool
7187 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7188 {
7189 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7190 unsigned HOST_WIDE_INT value;
7191 return (loop_vinfo
7192 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7193 && value >= vect_min_worthwhile_factor (code));
7194 }
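
/* For example, with LOOP_VINFO_VECT_FACTOR == 2:

 vect_worthwhile_without_simd_p (vinfo, PLUS_EXPR) => false (needs >= 4)
 vect_worthwhile_without_simd_p (vinfo, BIT_AND_EXPR) => true (needs >= 2)

 (Illustrative only; the thresholds come from vect_min_worthwhile_factor
 above.) */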
7195
7196 /* Function vectorizable_induction
7197
7198 Check if STMT_INFO performs an induction computation that can be vectorized.
7199 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7200 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7201 Return true if STMT_INFO is vectorizable in this way. */
7202
7203 bool
7204 vectorizable_induction (stmt_vec_info stmt_info,
7205 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7206 stmt_vec_info *vec_stmt, slp_tree slp_node,
7207 stmt_vector_for_cost *cost_vec)
7208 {
7209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7211 unsigned ncopies;
7212 bool nested_in_vect_loop = false;
7213 struct loop *iv_loop;
7214 tree vec_def;
7215 edge pe = loop_preheader_edge (loop);
7216 basic_block new_bb;
7217 tree new_vec, vec_init, vec_step, t;
7218 tree new_name;
7219 gimple *new_stmt;
7220 gphi *induction_phi;
7221 tree induc_def, vec_dest;
7222 tree init_expr, step_expr;
7223 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7224 unsigned i;
7225 tree expr;
7226 gimple_seq stmts;
7227 imm_use_iterator imm_iter;
7228 use_operand_p use_p;
7229 gimple *exit_phi;
7230 edge latch_e;
7231 tree loop_arg;
7232 gimple_stmt_iterator si;
7233
7234 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7235 if (!phi)
7236 return false;
7237
7238 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7239 return false;
7240
7241 /* Make sure it was recognized as induction computation. */
7242 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7243 return false;
7244
7245 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7246 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7247
7248 if (slp_node)
7249 ncopies = 1;
7250 else
7251 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7252 gcc_assert (ncopies >= 1);
7253
7254 /* FORNOW. These restrictions should be relaxed. */
7255 if (nested_in_vect_loop_p (loop, stmt_info))
7256 {
7257 imm_use_iterator imm_iter;
7258 use_operand_p use_p;
7259 gimple *exit_phi;
7260 edge latch_e;
7261 tree loop_arg;
7262
7263 if (ncopies > 1)
7264 {
7265 if (dump_enabled_p ())
7266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7267 "multiple types in nested loop.\n");
7268 return false;
7269 }
7270
7271 /* FORNOW: outer loop induction with SLP not supported. */
7272 if (STMT_SLP_TYPE (stmt_info))
7273 return false;
7274
7275 exit_phi = NULL;
7276 latch_e = loop_latch_edge (loop->inner);
7277 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7278 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7279 {
7280 gimple *use_stmt = USE_STMT (use_p);
7281 if (is_gimple_debug (use_stmt))
7282 continue;
7283
7284 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7285 {
7286 exit_phi = use_stmt;
7287 break;
7288 }
7289 }
7290 if (exit_phi)
7291 {
7292 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7293 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7294 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7295 {
7296 if (dump_enabled_p ())
7297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7298 "inner-loop induction only used outside "
7299 "of the outer vectorized loop.\n");
7300 return false;
7301 }
7302 }
7303
7304 nested_in_vect_loop = true;
7305 iv_loop = loop->inner;
7306 }
7307 else
7308 iv_loop = loop;
7309 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7310
7311 if (slp_node && !nunits.is_constant ())
7312 {
7313 /* The current SLP code creates the initial value element-by-element. */
7314 if (dump_enabled_p ())
7315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316 "SLP induction not supported for variable-length"
7317 " vectors.\n");
7318 return false;
7319 }
7320
7321 if (!vec_stmt) /* transformation not required. */
7322 {
7323 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7324 DUMP_VECT_SCOPE ("vectorizable_induction");
7325 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7326 return true;
7327 }
7328
7329 /* Transform. */
7330
7331 /* Compute a vector variable, initialized with the first VF values of
7332 the induction variable. E.g., for an iv with IV_PHI='X' and
7333 evolution S, for a vector of 4 units, we want to compute:
7334 [X, X + S, X + 2*S, X + 3*S]. */
7335
7336 if (dump_enabled_p ())
7337 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7338
7339 latch_e = loop_latch_edge (iv_loop);
7340 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7341
7342 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7343 gcc_assert (step_expr != NULL_TREE);
7344
7345 pe = loop_preheader_edge (iv_loop);
7346 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7347 loop_preheader_edge (iv_loop));
7348
7349 stmts = NULL;
7350 if (!nested_in_vect_loop)
7351 {
7352 /* Convert the initial value to the desired type. */
7353 tree new_type = TREE_TYPE (vectype);
7354 init_expr = gimple_convert (&stmts, new_type, init_expr);
7355
7356 /* If we are using the loop mask to "peel" for alignment then we need
7357 to adjust the start value here. */
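 /* Schematically, if the first SKIP_NITERS scalar iterations are inactive
 (masked off), the vector IV has to start SKIP_NITERS steps early so that
 the first active lane still sees the original initial value:

 init_expr = init_expr - skip_niters * step_expr;

 which is what the code below builds. */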
7358 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7359 if (skip_niters != NULL_TREE)
7360 {
7361 if (FLOAT_TYPE_P (vectype))
7362 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7363 skip_niters);
7364 else
7365 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7366 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7367 skip_niters, step_expr);
7368 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7369 init_expr, skip_step);
7370 }
7371 }
7372
7373 /* Convert the step to the desired type. */
7374 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7375
7376 if (stmts)
7377 {
7378 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7379 gcc_assert (!new_bb);
7380 }
7381
7382 /* Find the first insertion point in the BB. */
7383 basic_block bb = gimple_bb (phi);
7384 si = gsi_after_labels (bb);
7385
7386 /* For SLP induction we have to generate several IVs; for example,
7387 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7388 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7389 [VF*S, VF*S, VF*S, VF*S] for all. */
7390 if (slp_node)
7391 {
7392 /* Enforced above. */
7393 unsigned int const_nunits = nunits.to_constant ();
7394
7395 /* Generate [VF*S, VF*S, ... ]. */
7396 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7397 {
7398 expr = build_int_cst (integer_type_node, vf);
7399 expr = fold_convert (TREE_TYPE (step_expr), expr);
7400 }
7401 else
7402 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7403 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7404 expr, step_expr);
7405 if (! CONSTANT_CLASS_P (new_name))
7406 new_name = vect_init_vector (stmt_info, new_name,
7407 TREE_TYPE (step_expr), NULL);
7408 new_vec = build_vector_from_val (vectype, new_name);
7409 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7410
7411 /* Now generate the IVs. */
7412 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7413 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7414 unsigned elts = const_nunits * nvects;
7415 unsigned nivs = least_common_multiple (group_size,
7416 const_nunits) / const_nunits;
7417 gcc_assert (elts % group_size == 0);
7418 tree elt = init_expr;
7419 unsigned ivn;
7420 for (ivn = 0; ivn < nivs; ++ivn)
7421 {
7422 tree_vector_builder elts (vectype, const_nunits, 1);
7423 stmts = NULL;
7424 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7425 {
7426 if (ivn*const_nunits + eltn >= group_size
7427 && (ivn * const_nunits + eltn) % group_size == 0)
7428 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7429 elt, step_expr);
7430 elts.quick_push (elt);
7431 }
7432 vec_init = gimple_build_vector (&stmts, &elts);
7433 if (stmts)
7434 {
7435 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7436 gcc_assert (!new_bb);
7437 }
7438
7439 /* Create the induction-phi that defines the induction-operand. */
7440 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7441 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7442 stmt_vec_info induction_phi_info
7443 = loop_vinfo->add_stmt (induction_phi);
7444 induc_def = PHI_RESULT (induction_phi);
7445
7446 /* Create the iv update inside the loop */
7447 vec_def = make_ssa_name (vec_dest);
7448 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7449 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7450 loop_vinfo->add_stmt (new_stmt);
7451
7452 /* Set the arguments of the phi node: */
7453 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7454 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7455 UNKNOWN_LOCATION);
7456
7457 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7458 }
7459
7460 /* Re-use IVs when we can. */
7461 if (ivn < nvects)
7462 {
7463 unsigned vfp
7464 = least_common_multiple (group_size, const_nunits) / group_size;
7465 /* Generate [VF'*S, VF'*S, ... ]. */
7466 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7467 {
7468 expr = build_int_cst (integer_type_node, vfp);
7469 expr = fold_convert (TREE_TYPE (step_expr), expr);
7470 }
7471 else
7472 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7473 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7474 expr, step_expr);
7475 if (! CONSTANT_CLASS_P (new_name))
7476 new_name = vect_init_vector (stmt_info, new_name,
7477 TREE_TYPE (step_expr), NULL);
7478 new_vec = build_vector_from_val (vectype, new_name);
7479 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7480 for (; ivn < nvects; ++ivn)
7481 {
7482 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7483 tree def;
7484 if (gimple_code (iv) == GIMPLE_PHI)
7485 def = gimple_phi_result (iv);
7486 else
7487 def = gimple_assign_lhs (iv);
7488 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7489 PLUS_EXPR,
7490 def, vec_step);
7491 if (gimple_code (iv) == GIMPLE_PHI)
7492 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7493 else
7494 {
7495 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7496 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7497 }
7498 SLP_TREE_VEC_STMTS (slp_node).quick_push
7499 (loop_vinfo->add_stmt (new_stmt));
7500 }
7501 }
7502
7503 return true;
7504 }
7505
7506 /* Create the vector that holds the initial_value of the induction. */
7507 if (nested_in_vect_loop)
7508 {
7509 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7510 been created during vectorization of previous stmts. We obtain it
7511 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7512 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7513 /* If the initial value is not of proper type, convert it. */
7514 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7515 {
7516 new_stmt
7517 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7518 vect_simple_var,
7519 "vec_iv_"),
7520 VIEW_CONVERT_EXPR,
7521 build1 (VIEW_CONVERT_EXPR, vectype,
7522 vec_init));
7523 vec_init = gimple_assign_lhs (new_stmt);
7524 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7525 new_stmt);
7526 gcc_assert (!new_bb);
7527 loop_vinfo->add_stmt (new_stmt);
7528 }
7529 }
7530 else
7531 {
7532 /* iv_loop is the loop to be vectorized. Create:
7533 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7534 stmts = NULL;
7535 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7536
7537 unsigned HOST_WIDE_INT const_nunits;
7538 if (nunits.is_constant (&const_nunits))
7539 {
7540 tree_vector_builder elts (vectype, const_nunits, 1);
7541 elts.quick_push (new_name);
7542 for (i = 1; i < const_nunits; i++)
7543 {
7544 /* Create: new_name_i = new_name + step_expr */
7545 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7546 new_name, step_expr);
7547 elts.quick_push (new_name);
7548 }
7549 /* Create a vector from [new_name_0, new_name_1, ...,
7550 new_name_nunits-1] */
7551 vec_init = gimple_build_vector (&stmts, &elts);
7552 }
7553 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7554 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7555 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7556 new_name, step_expr);
7557 else
7558 {
7559 /* Build:
7560 [base, base, base, ...]
7561 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7562 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7563 gcc_assert (flag_associative_math);
7564 tree index = build_index_vector (vectype, 0, 1);
7565 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7566 new_name);
7567 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7568 step_expr);
7569 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7570 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7571 vec_init, step_vec);
7572 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7573 vec_init, base_vec);
7574 }
7575
7576 if (stmts)
7577 {
7578 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7579 gcc_assert (!new_bb);
7580 }
7581 }
7582
7583
7584 /* Create the vector that holds the step of the induction. */
7585 if (nested_in_vect_loop)
7586 /* iv_loop is nested in the loop to be vectorized. Generate:
7587 vec_step = [S, S, S, S] */
7588 new_name = step_expr;
7589 else
7590 {
7591 /* iv_loop is the loop to be vectorized. Generate:
7592 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7593 gimple_seq seq = NULL;
7594 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7595 {
7596 expr = build_int_cst (integer_type_node, vf);
7597 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7598 }
7599 else
7600 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7601 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7602 expr, step_expr);
7603 if (seq)
7604 {
7605 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7606 gcc_assert (!new_bb);
7607 }
7608 }
7609
7610 t = unshare_expr (new_name);
7611 gcc_assert (CONSTANT_CLASS_P (new_name)
7612 || TREE_CODE (new_name) == SSA_NAME);
7613 new_vec = build_vector_from_val (vectype, t);
7614 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7615
7616
7617 /* Create the following def-use cycle:
7618 loop prolog:
7619 vec_init = ...
7620 vec_step = ...
7621 loop:
7622 vec_iv = PHI <vec_init, vec_loop>
7623 ...
7624 STMT
7625 ...
7626 vec_loop = vec_iv + vec_step; */
7627
7628 /* Create the induction-phi that defines the induction-operand. */
7629 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7630 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7631 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7632 induc_def = PHI_RESULT (induction_phi);
7633
7634 /* Create the iv update inside the loop */
7635 vec_def = make_ssa_name (vec_dest);
7636 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7637 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7638 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7639
7640 /* Set the arguments of the phi node: */
7641 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7642 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7643 UNKNOWN_LOCATION);
7644
7645 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7646
7647 /* In case the vectorization factor (VF) is bigger than the number
7648 of elements that we can fit in a vectype (nunits), we have to generate
7649 more than one vector stmt, i.e. we need to "unroll" the
7650 vector stmt by a factor VF/nunits. For more details see documentation
7651 in vectorizable_operation. */
7652
7653 if (ncopies > 1)
7654 {
7655 gimple_seq seq = NULL;
7656 stmt_vec_info prev_stmt_vinfo;
7657 /* FORNOW. This restriction should be relaxed. */
7658 gcc_assert (!nested_in_vect_loop);
7659
7660 /* Create the vector that holds the step of the induction. */
7661 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7662 {
7663 expr = build_int_cst (integer_type_node, nunits);
7664 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7665 }
7666 else
7667 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7668 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7669 expr, step_expr);
7670 if (seq)
7671 {
7672 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7673 gcc_assert (!new_bb);
7674 }
7675
7676 t = unshare_expr (new_name);
7677 gcc_assert (CONSTANT_CLASS_P (new_name)
7678 || TREE_CODE (new_name) == SSA_NAME);
7679 new_vec = build_vector_from_val (vectype, t);
7680 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7681
7682 vec_def = induc_def;
7683 prev_stmt_vinfo = induction_phi_info;
7684 for (i = 1; i < ncopies; i++)
7685 {
7686 /* vec_i = vec_prev + vec_step */
7687 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7688 vec_def, vec_step);
7689 vec_def = make_ssa_name (vec_dest, new_stmt);
7690 gimple_assign_set_lhs (new_stmt, vec_def);
7691
7692 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7693 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7694 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7695 prev_stmt_vinfo = new_stmt_info;
7696 }
7697 }
7698
7699 if (nested_in_vect_loop)
7700 {
7701 /* Find the loop-closed exit-phi of the induction, and record
7702 the final vector of induction results: */
7703 exit_phi = NULL;
7704 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7705 {
7706 gimple *use_stmt = USE_STMT (use_p);
7707 if (is_gimple_debug (use_stmt))
7708 continue;
7709
7710 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7711 {
7712 exit_phi = use_stmt;
7713 break;
7714 }
7715 }
7716 if (exit_phi)
7717 {
7718 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7719 /* FORNOW. Currently not supporting the case that an inner-loop induction
7720 is not used in the outer-loop (i.e. only outside the outer-loop). */
7721 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7722 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7723
7724 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7725 if (dump_enabled_p ())
7726 dump_printf_loc (MSG_NOTE, vect_location,
7727 "vector of inductions after inner-loop:%G",
7728 new_stmt);
7729 }
7730 }
7731
7732
7733 if (dump_enabled_p ())
7734 dump_printf_loc (MSG_NOTE, vect_location,
7735 "transform induction: created def-use cycle: %G%G",
7736 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7737
7738 return true;
7739 }
7740
7741 /* Function vectorizable_live_operation.
7742
7743 STMT_INFO computes a value that is used outside the loop. Check if
7744 it can be supported. */
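
/* A typical example (a sketch only) is a loop whose final value is used
 after the loop:

 int last;
 for (i = 0; i < n; i++)
 last = a[i] + b[i];
 use (last);

 After vectorization the live-out value of LAST is extracted from the last
 lane of the final vector result (or via EXTRACT_LAST when the loop is
 fully masked), and uses of LAST outside the loop are redirected to that
 extracted value. */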
7745
7746 bool
7747 vectorizable_live_operation (stmt_vec_info stmt_info,
7748 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7749 slp_tree slp_node, int slp_index,
7750 stmt_vec_info *vec_stmt,
7751 stmt_vector_for_cost *)
7752 {
7753 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7754 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7755 imm_use_iterator imm_iter;
7756 tree lhs, lhs_type, bitsize, vec_bitsize;
7757 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7758 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7759 int ncopies;
7760 gimple *use_stmt;
7761 auto_vec<tree> vec_oprnds;
7762 int vec_entry = 0;
7763 poly_uint64 vec_index = 0;
7764
7765 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7766
7767 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7768 return false;
7769
7770 /* FORNOW. CHECKME. */
7771 if (nested_in_vect_loop_p (loop, stmt_info))
7772 return false;
7773
7774 /* If STMT is not relevant and it is a simple assignment and its inputs are
7775 invariant then it can remain in place, unvectorized. The original last
7776 scalar value that it computes will be used. */
7777 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7778 {
7779 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7780 if (dump_enabled_p ())
7781 dump_printf_loc (MSG_NOTE, vect_location,
7782 "statement is simple and uses invariant. Leaving in "
7783 "place.\n");
7784 return true;
7785 }
7786
7787 if (slp_node)
7788 ncopies = 1;
7789 else
7790 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7791
7792 if (slp_node)
7793 {
7794 gcc_assert (slp_index >= 0);
7795
7796 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7797 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7798
7799 /* Get the last occurrence of the scalar statement with index SLP_INDEX
7800 in the concatenation of all the SLP vectors, then calculate which SLP
7801 vector it falls in and the lane within that vector. */
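 /* For example (illustrative numbers): with two vectors of four lanes
 (num_vec = 2, nunits = 4) and an SLP group of two scalar statements
 (num_scalar = 2), the statement with slp_index 0 last occurs at
 position 8 - 2 + 0 = 6, i.e. lane 2 of vector 1. */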
7802 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7803
7804 /* Calculate which vector contains the result, and which lane of
7805 that vector we need. */
7806 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7807 {
7808 if (dump_enabled_p ())
7809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7810 "Cannot determine which vector holds the"
7811 " final result.\n");
7812 return false;
7813 }
7814 }
7815
7816 if (!vec_stmt)
7817 {
7818 /* No transformation required. */
7819 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7820 {
7821 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7822 OPTIMIZE_FOR_SPEED))
7823 {
7824 if (dump_enabled_p ())
7825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7826 "can't use a fully-masked loop because "
7827 "the target doesn't support extract last "
7828 "reduction.\n");
7829 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7830 }
7831 else if (slp_node)
7832 {
7833 if (dump_enabled_p ())
7834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7835 "can't use a fully-masked loop because an "
7836 "SLP statement is live after the loop.\n");
7837 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7838 }
7839 else if (ncopies > 1)
7840 {
7841 if (dump_enabled_p ())
7842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7843 "can't use a fully-masked loop because"
7844 " ncopies is greater than 1.\n");
7845 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7846 }
7847 else
7848 {
7849 gcc_assert (ncopies == 1 && !slp_node);
7850 vect_record_loop_mask (loop_vinfo,
7851 &LOOP_VINFO_MASKS (loop_vinfo),
7852 1, vectype);
7853 }
7854 }
7855 return true;
7856 }
7857
7858 /* Use the lhs of the original scalar statement. */
7859 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7860
7861 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7862 : gimple_get_lhs (stmt);
7863 lhs_type = TREE_TYPE (lhs);
7864
7865 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7866 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7867 : TYPE_SIZE (TREE_TYPE (vectype)));
7868 vec_bitsize = TYPE_SIZE (vectype);
7869
7870 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7871 tree vec_lhs, bitstart;
7872 if (slp_node)
7873 {
7874 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7875
7876 /* Get the correct slp vectorized stmt. */
7877 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7878 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7879 vec_lhs = gimple_phi_result (phi);
7880 else
7881 vec_lhs = gimple_get_lhs (vec_stmt);
7882
7883 /* Get entry to use. */
7884 bitstart = bitsize_int (vec_index);
7885 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7886 }
7887 else
7888 {
7889 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7890 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7891 gcc_checking_assert (ncopies == 1
7892 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7893
7894 /* For multiple copies, get the last copy. */
7895 for (int i = 1; i < ncopies; ++i)
7896 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7897
7898 /* Get the last lane in the vector. */
7899 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7900 }
7901
7902 gimple_seq stmts = NULL;
7903 tree new_tree;
7904 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7905 {
7906 /* Emit:
7907
7908 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7909
7910 where VEC_LHS is the vectorized live-out result and MASK is
7911 the loop mask for the final iteration. */
7912 gcc_assert (ncopies == 1 && !slp_node);
7913 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7914 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7915 1, vectype, 0);
7916 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7917 scalar_type, mask, vec_lhs);
7918
7919 /* Convert the extracted vector element to the required scalar type. */
7920 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7921 }
7922 else
7923 {
7924 tree bftype = TREE_TYPE (vectype);
7925 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7926 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7927 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7928 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7929 &stmts, true, NULL_TREE);
7930 }
7931
7932 if (stmts)
7933 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7934
7935 /* Replace use of lhs with newly computed result. If the use stmt is a
7936 single arg PHI, just replace all uses of the PHI result. This is necessary
7937 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7938 use_operand_p use_p;
7939 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7940 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7941 && !is_gimple_debug (use_stmt))
7942 {
7943 if (gimple_code (use_stmt) == GIMPLE_PHI
7944 && gimple_phi_num_args (use_stmt) == 1)
7945 {
7946 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7947 }
7948 else
7949 {
7950 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7951 SET_USE (use_p, new_tree);
7952 }
7953 update_stmt (use_stmt);
7954 }
7955
7956 return true;
7957 }
7958
7959 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7960
7961 static void
7962 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7963 {
7964 ssa_op_iter op_iter;
7965 imm_use_iterator imm_iter;
7966 def_operand_p def_p;
7967 gimple *ustmt;
7968
7969 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7970 {
7971 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7972 {
7973 basic_block bb;
7974
7975 if (!is_gimple_debug (ustmt))
7976 continue;
7977
7978 bb = gimple_bb (ustmt);
7979
7980 if (!flow_bb_inside_loop_p (loop, bb))
7981 {
7982 if (gimple_debug_bind_p (ustmt))
7983 {
7984 if (dump_enabled_p ())
7985 dump_printf_loc (MSG_NOTE, vect_location,
7986 "killing debug use\n");
7987
7988 gimple_debug_bind_reset_value (ustmt);
7989 update_stmt (ustmt);
7990 }
7991 else
7992 gcc_unreachable ();
7993 }
7994 }
7995 }
7996 }
7997
7998 /* Given loop represented by LOOP_VINFO, return true if computation of
7999 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8000 otherwise. */
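
/* For instance, if LOOP_VINFO_NITERS has a 32-bit unsigned type and
 LOOP_VINFO_NITERSM1 is 0xffffffff, then NITERSM1 + 1 wraps to 0, so this
 function must return false in that case. */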
8001
8002 static bool
8003 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8004 {
8005 /* Constant case. */
8006 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8007 {
8008 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8009 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8010
8011 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8012 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8013 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8014 return true;
8015 }
8016
8017 widest_int max;
8018 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8019 /* Check the upper bound of loop niters. */
8020 if (get_max_loop_iterations (loop, &max))
8021 {
8022 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8023 signop sgn = TYPE_SIGN (type);
8024 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8025 if (max < type_max)
8026 return true;
8027 }
8028 return false;
8029 }
8030
8031 /* Return a mask type with half the number of elements as TYPE. */
8032
8033 tree
8034 vect_halve_mask_nunits (tree type)
8035 {
8036 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8037 return build_truth_vector_type (nunits, current_vector_size);
8038 }
8039
8040 /* Return a mask type with twice as many elements as TYPE. */
8041
8042 tree
8043 vect_double_mask_nunits (tree type)
8044 {
8045 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8046 return build_truth_vector_type (nunits, current_vector_size);
8047 }
8048
8049 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8050 contain a sequence of NVECTORS masks that each control a vector of type
8051 VECTYPE. */
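
/* For example (illustrative only): with a vectorization factor of 16, a call
 with NVECTORS == 2 and a 16-element VECTYPE describes an rgroup whose masks
 each control 16 elements and whose max_nscalars_per_iter becomes at least
 2 * 16 / 16 = 2, i.e. two scalars of this size are accessed per scalar
 iteration. */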
8052
8053 void
8054 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8055 unsigned int nvectors, tree vectype)
8056 {
8057 gcc_assert (nvectors != 0);
8058 if (masks->length () < nvectors)
8059 masks->safe_grow_cleared (nvectors);
8060 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8061 /* The number of scalars per iteration and the number of vectors are
8062 both compile-time constants. */
8063 unsigned int nscalars_per_iter
8064 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8065 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8066 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8067 {
8068 rgm->max_nscalars_per_iter = nscalars_per_iter;
8069 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8070 }
8071 }
8072
8073 /* Given a complete set of masks MASKS, extract mask number INDEX
8074 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8075 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8076
8077 See the comment above vec_loop_masks for more details about the mask
8078 arrangement. */
8079
8080 tree
8081 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8082 unsigned int nvectors, tree vectype, unsigned int index)
8083 {
8084 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8085 tree mask_type = rgm->mask_type;
8086
8087 /* Populate the rgroup's mask array, if this is the first time we've
8088 used it. */
8089 if (rgm->masks.is_empty ())
8090 {
8091 rgm->masks.safe_grow_cleared (nvectors);
8092 for (unsigned int i = 0; i < nvectors; ++i)
8093 {
8094 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8095 /* Provide a dummy definition until the real one is available. */
8096 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8097 rgm->masks[i] = mask;
8098 }
8099 }
8100
8101 tree mask = rgm->masks[index];
8102 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8103 TYPE_VECTOR_SUBPARTS (vectype)))
8104 {
8105 /* A loop mask for data type X can be reused for data type Y
8106 if X has N times more elements than Y and if Y's elements
8107 are N times bigger than X's. In this case each sequence
8108 of N elements in the loop mask will be all-zero or all-one.
8109 We can then view-convert the mask so that each sequence of
8110 N elements is replaced by a single element. */
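 /* E.g. (a sketch) a 16-element mask computed for a vector of 16 bytes can
 be reused for a vector of 8 halfwords: each pair of byte lanes is known
 to be all-zero or all-one, so the VIEW_CONVERT_EXPR below simply
 reinterprets the 16-element boolean vector as an 8-element one. */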
8111 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8112 TYPE_VECTOR_SUBPARTS (vectype)));
8113 gimple_seq seq = NULL;
8114 mask_type = build_same_sized_truth_vector_type (vectype);
8115 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8116 if (seq)
8117 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8118 }
8119 return mask;
8120 }
8121
8122 /* Scale profiling counters by estimation for LOOP which is vectorized
8123 by factor VF. */
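
/* For example (rough numbers only): a loop whose header count is about 100
 times its preheader count (i.e. roughly 100 iterations per entry), when
 vectorized with VF == 4, gets an estimated ~24 iterations of the vector
 loop, so the header count is rescaled to roughly (24 + 1) times the
 preheader count. */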
8124
8125 static void
8126 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8127 {
8128 edge preheader = loop_preheader_edge (loop);
8129 /* Reduce loop iterations by the vectorization factor. */
8130 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8131 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8132
8133 if (freq_h.nonzero_p ())
8134 {
8135 profile_probability p;
8136
8137 /* Avoid dropping loop body profile counter to 0 because of zero count
8138 in loop's preheader. */
8139 if (!(freq_e == profile_count::zero ()))
8140 freq_e = freq_e.force_nonzero ();
8141 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8142 scale_loop_frequencies (loop, p);
8143 }
8144
8145 edge exit_e = single_exit (loop);
8146 exit_e->probability = profile_probability::always ()
8147 .apply_scale (1, new_est_niter + 1);
8148
8149 edge exit_l = single_pred_edge (loop->latch);
8150 profile_probability prob = exit_l->probability;
8151 exit_l->probability = exit_e->probability.invert ();
8152 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8153 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8154 }
8155
8156 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8157 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8158 stmt_vec_info. */
8159
8160 static void
8161 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8162 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8163 {
8164 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8165 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8166
8167 if (dump_enabled_p ())
8168 dump_printf_loc (MSG_NOTE, vect_location,
8169 "------>vectorizing statement: %G", stmt_info->stmt);
8170
8171 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8172 vect_loop_kill_debug_uses (loop, stmt_info);
8173
8174 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8175 && !STMT_VINFO_LIVE_P (stmt_info))
8176 return;
8177
8178 if (STMT_VINFO_VECTYPE (stmt_info))
8179 {
8180 poly_uint64 nunits
8181 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8182 if (!STMT_SLP_TYPE (stmt_info)
8183 && maybe_ne (nunits, vf)
8184 && dump_enabled_p ())
8185 /* For SLP, VF is set according to the unrolling factor, and not
8186 to the vector size, hence for SLP this dump message is not valid. */
8187 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8188 }
8189
8190 /* Pure SLP statements have already been vectorized. We still need
8191 to apply loop vectorization to hybrid SLP statements. */
8192 if (PURE_SLP_STMT (stmt_info))
8193 return;
8194
8195 if (dump_enabled_p ())
8196 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8197
8198 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8199 *seen_store = stmt_info;
8200 }
8201
8202 /* Function vect_transform_loop.
8203
8204 The analysis phase has determined that the loop is vectorizable.
8205 Vectorize the loop - create vectorized stmts to replace the scalar
8206 stmts in the loop, and update the loop exit condition.
8207 Returns the scalar epilogue loop, if any. */
8208
8209 struct loop *
8210 vect_transform_loop (loop_vec_info loop_vinfo)
8211 {
8212 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8213 struct loop *epilogue = NULL;
8214 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8215 int nbbs = loop->num_nodes;
8216 int i;
8217 tree niters_vector = NULL_TREE;
8218 tree step_vector = NULL_TREE;
8219 tree niters_vector_mult_vf = NULL_TREE;
8220 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8221 unsigned int lowest_vf = constant_lower_bound (vf);
8222 gimple *stmt;
8223 bool check_profitability = false;
8224 unsigned int th;
8225
8226 DUMP_VECT_SCOPE ("vec_transform_loop");
8227
8228 loop_vinfo->shared->check_datarefs ();
8229
8230 /* Use the more conservative vectorization threshold. If the number
8231 of iterations is constant, assume the cost check has been performed
8232 by our caller. If the threshold makes all loops profitable that
8233 run at least the (estimated) vectorization factor number of times,
8234 checking is pointless, too. */
8235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8236 if (th >= vect_vf_for_cost (loop_vinfo)
8237 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8238 {
8239 if (dump_enabled_p ())
8240 dump_printf_loc (MSG_NOTE, vect_location,
8241 "Profitability threshold is %d loop iterations.\n",
8242 th);
8243 check_profitability = true;
8244 }
8245
8246 /* Make sure there exists a single-predecessor exit bb. Do this before
8247 versioning. */
8248 edge e = single_exit (loop);
8249 if (! single_pred_p (e->dest))
8250 {
8251 split_loop_exit_edge (e, true);
8252 if (dump_enabled_p ())
8253 dump_printf (MSG_NOTE, "split exit edge\n");
8254 }
8255
8256 /* Version the loop first, if required, so the profitability check
8257 comes first. */
8258
8259 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8260 {
8261 poly_uint64 versioning_threshold
8262 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8263 if (check_profitability
8264 && ordered_p (poly_uint64 (th), versioning_threshold))
8265 {
8266 versioning_threshold = ordered_max (poly_uint64 (th),
8267 versioning_threshold);
8268 check_profitability = false;
8269 }
8270 vect_loop_versioning (loop_vinfo, th, check_profitability,
8271 versioning_threshold);
8272 check_profitability = false;
8273 }
8274
8275 /* Make sure there exists a single-predecessor exit bb also on the
8276 scalar loop copy. Do this after versioning but before peeling
8277 so the CFG structure is fine for both the scalar and the if-converted
8278 loop, and so that slpeel_duplicate_current_defs_from_edges sees matched
8279 loop-closed PHI nodes on the exit. */
8280 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8281 {
8282 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8283 if (! single_pred_p (e->dest))
8284 {
8285 split_loop_exit_edge (e, true);
8286 if (dump_enabled_p ())
8287 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8288 }
8289 }
8290
8291 tree niters = vect_build_loop_niters (loop_vinfo);
8292 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8293 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8294 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8295 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8296 &step_vector, &niters_vector_mult_vf, th,
8297 check_profitability, niters_no_overflow);
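  /* vect_do_peeling creates the prologue and/or epilogue loops as required
     and may already have computed NITERS_VECTOR; the code below only handles
     the case in which it has not.  */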
8298
8299 if (niters_vector == NULL_TREE)
8300 {
8301 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8302 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8303 && known_eq (lowest_vf, vf))
8304 {
8305 niters_vector
8306 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8307 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8308 step_vector = build_one_cst (TREE_TYPE (niters));
8309 }
8310 else
8311 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8312 &step_vector, niters_no_overflow);
8313 }
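  /* Illustrative example: with a compile-time NITERS of 17 and a constant
     VF of 4 (and no full masking), NITERS_VECTOR becomes 17 / 4 = 4 with a
     STEP_VECTOR of 1, i.e. the vector loop runs four times and the single
     remaining scalar iteration is left to the epilogue.  */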
8314
8315 /* 1) Make sure the loop header has exactly two entries
8316 2) Make sure we have a preheader basic block. */
8317
8318 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8319
8320 split_edge (loop_preheader_edge (loop));
8321
8322 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8323 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8324 /* This will deal with any possible peeling. */
8325 vect_prepare_for_masked_peels (loop_vinfo);
8326
8327 /* Schedule the SLP instances first, then handle loop vectorization
8328 below. */
8329 if (!loop_vinfo->slp_instances.is_empty ())
8330 {
8331 DUMP_VECT_SCOPE ("scheduling SLP instances");
8332 vect_schedule_slp (loop_vinfo);
8333 }
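  /* Statements that are pure SLP have now been vectorized; the loop-based
     code below still has to handle hybrid SLP and non-SLP statements
     (vect_transform_loop_stmt skips pure SLP statements).  */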
8334
8335   /* FORNOW: the vectorizer supports only loops whose body consists
8336      of one basic block (header + empty latch).  When the vectorizer
8337      supports more involved loop forms, the order in which the BBs are
8338      traversed will need to be reconsidered.  */
8339
8340 for (i = 0; i < nbbs; i++)
8341 {
8342 basic_block bb = bbs[i];
8343 stmt_vec_info stmt_info;
8344
8345 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8346 gsi_next (&si))
8347 {
8348 gphi *phi = si.phi ();
8349 if (dump_enabled_p ())
8350 dump_printf_loc (MSG_NOTE, vect_location,
8351 "------>vectorizing phi: %G", phi);
8352 stmt_info = loop_vinfo->lookup_stmt (phi);
8353 if (!stmt_info)
8354 continue;
8355
8356 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8357 vect_loop_kill_debug_uses (loop, stmt_info);
8358
8359 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8360 && !STMT_VINFO_LIVE_P (stmt_info))
8361 continue;
8362
8363 if (STMT_VINFO_VECTYPE (stmt_info)
8364 && (maybe_ne
8365 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8366 && dump_enabled_p ())
8367 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8368
8369 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8370 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8371 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8372 && ! PURE_SLP_STMT (stmt_info))
8373 {
8374 if (dump_enabled_p ())
8375 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8376 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8377 }
8378 }
8379
8380 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8381 !gsi_end_p (si);)
8382 {
8383 stmt = gsi_stmt (si);
8384 /* During vectorization remove existing clobber stmts. */
8385 if (gimple_clobber_p (stmt))
8386 {
8387 unlink_stmt_vdef (stmt);
8388 gsi_remove (&si, true);
8389 release_defs (stmt);
8390 }
8391 else
8392 {
8393 stmt_info = loop_vinfo->lookup_stmt (stmt);
8394
8395 /* vector stmts created in the outer-loop during vectorization of
8396 stmts in an inner-loop may not have a stmt_info, and do not
8397 need to be vectorized. */
8398 stmt_vec_info seen_store = NULL;
8399 if (stmt_info)
8400 {
8401 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8402 {
8403 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8404 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8405 !gsi_end_p (subsi); gsi_next (&subsi))
8406 {
8407 stmt_vec_info pat_stmt_info
8408 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8409 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8410 &si, &seen_store);
8411 }
8412 stmt_vec_info pat_stmt_info
8413 = STMT_VINFO_RELATED_STMT (stmt_info);
8414 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8415 &seen_store);
8416 }
8417 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8418 &seen_store);
8419 }
8420 gsi_next (&si);
8421 if (seen_store)
8422 {
8423 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8424 		    /* Interleaving.  The vectorization of the
8425 		       interleaving chain was completed; free all
8426 		       the stores in the chain.  */
8427 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8428 else
8429 /* Free the attached stmt_vec_info and remove the stmt. */
8430 loop_vinfo->remove_stmt (stmt_info);
8431 }
8432 }
8433 }
8434
8435 /* Stub out scalar statements that must not survive vectorization.
8436 Doing this here helps with grouped statements, or statements that
8437 are involved in patterns. */
8438 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8439 !gsi_end_p (gsi); gsi_next (&gsi))
8440 {
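	  /* An IFN_MASK_LOAD call whose LHS is still scalar at this point is
	     a leftover scalar stmt that must not reach expansion
	     (IFN_MASK_LOAD has no scalar expansion); turn it into LHS = 0 so
	     the IL stays valid until the dead code is cleaned up.  */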
8441 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8442 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8443 {
8444 tree lhs = gimple_get_lhs (call);
8445 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8446 {
8447 tree zero = build_zero_cst (TREE_TYPE (lhs));
8448 gimple *new_stmt = gimple_build_assign (lhs, zero);
8449 gsi_replace (&gsi, new_stmt, true);
8450 }
8451 }
8452 }
8453 } /* BBs in loop */
8454
8455   /* The vectorization factor is always > 1, so if we use an IV increment
8456      of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8457 if (integer_onep (step_vector))
8458 niters_no_overflow = true;
8459 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8460 niters_vector_mult_vf, !niters_no_overflow);
8461
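  /* The vector loop executes roughly NITERS / ASSUMED_VF iterations, so
     rescale the loop body's profile accordingly.  */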
8462 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8463 scale_profile_for_vect_loop (loop, assumed_vf);
8464
8465 /* True if the final iteration might not handle a full vector's
8466 worth of scalar iterations. */
8467 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8468 /* The minimum number of iterations performed by the epilogue. This
8469 is 1 when peeling for gaps because we always need a final scalar
8470 iteration. */
8471 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8472 /* +1 to convert latch counts to loop iteration counts,
8473 -min_epilogue_iters to remove iterations that cannot be performed
8474 by the vector code. */
8475 int bias_for_lowest = 1 - min_epilogue_iters;
8476 int bias_for_assumed = bias_for_lowest;
8477 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8478 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8479 {
8480 /* When the amount of peeling is known at compile time, the first
8481 iteration will have exactly alignment_npeels active elements.
8482 In the worst case it will have at least one. */
8483 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8484 bias_for_lowest += lowest_vf - min_first_active;
8485 bias_for_assumed += assumed_vf - min_first_active;
8486 }
8487 /* In these calculations the "- 1" converts loop iteration counts
8488 back to latch counts. */
8489 if (loop->any_upper_bound)
8490 loop->nb_iterations_upper_bound
8491 = (final_iter_may_be_partial
8492 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8493 lowest_vf) - 1
8494 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8495 lowest_vf) - 1);
8496 if (loop->any_likely_upper_bound)
8497 loop->nb_iterations_likely_upper_bound
8498 = (final_iter_may_be_partial
8499 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8500 + bias_for_lowest, lowest_vf) - 1
8501 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8502 + bias_for_lowest, lowest_vf) - 1);
8503 if (loop->any_estimate)
8504 loop->nb_iterations_estimate
8505 = (final_iter_may_be_partial
8506 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8507 assumed_vf) - 1
8508 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8509 assumed_vf) - 1);
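  /* Illustrative example: with LOWEST_VF == 4, no peeling for gaps and no
     full masking (so BIAS_FOR_LOWEST == 1), a latch-count upper bound of 18
     (at most 19 scalar iterations) becomes floor ((18 + 1) / 4) - 1 == 3,
     i.e. at most four vector iterations.  */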
8510
8511 if (dump_enabled_p ())
8512 {
8513 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8514 {
8515 dump_printf_loc (MSG_NOTE, vect_location,
8516 "LOOP VECTORIZED\n");
8517 if (loop->inner)
8518 dump_printf_loc (MSG_NOTE, vect_location,
8519 "OUTER LOOP VECTORIZED\n");
8520 dump_printf (MSG_NOTE, "\n");
8521 }
8522 else
8523 {
8524 dump_printf_loc (MSG_NOTE, vect_location,
8525 "LOOP EPILOGUE VECTORIZED (VS=");
8526 dump_dec (MSG_NOTE, current_vector_size);
8527 dump_printf (MSG_NOTE, ")\n");
8528 }
8529 }
8530
8531 /* Free SLP instances here because otherwise stmt reference counting
8532 won't work. */
8533 slp_instance instance;
8534 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8535 vect_free_slp_instance (instance, true);
8536 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8537   /* Clear the safelen field since its value is invalid after vectorization:
8538      the vectorized loop can have loop-carried dependencies.  */
8539 loop->safelen = 0;
8540
8541 /* Don't vectorize epilogue for epilogue. */
8542 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8543 epilogue = NULL;
8544
8545 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8546 epilogue = NULL;
8547
8548 if (epilogue)
8549 {
8550 auto_vector_sizes vector_sizes;
8551 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8552 unsigned int next_size = 0;
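      /* Find the first (largest) vector size that could still be useful for
	 an epilogue loop; if none qualifies, the epilogue is dropped below.
	 Illustrative example: if the target offers 64-, 32- and 16-byte
	 vectors and the main loop used 32-byte vectors with LOWEST_VF == 8
	 and EITERS == 5, a same-size epilogue would need at least 8 leftover
	 iterations, but a 16-byte one (ratio == 2) needs only 8 / 2 == 4,
	 so NEXT_SIZE settles on the 16-byte entry.  */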
8553
8554 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8555 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8556 && known_eq (vf, lowest_vf))
8557 {
8558 unsigned int eiters
8559 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8560 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8561 eiters = eiters % lowest_vf;
8562 epilogue->nb_iterations_upper_bound = eiters - 1;
8563
8564 unsigned int ratio;
8565 while (next_size < vector_sizes.length ()
8566 && !(constant_multiple_p (current_vector_size,
8567 vector_sizes[next_size], &ratio)
8568 && eiters >= lowest_vf / ratio))
8569 next_size += 1;
8570 }
8571 else
8572 while (next_size < vector_sizes.length ()
8573 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8574 next_size += 1;
8575
8576 if (next_size == vector_sizes.length ())
8577 epilogue = NULL;
8578 }
8579
8580 if (epilogue)
8581 {
8582 epilogue->force_vectorize = loop->force_vectorize;
8583 epilogue->safelen = loop->safelen;
8584 epilogue->dont_vectorize = false;
8585
8586 /* We may need to if-convert epilogue to vectorize it. */
8587 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8588 tree_if_conversion (epilogue);
8589 }
8590
8591 return epilogue;
8592 }
8593
8594 /* The code below performs a simple optimization - it reverts
8595    if-conversion for masked stores: if the mask of a store is zero, do not
8596    perform the store, and, if possible, do not execute the producers of
8597    the stored values either.  For example,
8598 for (i=0; i<n; i++)
8599 if (c[i])
8600 {
8601 p1[i] += 1;
8602        p2[i] = p3[i] + 2;
8603 }
8604 this transformation will produce the following semi-hammock:
8605
8606      if (!(mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }))
8607 {
8608 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8609 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8610 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8611 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8612 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8613 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8614 }
8615 */
8616
8617 void
8618 optimize_mask_stores (struct loop *loop)
8619 {
8620 basic_block *bbs = get_loop_body (loop);
8621 unsigned nbbs = loop->num_nodes;
8622 unsigned i;
8623 basic_block bb;
8624 struct loop *bb_loop;
8625 gimple_stmt_iterator gsi;
8626 gimple *stmt;
8627 auto_vec<gimple *> worklist;
8628
8629 vect_location = find_loop_location (loop);
8630 /* Pick up all masked stores in loop if any. */
8631 for (i = 0; i < nbbs; i++)
8632 {
8633 bb = bbs[i];
8634 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8635 gsi_next (&gsi))
8636 {
8637 stmt = gsi_stmt (gsi);
8638 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8639 worklist.safe_push (stmt);
8640 }
8641 }
8642
8643 free (bbs);
8644 if (worklist.is_empty ())
8645 return;
8646
8647 /* Loop has masked stores. */
8648 while (!worklist.is_empty ())
8649 {
8650 gimple *last, *last_store;
8651 edge e, efalse;
8652 tree mask;
8653 basic_block store_bb, join_bb;
8654 gimple_stmt_iterator gsi_to;
8655 tree vdef, new_vdef;
8656 gphi *phi;
8657 tree vectype;
8658 tree zero;
8659
8660 last = worklist.pop ();
8661 mask = gimple_call_arg (last, 2);
8662 bb = gimple_bb (last);
8663       /* Create STORE_BB and an if-then structure in the CFG; STORE_BB
8664 	 belongs to the same loop as BB.  That loop can differ from LOOP
8665 	 when a two-level loop nest is vectorized and the masked store
8666 	 belongs to the inner loop.  */
8667 e = split_block (bb, last);
8668 bb_loop = bb->loop_father;
8669 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8670 join_bb = e->dest;
8671 store_bb = create_empty_bb (bb);
8672 add_bb_to_loop (store_bb, bb_loop);
8673 e->flags = EDGE_TRUE_VALUE;
8674 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8675 /* Put STORE_BB to likely part. */
8676 efalse->probability = profile_probability::unlikely ();
8677 store_bb->count = efalse->count ();
8678 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8679 if (dom_info_available_p (CDI_DOMINATORS))
8680 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8681 if (dump_enabled_p ())
8682 dump_printf_loc (MSG_NOTE, vect_location,
8683 "Create new block %d to sink mask stores.",
8684 store_bb->index);
8685 /* Create vector comparison with boolean result. */
8686 vectype = TREE_TYPE (mask);
8687 zero = build_zero_cst (vectype);
8688 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8689 gsi = gsi_last_bb (bb);
8690 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8691 /* Create new PHI node for vdef of the last masked store:
8692 .MEM_2 = VDEF <.MEM_1>
8693 will be converted to
8694 .MEM.3 = VDEF <.MEM_1>
8695 and new PHI node will be created in join bb
8696 .MEM_2 = PHI <.MEM_1, .MEM_3>
8697 */
8698 vdef = gimple_vdef (last);
8699 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8700 gimple_set_vdef (last, new_vdef);
8701 phi = create_phi_node (vdef, join_bb);
8702 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8703
8704 /* Put all masked stores with the same mask to STORE_BB if possible. */
8705 while (true)
8706 {
8707 gimple_stmt_iterator gsi_from;
8708 gimple *stmt1 = NULL;
8709
8710 /* Move masked store to STORE_BB. */
8711 last_store = last;
8712 gsi = gsi_for_stmt (last);
8713 gsi_from = gsi;
8714 /* Shift GSI to the previous stmt for further traversal. */
8715 gsi_prev (&gsi);
8716 gsi_to = gsi_start_bb (store_bb);
8717 gsi_move_before (&gsi_from, &gsi_to);
8718 	  /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
8719 gsi_to = gsi_start_bb (store_bb);
8720 if (dump_enabled_p ())
8721 dump_printf_loc (MSG_NOTE, vect_location,
8722 "Move stmt to created bb\n%G", last);
8723 /* Move all stored value producers if possible. */
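	  /* A producer can be moved only if it does not write to memory, has
	     no volatile operands, defines an SSA_NAME whose uses are all in
	     STORE_BB, and has the same VUSE as the store; dead scalar
	     (non-vector) definitions are removed instead.  */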
8724 while (!gsi_end_p (gsi))
8725 {
8726 tree lhs;
8727 imm_use_iterator imm_iter;
8728 use_operand_p use_p;
8729 bool res;
8730
8731 /* Skip debug statements. */
8732 if (is_gimple_debug (gsi_stmt (gsi)))
8733 {
8734 gsi_prev (&gsi);
8735 continue;
8736 }
8737 stmt1 = gsi_stmt (gsi);
8738 	      /* Do not consider statements writing to memory or having
8739 		 a volatile operand.  */
8740 if (gimple_vdef (stmt1)
8741 || gimple_has_volatile_ops (stmt1))
8742 break;
8743 gsi_from = gsi;
8744 gsi_prev (&gsi);
8745 lhs = gimple_get_lhs (stmt1);
8746 if (!lhs)
8747 break;
8748
8749 /* LHS of vectorized stmt must be SSA_NAME. */
8750 if (TREE_CODE (lhs) != SSA_NAME)
8751 break;
8752
8753 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8754 {
8755 /* Remove dead scalar statement. */
8756 if (has_zero_uses (lhs))
8757 {
8758 gsi_remove (&gsi_from, true);
8759 continue;
8760 }
8761 }
8762
8763 /* Check that LHS does not have uses outside of STORE_BB. */
8764 res = true;
8765 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8766 {
8767 gimple *use_stmt;
8768 use_stmt = USE_STMT (use_p);
8769 if (is_gimple_debug (use_stmt))
8770 continue;
8771 if (gimple_bb (use_stmt) != store_bb)
8772 {
8773 res = false;
8774 break;
8775 }
8776 }
8777 if (!res)
8778 break;
8779
8780 if (gimple_vuse (stmt1)
8781 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8782 break;
8783
8784 /* Can move STMT1 to STORE_BB. */
8785 if (dump_enabled_p ())
8786 dump_printf_loc (MSG_NOTE, vect_location,
8787 "Move stmt to created bb\n%G", stmt1);
8788 gsi_move_before (&gsi_from, &gsi_to);
8789 /* Shift GSI_TO for further insertion. */
8790 gsi_prev (&gsi_to);
8791 }
8792 /* Put other masked stores with the same mask to STORE_BB. */
8793 if (worklist.is_empty ()
8794 || gimple_call_arg (worklist.last (), 2) != mask
8795 || worklist.last () != stmt1)
8796 break;
8797 last = worklist.pop ();
8798 }
8799 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8800 }
8801 }