1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
43
44 /* Loop Vectorization Pass.
45
46 This pass tries to vectorize loops.
47
48 For example, the vectorizer transforms the following simple loop:
49
50 short a[N]; short b[N]; short c[N]; int i;
51
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
54 }
55
56 as if it were manually vectorized by rewriting the source code into:
57
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
62
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
68 }
69
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
78 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
81
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
87
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
92
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
103
104 For example, say stmt S1 was vectorized into stmt VS1:
105
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
109
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
114
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
119
120 Operands that are not SSA_NAMEs are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
122
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127 Targets that can support different sizes of vectors will, for now, need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
130
131 Since we only vectorize operations whose vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
137
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
140 */
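
/* As a minimal illustration of the optab query described above (the real
   checks live in the vectorizable_* routines of tree-vect-stmts.c), support
   for a V8HI addition could be probed roughly as follows, where 'vectype'
   stands for the V8HI vector type:

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!op || optab_handler (op, V8HImode) == CODE_FOR_nothing)
       return false;

   i.e. the operation is only considered vectorizable if the target provides
   an insn pattern for it in the chosen vector mode.  */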
141
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4-byte elements
149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support only a single vector size per loop. Therefore this
153 function sets VF according to the smallest scalar type operated upon (which
154 yields the largest number of elements per vector), and fails if a single
155 statement would require vector types of different sizes.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
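
/* For instance (an illustrative example, assuming a 16-byte vector target),
   consider a loop that mixes element sizes:

     short s[N]; int d[N];
     for (i=0; i<N; i++)
       d[i] = s[i];

   The short accesses use a V8HI vector type (8 elements) and the int
   accesses a V4SI vector type (4 elements); both are 16 bytes, so no
   statement is rejected, and the VF is taken from the smallest scalar
   type: VF = 8.  The int statements are then emitted as two V4SI copies
   per vector iteration during the transformation phase.  */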
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
188
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
192
193 for (i = 0; i < nbbs; i++)
194 {
195 basic_block bb = bbs[i];
196
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
198 {
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
202 {
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
205 }
206
207 gcc_assert (stmt_info);
208
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
210 {
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
213
214 if (dump_enabled_p ())
215 {
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
219 }
220
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
223 {
224 if (dump_enabled_p ())
225 {
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
231 }
232 return false;
233 }
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
235
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
240 }
241
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
245
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
249 }
250 }
251
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
253 {
254 tree vf_vectype;
255
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
260
261 stmt_info = vinfo_for_stmt (stmt);
262
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
268 }
269
270 gcc_assert (stmt_info);
271
272 /* Skip stmts which do not need to be vectorized. */
273 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
275 || gimple_clobber_p (stmt))
276 {
277 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
278 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
279 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
280 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
281 {
282 stmt = pattern_stmt;
283 stmt_info = vinfo_for_stmt (pattern_stmt);
284 if (dump_enabled_p ())
285 {
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining pattern statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
289 }
290 }
291 else
292 {
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
295 gsi_next (&si);
296 continue;
297 }
298 }
299 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
300 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
301 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
302 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 analyze_pattern_stmt = true;
304
305 /* If a pattern statement has def stmts, analyze them too. */
306 if (is_pattern_stmt_p (stmt_info))
307 {
308 if (pattern_def_seq == NULL)
309 {
310 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
311 pattern_def_si = gsi_start (pattern_def_seq);
312 }
313 else if (!gsi_end_p (pattern_def_si))
314 gsi_next (&pattern_def_si);
315 if (pattern_def_seq != NULL)
316 {
317 gimple pattern_def_stmt = NULL;
318 stmt_vec_info pattern_def_stmt_info = NULL;
319
320 while (!gsi_end_p (pattern_def_si))
321 {
322 pattern_def_stmt = gsi_stmt (pattern_def_si);
323 pattern_def_stmt_info
324 = vinfo_for_stmt (pattern_def_stmt);
325 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
326 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
327 break;
328 gsi_next (&pattern_def_si);
329 }
330
331 if (!gsi_end_p (pattern_def_si))
332 {
333 if (dump_enabled_p ())
334 {
335 dump_printf_loc (MSG_NOTE, vect_location,
336 "==> examining pattern def stmt: ");
337 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
338 pattern_def_stmt, 0);
339 }
340
341 stmt = pattern_def_stmt;
342 stmt_info = pattern_def_stmt_info;
343 }
344 else
345 {
346 pattern_def_si = gsi_none ();
347 analyze_pattern_stmt = false;
348 }
349 }
350 else
351 analyze_pattern_stmt = false;
352 }
353
354 if (gimple_get_lhs (stmt) == NULL_TREE)
355 {
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "not vectorized: irregular stmt.");
360 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
361 0);
362 }
363 return false;
364 }
365
366 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
367 {
368 if (dump_enabled_p ())
369 {
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "not vectorized: vector stmt in loop:");
372 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
373 }
374 return false;
375 }
376
377 if (STMT_VINFO_VECTYPE (stmt_info))
378 {
379 /* The only case when a vectype had been already set is for stmts
380 that contain a dataref, or for "pattern-stmts" (stmts
381 generated by the vectorizer to represent/replace a certain
382 idiom). */
383 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
384 || is_pattern_stmt_p (stmt_info)
385 || !gsi_end_p (pattern_def_si));
386 vectype = STMT_VINFO_VECTYPE (stmt_info);
387 }
388 else
389 {
390 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
391 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
392 if (dump_enabled_p ())
393 {
394 dump_printf_loc (MSG_NOTE, vect_location,
395 "get vectype for scalar type: ");
396 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
397 }
398 vectype = get_vectype_for_scalar_type (scalar_type);
399 if (!vectype)
400 {
401 if (dump_enabled_p ())
402 {
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "not vectorized: unsupported "
405 "data-type ");
406 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
407 scalar_type);
408 }
409 return false;
410 }
411
412 STMT_VINFO_VECTYPE (stmt_info) = vectype;
413
414 if (dump_enabled_p ())
415 {
416 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
417 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
418 }
419 }
420
421 /* The vectorization factor is according to the smallest
422 scalar type (or the largest vector size, but we only
423 support one vector size per loop). */
424 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
425 &dummy);
426 if (dump_enabled_p ())
427 {
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "get vectype for scalar type: ");
430 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
431 }
432 vf_vectype = get_vectype_for_scalar_type (scalar_type);
433 if (!vf_vectype)
434 {
435 if (dump_enabled_p ())
436 {
437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
438 "not vectorized: unsupported data-type ");
439 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
440 scalar_type);
441 }
442 return false;
443 }
444
445 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
446 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
447 {
448 if (dump_enabled_p ())
449 {
450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
451 "not vectorized: different sized vector "
452 "types in statement, ");
453 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
454 vectype);
455 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
456 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
457 vf_vectype);
458 }
459 return false;
460 }
461
462 if (dump_enabled_p ())
463 {
464 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
465 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
466 }
467
468 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
471 if (!vectorization_factor
472 || (nunits > vectorization_factor))
473 vectorization_factor = nunits;
474
475 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
476 {
477 pattern_def_seq = NULL;
478 gsi_next (&si);
479 }
480 }
481 }
482
483 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
486 vectorization_factor);
487 if (vectorization_factor <= 1)
488 {
489 if (dump_enabled_p ())
490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
491 "not vectorized: unsupported data-type");
492 return false;
493 }
494 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
495
496 return true;
497 }
498
499
500 /* Function vect_is_simple_iv_evolution.
501
502 FORNOW: A simple evolution of an induction variable in the loop is
503 considered a polynomial evolution. */
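
/* For example (illustrative), for the induction variable

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 4;

   scev computes the access function {0, +, 4}_loop, so *INIT is set to 0
   and *STEP to 4.  A step that is itself a chrec (a polynomial of degree
   >= 2), or a non-constant step defined inside the loop, is rejected
   below.  */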
504
505 static bool
506 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
507 tree * step)
508 {
509 tree init_expr;
510 tree step_expr;
511 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
512 basic_block bb;
513
514 /* When there is no evolution in this loop, the evolution function
515 is not "simple". */
516 if (evolution_part == NULL_TREE)
517 return false;
518
519 /* When the evolution is a polynomial of degree >= 2
520 the evolution function is not "simple". */
521 if (tree_is_chrec (evolution_part))
522 return false;
523
524 step_expr = evolution_part;
525 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
526
527 if (dump_enabled_p ())
528 {
529 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
531 dump_printf (MSG_NOTE, ", init: ");
532 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
533 }
534
535 *init = init_expr;
536 *step = step_expr;
537
538 if (TREE_CODE (step_expr) != INTEGER_CST
539 && (TREE_CODE (step_expr) != SSA_NAME
540 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
541 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
542 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
543 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
544 || !flag_associative_math)))
545 && (TREE_CODE (step_expr) != REAL_CST
546 || !flag_associative_math))
547 {
548 if (dump_enabled_p ())
549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
550 "step unknown.");
551 return false;
552 }
553
554 return true;
555 }
556
557 /* Function vect_analyze_scalar_cycles_1.
558
559 Examine the cross iteration def-use cycles of scalar variables
560 in LOOP. LOOP_VINFO represents the loop that is now being
561 considered for vectorization (can be LOOP, or an outer-loop
562 enclosing LOOP). */
563
564 static void
565 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
566 {
567 basic_block bb = loop->header;
568 tree init, step;
569 vec<gimple> worklist;
570 worklist.create (64);
571 gimple_stmt_iterator gsi;
572 bool double_reduc;
573
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_NOTE, vect_location,
576 "=== vect_analyze_scalar_cycles ===");
577
578 /* First - identify all inductions. Reduction detection assumes that all the
579 inductions have been identified, therefore, this order must not be
580 changed. */
581 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
582 {
583 gimple phi = gsi_stmt (gsi);
584 tree access_fn = NULL;
585 tree def = PHI_RESULT (phi);
586 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
587
588 if (dump_enabled_p ())
589 {
590 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
591 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
592 }
593
594 /* Skip virtual phi's. The data dependences that are associated with
595 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
596 if (virtual_operand_p (def))
597 continue;
598
599 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
600
601 /* Analyze the evolution function. */
602 access_fn = analyze_scalar_evolution (loop, def);
603 if (access_fn)
604 {
605 STRIP_NOPS (access_fn);
606 if (dump_enabled_p ())
607 {
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "Access function of PHI: ");
610 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
611 }
612 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
613 = evolution_part_in_loop_num (access_fn, loop->num);
614 }
615
616 if (!access_fn
617 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
618 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
619 && TREE_CODE (step) != INTEGER_CST))
620 {
621 worklist.safe_push (phi);
622 continue;
623 }
624
625 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
626
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
629 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
630 }
631
632
633 /* Second - identify all reductions and nested cycles. */
634 while (worklist.length () > 0)
635 {
636 gimple phi = worklist.pop ();
637 tree def = PHI_RESULT (phi);
638 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
639 gimple reduc_stmt;
640 bool nested_cycle;
641
642 if (dump_enabled_p ())
643 {
644 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
645 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
646 }
647
648 gcc_assert (!virtual_operand_p (def)
649 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
650
651 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
652 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
653 &double_reduc);
654 if (reduc_stmt)
655 {
656 if (double_reduc)
657 {
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected double reduction.");
661
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_double_reduction_def;
665 }
666 else
667 {
668 if (nested_cycle)
669 {
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "Detected vectorizable nested cycle.");
673
674 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
675 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
676 vect_nested_cycle;
677 }
678 else
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected reduction.");
683
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
685 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
686 vect_reduction_def;
687 /* Store the reduction cycles for possible vectorization in
688 loop-aware SLP. */
689 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
690 }
691 }
692 }
693 else
694 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "Unknown def-use cycle pattern.");
697 }
698
699 worklist.release ();
700 }
701
702
703 /* Function vect_analyze_scalar_cycles.
704
705 Examine the cross iteration def-use cycles of scalar variables, by
706 analyzing the loop-header PHIs of scalar variables. Classify each
707 cycle as one of the following: invariant, induction, reduction, unknown.
708 We do that for the loop represented by LOOP_VINFO, and also for its
709 inner-loop, if it exists.
710 Examples for scalar cycles:
711
712 Example1: reduction:
713
714 loop1:
715 for (i=0; i<N; i++)
716 sum += a[i];
717
718 Example2: induction:
719
720 loop2:
721 for (i=0; i<N; i++)
722 a[i] = i; */
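
/* A further example (illustrative): when the outer loop of

     for (i=0; i<N; i++)
       for (j=0; j<M; j++)
         sum += a[i][j];

   is considered for vectorization, the cycle of 'sum' is detected as a
   double reduction, and vect_analyze_scalar_cycles_1 above marks both the
   outer-loop header phi and the related inner-loop phi as
   vect_double_reduction_def.  */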
723
724 static void
725 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
726 {
727 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
728
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
730
731 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
732 Reductions in such an inner-loop therefore have different properties than
733 the reductions in the nest that gets vectorized:
734 1. When vectorized, they are executed in the same order as in the original
735 scalar loop, so we can't change the order of computation when
736 vectorizing them.
737 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
738 current checks are too strict. */
739
740 if (loop->inner)
741 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
742 }
743
744 /* Function vect_get_loop_niters.
745
746 Determine how many iterations the loop is executed.
747 If an expression that represents the number of iterations
748 can be constructed, place it in NUMBER_OF_ITERATIONS.
749 Return the loop exit condition. */
750
751 static gimple
752 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
753 {
754 tree niters;
755
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_NOTE, vect_location,
758 "=== get_loop_niters ===");
759 niters = number_of_exit_cond_executions (loop);
760
761 if (niters != NULL_TREE
762 && niters != chrec_dont_know)
763 {
764 *number_of_iterations = niters;
765
766 if (dump_enabled_p ())
767 {
768 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
769 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
770 }
771 }
772
773 return get_loop_exit_condition (loop);
774 }
775
776
777 /* Function bb_in_loop_p
778
779 Used as predicate for dfs order traversal of the loop bbs. */
780
781 static bool
782 bb_in_loop_p (const_basic_block bb, const void *data)
783 {
784 const struct loop *const loop = (const struct loop *)data;
785 if (flow_bb_inside_loop_p (loop, bb))
786 return true;
787 return false;
788 }
789
790
791 /* Function new_loop_vec_info.
792
793 Create and initialize a new loop_vec_info struct for LOOP, as well as
794 stmt_vec_info structs for all the stmts in LOOP. */
795
796 static loop_vec_info
797 new_loop_vec_info (struct loop *loop)
798 {
799 loop_vec_info res;
800 basic_block *bbs;
801 gimple_stmt_iterator si;
802 unsigned int i, nbbs;
803
804 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
805 LOOP_VINFO_LOOP (res) = loop;
806
807 bbs = get_loop_body (loop);
808
809 /* Create/Update stmt_info for all stmts in the loop. */
810 for (i = 0; i < loop->num_nodes; i++)
811 {
812 basic_block bb = bbs[i];
813
814 /* BBs in a nested inner-loop will have been already processed (because
815 we will have called vect_analyze_loop_form for any nested inner-loop).
816 Therefore, for stmts in an inner-loop we just want to update the
817 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
818 loop_info of the outer-loop we are currently considering to vectorize
819 (instead of the loop_info of the inner-loop).
820 For stmts in other BBs we need to create a stmt_info from scratch. */
821 if (bb->loop_father != loop)
822 {
823 /* Inner-loop bb. */
824 gcc_assert (loop->inner && bb->loop_father == loop->inner);
825 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
826 {
827 gimple phi = gsi_stmt (si);
828 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
829 loop_vec_info inner_loop_vinfo =
830 STMT_VINFO_LOOP_VINFO (stmt_info);
831 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
832 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
833 }
834 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
835 {
836 gimple stmt = gsi_stmt (si);
837 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
838 loop_vec_info inner_loop_vinfo =
839 STMT_VINFO_LOOP_VINFO (stmt_info);
840 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
841 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
842 }
843 }
844 else
845 {
846 /* bb in current nest. */
847 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
848 {
849 gimple phi = gsi_stmt (si);
850 gimple_set_uid (phi, 0);
851 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
852 }
853
854 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
855 {
856 gimple stmt = gsi_stmt (si);
857 gimple_set_uid (stmt, 0);
858 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
859 }
860 }
861 }
862
863 /* CHECKME: We want to visit all BBs before their successors (except for
864 latch blocks, for which this assertion wouldn't hold). In the simple
865 case of the loop forms we allow, a dfs order of the BBs would be the same
866 as reversed postorder traversal, so we are safe. */
867
868 free (bbs);
869 bbs = XCNEWVEC (basic_block, loop->num_nodes);
870 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
871 bbs, loop->num_nodes, loop);
872 gcc_assert (nbbs == loop->num_nodes);
873
874 LOOP_VINFO_BBS (res) = bbs;
875 LOOP_VINFO_NITERS (res) = NULL;
876 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
877 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
878 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
879 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
880 LOOP_VINFO_VECT_FACTOR (res) = 0;
881 LOOP_VINFO_LOOP_NEST (res).create (3);
882 LOOP_VINFO_DATAREFS (res).create (10);
883 LOOP_VINFO_DDRS (res).create (10 * 10);
884 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
885 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
886 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
887 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
888 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
889 LOOP_VINFO_GROUPED_STORES (res).create (10);
890 LOOP_VINFO_REDUCTIONS (res).create (10);
891 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
892 LOOP_VINFO_SLP_INSTANCES (res).create (10);
893 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
894 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
895 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
896 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
897
898 return res;
899 }
900
901
902 /* Function destroy_loop_vec_info.
903
904 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
905 stmts in the loop. */
906
907 void
908 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
909 {
910 struct loop *loop;
911 basic_block *bbs;
912 int nbbs;
913 gimple_stmt_iterator si;
914 int j;
915 vec<slp_instance> slp_instances;
916 slp_instance instance;
917 bool swapped;
918
919 if (!loop_vinfo)
920 return;
921
922 loop = LOOP_VINFO_LOOP (loop_vinfo);
923
924 bbs = LOOP_VINFO_BBS (loop_vinfo);
925 nbbs = clean_stmts ? loop->num_nodes : 0;
926 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
927
928 for (j = 0; j < nbbs; j++)
929 {
930 basic_block bb = bbs[j];
931 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
932 free_stmt_vec_info (gsi_stmt (si));
933
934 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
935 {
936 gimple stmt = gsi_stmt (si);
937
938 /* We may have broken canonical form by moving a constant
939 into RHS1 of a commutative op. Fix such occurrences. */
940 if (swapped && is_gimple_assign (stmt))
941 {
942 enum tree_code code = gimple_assign_rhs_code (stmt);
943
944 if ((code == PLUS_EXPR
945 || code == POINTER_PLUS_EXPR
946 || code == MULT_EXPR)
947 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
948 swap_tree_operands (stmt,
949 gimple_assign_rhs1_ptr (stmt),
950 gimple_assign_rhs2_ptr (stmt));
951 }
952
953 /* Free stmt_vec_info. */
954 free_stmt_vec_info (stmt);
955 gsi_next (&si);
956 }
957 }
958
959 free (LOOP_VINFO_BBS (loop_vinfo));
960 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
961 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
962 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
963 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
964 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
965 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
966 FOR_EACH_VEC_ELT (slp_instances, j, instance)
967 vect_free_slp_instance (instance);
968
969 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
970 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
971 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
972 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
973
974 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
975 LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
976
977 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
978
979 free (loop_vinfo);
980 loop->aux = NULL;
981 }
982
983
984 /* Function vect_analyze_loop_1.
985
986 Apply a set of analyses on LOOP, and create a loop_vec_info struct
987 for it. The different analyses will record information in the
988 loop_vec_info struct. This is a subset of the analyses applied in
989 vect_analyze_loop, to be applied on an inner-loop nested in the loop
990 that is now considered for (outer-loop) vectorization. */
991
992 static loop_vec_info
993 vect_analyze_loop_1 (struct loop *loop)
994 {
995 loop_vec_info loop_vinfo;
996
997 if (dump_enabled_p ())
998 dump_printf_loc (MSG_NOTE, vect_location,
999 "===== analyze_loop_nest_1 =====");
1000
1001 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1002
1003 loop_vinfo = vect_analyze_loop_form (loop);
1004 if (!loop_vinfo)
1005 {
1006 if (dump_enabled_p ())
1007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1008 "bad inner-loop form.");
1009 return NULL;
1010 }
1011
1012 return loop_vinfo;
1013 }
1014
1015
1016 /* Function vect_analyze_loop_form.
1017
1018 Verify that certain CFG restrictions hold, including:
1019 - the loop has a pre-header
1020 - the loop has a single entry and exit
1021 - the loop exit condition is simple enough, and the number of iterations
1022 can be analyzed (a countable loop). */
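
/* For instance (illustrative), a counted loop such as

     for (i=0; i<n; i++)
       a[i] = b[i];

   satisfies these restrictions, whereas a pointer-chasing loop like

     while (p)
       p = p->next;

   is rejected because its number of iterations cannot be computed.  */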
1023
1024 loop_vec_info
1025 vect_analyze_loop_form (struct loop *loop)
1026 {
1027 loop_vec_info loop_vinfo;
1028 gimple loop_cond;
1029 tree number_of_iterations = NULL;
1030 loop_vec_info inner_loop_vinfo = NULL;
1031
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== vect_analyze_loop_form ===");
1035
1036 /* Different restrictions apply when we are considering an inner-most loop,
1037 vs. an outer (nested) loop.
1038 (FORNOW. May want to relax some of these restrictions in the future). */
1039
1040 if (!loop->inner)
1041 {
1042 /* Inner-most loop. We currently require that the number of BBs is
1043 exactly 2 (the header and latch). Vectorizable inner-most loops
1044 look like this:
1045
1046 (pre-header)
1047 |
1048 header <--------+
1049 | | |
1050 | +--> latch --+
1051 |
1052 (exit-bb) */
1053
1054 if (loop->num_nodes != 2)
1055 {
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "not vectorized: control flow in loop.");
1059 return NULL;
1060 }
1061
1062 if (empty_block_p (loop->header))
1063 {
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "not vectorized: empty loop.");
1067 return NULL;
1068 }
1069 }
1070 else
1071 {
1072 struct loop *innerloop = loop->inner;
1073 edge entryedge;
1074
1075 /* Nested loop. We currently require that the loop is doubly-nested,
1076 contains a single inner loop, and the number of BBs is exactly 5.
1077 Vectorizable outer-loops look like this:
1078
1079 (pre-header)
1080 |
1081 header <---+
1082 | |
1083 inner-loop |
1084 | |
1085 tail ------+
1086 |
1087 (exit-bb)
1088
1089 The inner-loop has the properties expected of inner-most loops
1090 as described above. */
1091
1092 if ((loop->inner)->inner || (loop->inner)->next)
1093 {
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "not vectorized: multiple nested loops.");
1097 return NULL;
1098 }
1099
1100 /* Analyze the inner-loop. */
1101 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1102 if (!inner_loop_vinfo)
1103 {
1104 if (dump_enabled_p ())
1105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1106 "not vectorized: Bad inner loop.");
1107 return NULL;
1108 }
1109
1110 if (!expr_invariant_in_loop_p (loop,
1111 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1112 {
1113 if (dump_enabled_p ())
1114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1115 "not vectorized: inner-loop count not invariant.");
1116 destroy_loop_vec_info (inner_loop_vinfo, true);
1117 return NULL;
1118 }
1119
1120 if (loop->num_nodes != 5)
1121 {
1122 if (dump_enabled_p ())
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "not vectorized: control flow in loop.");
1125 destroy_loop_vec_info (inner_loop_vinfo, true);
1126 return NULL;
1127 }
1128
1129 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1130 entryedge = EDGE_PRED (innerloop->header, 0);
1131 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1132 entryedge = EDGE_PRED (innerloop->header, 1);
1133
1134 if (entryedge->src != loop->header
1135 || !single_exit (innerloop)
1136 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1137 {
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "not vectorized: unsupported outerloop form.");
1141 destroy_loop_vec_info (inner_loop_vinfo, true);
1142 return NULL;
1143 }
1144
1145 if (dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "Considering outer-loop vectorization.");
1148 }
1149
1150 if (!single_exit (loop)
1151 || EDGE_COUNT (loop->header->preds) != 2)
1152 {
1153 if (dump_enabled_p ())
1154 {
1155 if (!single_exit (loop))
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "not vectorized: multiple exits.");
1158 else if (EDGE_COUNT (loop->header->preds) != 2)
1159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1160 "not vectorized: too many incoming edges.");
1161 }
1162 if (inner_loop_vinfo)
1163 destroy_loop_vec_info (inner_loop_vinfo, true);
1164 return NULL;
1165 }
1166
1167 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1168 that the loop is represented as a do-while (with a proper if-guard
1169 before the loop if needed), where the loop header contains all the
1170 executable statements, and the latch is empty. */
1171 if (!empty_block_p (loop->latch)
1172 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1173 {
1174 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1176 "not vectorized: latch block not empty.");
1177 if (inner_loop_vinfo)
1178 destroy_loop_vec_info (inner_loop_vinfo, true);
1179 return NULL;
1180 }
1181
1182 /* Make sure there exists a single-predecessor exit bb: */
1183 if (!single_pred_p (single_exit (loop)->dest))
1184 {
1185 edge e = single_exit (loop);
1186 if (!(e->flags & EDGE_ABNORMAL))
1187 {
1188 split_loop_exit_edge (e);
1189 if (dump_enabled_p ())
1190 dump_printf (MSG_NOTE, "split exit edge.");
1191 }
1192 else
1193 {
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: abnormal loop exit edge.");
1197 if (inner_loop_vinfo)
1198 destroy_loop_vec_info (inner_loop_vinfo, true);
1199 return NULL;
1200 }
1201 }
1202
1203 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1204 if (!loop_cond)
1205 {
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: complicated exit condition.");
1209 if (inner_loop_vinfo)
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1212 }
1213
1214 if (!number_of_iterations)
1215 {
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: number of iterations cannot be "
1219 "computed.");
1220 if (inner_loop_vinfo)
1221 destroy_loop_vec_info (inner_loop_vinfo, true);
1222 return NULL;
1223 }
1224
1225 if (chrec_contains_undetermined (number_of_iterations))
1226 {
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "Infinite number of iterations.");
1230 if (inner_loop_vinfo)
1231 destroy_loop_vec_info (inner_loop_vinfo, true);
1232 return NULL;
1233 }
1234
1235 if (!NITERS_KNOWN_P (number_of_iterations))
1236 {
1237 if (dump_enabled_p ())
1238 {
1239 dump_printf_loc (MSG_NOTE, vect_location,
1240 "Symbolic number of iterations is ");
1241 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1242 }
1243 }
1244 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1245 {
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: number of iterations = 0.");
1249 if (inner_loop_vinfo)
1250 destroy_loop_vec_info (inner_loop_vinfo, true);
1251 return NULL;
1252 }
1253
1254 loop_vinfo = new_loop_vec_info (loop);
1255 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1256 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1257
1258 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1259
1260 /* CHECKME: May want to keep it around in the future. */
1261 if (inner_loop_vinfo)
1262 destroy_loop_vec_info (inner_loop_vinfo, false);
1263
1264 gcc_assert (!loop->aux);
1265 loop->aux = loop_vinfo;
1266 return loop_vinfo;
1267 }
1268
1269
1270 /* Function vect_analyze_loop_operations.
1271
1272 Scan the loop stmts and make sure they are all vectorizable. */
1273
1274 static bool
1275 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1276 {
1277 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1278 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1279 int nbbs = loop->num_nodes;
1280 gimple_stmt_iterator si;
1281 unsigned int vectorization_factor = 0;
1282 int i;
1283 gimple phi;
1284 stmt_vec_info stmt_info;
1285 bool need_to_vectorize = false;
1286 int min_profitable_iters;
1287 int min_scalar_loop_bound;
1288 unsigned int th;
1289 bool only_slp_in_loop = true, ok;
1290 HOST_WIDE_INT max_niter;
1291 HOST_WIDE_INT estimated_niter;
1292 int min_profitable_estimate;
1293
1294 if (dump_enabled_p ())
1295 dump_printf_loc (MSG_NOTE, vect_location,
1296 "=== vect_analyze_loop_operations ===");
1297
1298 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1299 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1300 if (slp)
1301 {
1302 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1303 the vectorization factor of the loop is the unrolling factor required by
1304 the SLP instances. If that unrolling factor is 1, we say that we
1305 perform pure SLP on the loop - cross-iteration parallelism is not
1306 exploited. */
1307 for (i = 0; i < nbbs; i++)
1308 {
1309 basic_block bb = bbs[i];
1310 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1311 {
1312 gimple stmt = gsi_stmt (si);
1313 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1314 gcc_assert (stmt_info);
1315 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1316 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1317 && !PURE_SLP_STMT (stmt_info))
1318 /* STMT needs both SLP and loop-based vectorization. */
1319 only_slp_in_loop = false;
1320 }
1321 }
1322
1323 if (only_slp_in_loop)
1324 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1325 else
1326 vectorization_factor = least_common_multiple (vectorization_factor,
1327 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1328
1329 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1330 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_NOTE, vect_location,
1332 "Updating vectorization factor to %d ",
1333 vectorization_factor);
1334 }
1335
1336 for (i = 0; i < nbbs; i++)
1337 {
1338 basic_block bb = bbs[i];
1339
1340 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1341 {
1342 phi = gsi_stmt (si);
1343 ok = true;
1344
1345 stmt_info = vinfo_for_stmt (phi);
1346 if (dump_enabled_p ())
1347 {
1348 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1349 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1350 }
1351
1352 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1353 (i.e., a phi in the tail of the outer-loop). */
1354 if (! is_loop_header_bb_p (bb))
1355 {
1356 /* FORNOW: we currently don't support the case that these phis
1357 are not used in the outer loop (unless it is a double reduction,
1358 i.e., this phi is vect_double_reduction_def), because this case
1359 requires us to actually do something here. */
1360 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1361 || STMT_VINFO_LIVE_P (stmt_info))
1362 && STMT_VINFO_DEF_TYPE (stmt_info)
1363 != vect_double_reduction_def)
1364 {
1365 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1367 "Unsupported loop-closed phi in "
1368 "outer-loop.");
1369 return false;
1370 }
1371
1372 /* If PHI is used in the outer loop, we check that its operand
1373 is defined in the inner loop. */
1374 if (STMT_VINFO_RELEVANT_P (stmt_info))
1375 {
1376 tree phi_op;
1377 gimple op_def_stmt;
1378
1379 if (gimple_phi_num_args (phi) != 1)
1380 return false;
1381
1382 phi_op = PHI_ARG_DEF (phi, 0);
1383 if (TREE_CODE (phi_op) != SSA_NAME)
1384 return false;
1385
1386 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1387 if (!op_def_stmt
1388 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1389 || !vinfo_for_stmt (op_def_stmt))
1390 return false;
1391
1392 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1393 != vect_used_in_outer
1394 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1395 != vect_used_in_outer_by_reduction)
1396 return false;
1397 }
1398
1399 continue;
1400 }
1401
1402 gcc_assert (stmt_info);
1403
1404 if (STMT_VINFO_LIVE_P (stmt_info))
1405 {
1406 /* FORNOW: not yet supported. */
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1409 "not vectorized: value used after loop.");
1410 return false;
1411 }
1412
1413 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1414 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1415 {
1416 /* A scalar-dependence cycle that we don't support. */
1417 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 "not vectorized: scalar dependence cycle.");
1420 return false;
1421 }
1422
1423 if (STMT_VINFO_RELEVANT_P (stmt_info))
1424 {
1425 need_to_vectorize = true;
1426 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1427 ok = vectorizable_induction (phi, NULL, NULL);
1428 }
1429
1430 if (!ok)
1431 {
1432 if (dump_enabled_p ())
1433 {
1434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435 "not vectorized: relevant phi not "
1436 "supported: ");
1437 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1438 }
1439 return false;
1440 }
1441 }
1442
1443 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1444 {
1445 gimple stmt = gsi_stmt (si);
1446 if (!gimple_clobber_p (stmt)
1447 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1448 return false;
1449 }
1450 } /* bbs */
1451
1452 /* All operations in the loop are either irrelevant (deal with loop
1453 control, or dead), or only used outside the loop and can be moved
1454 out of the loop (e.g. invariants, inductions). The loop can be
1455 optimized away by scalar optimizations. We're better off not
1456 touching this loop. */
1457 if (!need_to_vectorize)
1458 {
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE, vect_location,
1461 "All the computation can be taken out of the loop.");
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "not vectorized: redundant loop. no profit to "
1465 "vectorize.");
1466 return false;
1467 }
1468
1469 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1470 dump_printf_loc (MSG_NOTE, vect_location,
1471 "vectorization_factor = %d, niters = "
1472 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1473 LOOP_VINFO_INT_NITERS (loop_vinfo));
1474
1475 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1476 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1477 || ((max_niter = max_stmt_executions_int (loop)) != -1
1478 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1479 {
1480 if (dump_enabled_p ())
1481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1482 "not vectorized: iteration count too small.");
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "not vectorized: iteration count smaller than "
1486 "vectorization factor.");
1487 return false;
1488 }
1489
1490 /* Analyze cost. Decide if worthwhile to vectorize. */
1491
1492 /* Once VF is set, SLP costs should be updated since the number of created
1493 vector stmts depends on VF. */
1494 vect_update_slp_costs_according_to_vf (loop_vinfo);
1495
1496 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1497 &min_profitable_estimate);
1498 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1499
1500 if (min_profitable_iters < 0)
1501 {
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: vectorization not profitable.");
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: vector version will never be "
1508 "profitable.");
1509 return false;
1510 }
1511
1512 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1513 * vectorization_factor) - 1);
1514
1515
1516 /* Use the cost model only if it is more conservative than the user-specified
1517 threshold. */
1518
1519 th = (unsigned) min_scalar_loop_bound;
1520 if (min_profitable_iters
1521 && (!min_scalar_loop_bound
1522 || min_profitable_iters > min_scalar_loop_bound))
1523 th = (unsigned) min_profitable_iters;
1524
1525 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1526 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1527 {
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: vectorization not profitable.");
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_NOTE, vect_location,
1533 "not vectorized: iteration count smaller than user "
1534 "specified loop bound parameter or minimum profitable "
1535 "iterations (whichever is more conservative).");
1536 return false;
1537 }
1538
1539 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1540 && ((unsigned HOST_WIDE_INT) estimated_niter
1541 <= MAX (th, (unsigned)min_profitable_estimate)))
1542 {
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: estimated iteration count too "
1546 "small.");
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location,
1549 "not vectorized: estimated iteration count smaller "
1550 "than specified loop bound parameter or minimum "
1551 "profitable iterations (whichever is more "
1552 "conservative).");
1553 return false;
1554 }
1555
1556 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1557 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1558 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1559 {
1560 if (dump_enabled_p ())
1561 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1562 if (!vect_can_advance_ivs_p (loop_vinfo))
1563 {
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "not vectorized: can't create epilog loop 1.");
1567 return false;
1568 }
1569 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1570 {
1571 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573 "not vectorized: can't create epilog loop 2.");
1574 return false;
1575 }
1576 }
1577
1578 return true;
1579 }
1580
1581
1582 /* Function vect_analyze_loop_2.
1583
1584 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1585 for it. The different analyses will record information in the
1586 loop_vec_info struct. */
1587 static bool
1588 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1589 {
1590 bool ok, slp = false;
1591 int max_vf = MAX_VECTORIZATION_FACTOR;
1592 int min_vf = 2;
1593
1594 /* Find all data references in the loop (which correspond to vdefs/vuses)
1595 and analyze their evolution in the loop. Also adjust the minimal
1596 vectorization factor according to the loads and stores.
1597
1598 FORNOW: Handle only simple array references whose
1599 alignment can be forced, and aligned pointer references. */
1600
1601 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1602 if (!ok)
1603 {
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "bad data references.");
1607 return false;
1608 }
1609
1610 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1611 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1612
1613 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1614 if (!ok)
1615 {
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1618 "bad data access.");
1619 return false;
1620 }
1621
1622 /* Classify all cross-iteration scalar data-flow cycles.
1623 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1624
1625 vect_analyze_scalar_cycles (loop_vinfo);
1626
1627 vect_pattern_recog (loop_vinfo, NULL);
1628
1629 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1630
1631 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1632 if (!ok)
1633 {
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "unexpected pattern.");
1637 return false;
1638 }
1639
1640 /* Analyze data dependences between the data-refs in the loop
1641 and adjust the maximum vectorization factor according to
1642 the dependences.
1643 FORNOW: fail at the first data dependence that we encounter. */
1644
1645 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1646 if (!ok
1647 || max_vf < min_vf)
1648 {
1649 if (dump_enabled_p ())
1650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1651 "bad data dependence.");
1652 return false;
1653 }
1654
1655 ok = vect_determine_vectorization_factor (loop_vinfo);
1656 if (!ok)
1657 {
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "can't determine vectorization factor.");
1661 return false;
1662 }
1663 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1664 {
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "bad data dependence.");
1668 return false;
1669 }
1670
1671 /* Analyze the alignment of the data-refs in the loop.
1672 Fail if a data reference is found that cannot be vectorized. */
1673
1674 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1675 if (!ok)
1676 {
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "bad data alignment.");
1680 return false;
1681 }
1682
1683 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1684 It is important to call pruning after vect_analyze_data_ref_accesses,
1685 since we use grouping information gathered by interleaving analysis. */
1686 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1687 if (!ok)
1688 {
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "too long list of versioning for alias "
1692 "run-time tests.");
1693 return false;
1694 }
1695
1696 /* This pass will decide on using loop versioning and/or loop peeling in
1697 order to enhance the alignment of data references in the loop. */
1698
1699 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1700 if (!ok)
1701 {
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "bad data alignment.");
1705 return false;
1706 }
1707
1708 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1709 ok = vect_analyze_slp (loop_vinfo, NULL);
1710 if (ok)
1711 {
1712 /* Decide which possible SLP instances to SLP. */
1713 slp = vect_make_slp_decision (loop_vinfo);
1714
1715 /* Find stmts that need to be both vectorized and SLPed. */
1716 vect_detect_hybrid_slp (loop_vinfo);
1717 }
1718 else
1719 return false;
1720
1721 /* Scan all the operations in the loop and make sure they are
1722 vectorizable. */
1723
1724 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1725 if (!ok)
1726 {
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 "bad operation or unsupported loop bound.");
1730 return false;
1731 }
1732
1733 return true;
1734 }
1735
1736 /* Function vect_analyze_loop.
1737
1738 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1739 for it. The different analyses will record information in the
1740 loop_vec_info struct. */
1741 loop_vec_info
1742 vect_analyze_loop (struct loop *loop)
1743 {
1744 loop_vec_info loop_vinfo;
1745 unsigned int vector_sizes;
1746
1747 /* Autodetect first vector size we try. */
1748 current_vector_size = 0;
1749 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1750
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "===== analyze_loop_nest =====");
1754
1755 if (loop_outer (loop)
1756 && loop_vec_info_for_loop (loop_outer (loop))
1757 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1758 {
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_NOTE, vect_location,
1761 "outer-loop already vectorized.");
1762 return NULL;
1763 }
1764
1765 while (1)
1766 {
1767 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1768 loop_vinfo = vect_analyze_loop_form (loop);
1769 if (!loop_vinfo)
1770 {
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "bad loop form.");
1774 return NULL;
1775 }
1776
1777 if (vect_analyze_loop_2 (loop_vinfo))
1778 {
1779 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1780
1781 return loop_vinfo;
1782 }
1783
1784 destroy_loop_vec_info (loop_vinfo, true);
1785
1786 vector_sizes &= ~current_vector_size;
1787 if (vector_sizes == 0
1788 || current_vector_size == 0)
1789 return NULL;
1790
1791 /* Try the next biggest vector size. */
1792 current_vector_size = 1 << floor_log2 (vector_sizes);
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "***** Re-trying analysis with "
1796 "vector size %d\n", current_vector_size);
1797 }
1798 }
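
For illustration only (not part of this file): the retry loop above clears the size it just tried out of the autovectorize_vector_sizes () bitmask and re-tries with the largest remaining power of two, obtained as 1 << floor_log2 (vector_sizes). A minimal standalone sketch of that walk, assuming a hypothetical target mask of 16|8|4 bytes and a local stand-in for GCC's floor_log2:

#include <stdio.h>

/* Stand-in for GCC's floor_log2: position of the highest set bit.  */
static int
floor_log2_sketch (unsigned int x)
{
  int l = -1;
  while (x)
    {
      l++;
      x >>= 1;
    }
  return l;
}

int
main (void)
{
  unsigned int vector_sizes = 16 | 8 | 4;  /* hypothetical target mask */
  unsigned int current_vector_size = 16;   /* suppose this size was tried first */

  while (1)
    {
      printf ("analyzing with vector size %u\n", current_vector_size);
      vector_sizes &= ~current_vector_size;  /* drop the size just tried */
      if (vector_sizes == 0)
        break;
      /* Largest remaining size, as in 1 << floor_log2 (vector_sizes).  */
      current_vector_size = 1 << floor_log2_sketch (vector_sizes);
    }
  return 0;
}

This prints 16, 8 and 4 in turn, mirroring the "Re-trying analysis with vector size" dump messages.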
1799
1800
1801 /* Function reduction_code_for_scalar_code
1802
1803 Input:
1804 CODE - tree_code of a reduction operation.
1805
1806 Output:
1807 REDUC_CODE - the corresponding tree-code to be used to reduce the
1808 vector of partial results into a single scalar result (which
1809 will also reside in a vector) or ERROR_MARK if the operation is
1810 a supported reduction operation, but does not have such tree-code.
1811
1812 Return FALSE if CODE currently cannot be vectorized as a reduction. */
1813
1814 static bool
1815 reduction_code_for_scalar_code (enum tree_code code,
1816 enum tree_code *reduc_code)
1817 {
1818 switch (code)
1819 {
1820 case MAX_EXPR:
1821 *reduc_code = REDUC_MAX_EXPR;
1822 return true;
1823
1824 case MIN_EXPR:
1825 *reduc_code = REDUC_MIN_EXPR;
1826 return true;
1827
1828 case PLUS_EXPR:
1829 *reduc_code = REDUC_PLUS_EXPR;
1830 return true;
1831
1832 case MULT_EXPR:
1833 case MINUS_EXPR:
1834 case BIT_IOR_EXPR:
1835 case BIT_XOR_EXPR:
1836 case BIT_AND_EXPR:
1837 *reduc_code = ERROR_MARK;
1838 return true;
1839
1840 default:
1841 return false;
1842 }
1843 }
1844
1845
1846 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1847 STMT is printed with a message MSG. */
1848
1849 static void
1850 report_vect_op (int msg_type, gimple stmt, const char *msg)
1851 {
1852 dump_printf_loc (msg_type, vect_location, "%s", msg);
1853 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1854 }
1855
1856
1857 /* Detect SLP reduction of the form:
1858
1859 #a1 = phi <a5, a0>
1860 a2 = operation (a1)
1861 a3 = operation (a2)
1862 a4 = operation (a3)
1863 a5 = operation (a4)
1864
1865 #a = phi <a5>
1866
1867 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1868 FIRST_STMT is the first reduction stmt in the chain
1869 (a2 = operation (a1)).
1870
1871 Return TRUE if a reduction chain was detected. */
1872
1873 static bool
1874 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1875 {
1876 struct loop *loop = (gimple_bb (phi))->loop_father;
1877 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1878 enum tree_code code;
1879 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1880 stmt_vec_info use_stmt_info, current_stmt_info;
1881 tree lhs;
1882 imm_use_iterator imm_iter;
1883 use_operand_p use_p;
1884 int nloop_uses, size = 0, n_out_of_loop_uses;
1885 bool found = false;
1886
1887 if (loop != vect_loop)
1888 return false;
1889
1890 lhs = PHI_RESULT (phi);
1891 code = gimple_assign_rhs_code (first_stmt);
1892 while (1)
1893 {
1894 nloop_uses = 0;
1895 n_out_of_loop_uses = 0;
1896 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1897 {
1898 gimple use_stmt = USE_STMT (use_p);
1899 if (is_gimple_debug (use_stmt))
1900 continue;
1901
1902 use_stmt = USE_STMT (use_p);
1903
1904 /* Check if we got back to the reduction phi. */
1905 if (use_stmt == phi)
1906 {
1907 loop_use_stmt = use_stmt;
1908 found = true;
1909 break;
1910 }
1911
1912 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1913 {
1914 if (vinfo_for_stmt (use_stmt)
1915 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1916 {
1917 loop_use_stmt = use_stmt;
1918 nloop_uses++;
1919 }
1920 }
1921 else
1922 n_out_of_loop_uses++;
1923
1924 /* There can be either a single use in the loop or two uses in
1925 phi nodes. */
1926 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1927 return false;
1928 }
1929
1930 if (found)
1931 break;
1932
1933 /* We reached a statement with no loop uses. */
1934 if (nloop_uses == 0)
1935 return false;
1936
1937 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1938 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1939 return false;
1940
1941 if (!is_gimple_assign (loop_use_stmt)
1942 || code != gimple_assign_rhs_code (loop_use_stmt)
1943 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1944 return false;
1945
1946 /* Insert USE_STMT into reduction chain. */
1947 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1948 if (current_stmt)
1949 {
1950 current_stmt_info = vinfo_for_stmt (current_stmt);
1951 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1952 GROUP_FIRST_ELEMENT (use_stmt_info)
1953 = GROUP_FIRST_ELEMENT (current_stmt_info);
1954 }
1955 else
1956 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1957
1958 lhs = gimple_assign_lhs (loop_use_stmt);
1959 current_stmt = loop_use_stmt;
1960 size++;
1961 }
1962
1963 if (!found || loop_use_stmt != phi || size < 2)
1964 return false;
1965
1966 /* Swap the operands, if needed, to make the reduction operand be the second
1967 operand. */
1968 lhs = PHI_RESULT (phi);
1969 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1970 while (next_stmt)
1971 {
1972 if (gimple_assign_rhs2 (next_stmt) == lhs)
1973 {
1974 tree op = gimple_assign_rhs1 (next_stmt);
1975 gimple def_stmt = NULL;
1976
1977 if (TREE_CODE (op) == SSA_NAME)
1978 def_stmt = SSA_NAME_DEF_STMT (op);
1979
1980 /* Check that the other def is either defined in the loop
1981 ("vect_internal_def"), or it's an induction (defined by a
1982 loop-header phi-node). */
1983 if (def_stmt
1984 && gimple_bb (def_stmt)
1985 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1986 && (is_gimple_assign (def_stmt)
1987 || is_gimple_call (def_stmt)
1988 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1989 == vect_induction_def
1990 || (gimple_code (def_stmt) == GIMPLE_PHI
1991 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1992 == vect_internal_def
1993 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1994 {
1995 lhs = gimple_assign_lhs (next_stmt);
1996 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1997 continue;
1998 }
1999
2000 return false;
2001 }
2002 else
2003 {
2004 tree op = gimple_assign_rhs2 (next_stmt);
2005 gimple def_stmt = NULL;
2006
2007 if (TREE_CODE (op) == SSA_NAME)
2008 def_stmt = SSA_NAME_DEF_STMT (op);
2009
2010 /* Check that the other def is either defined in the loop
2011 ("vect_internal_def"), or it's an induction (defined by a
2012 loop-header phi-node). */
2013 if (def_stmt
2014 && gimple_bb (def_stmt)
2015 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2016 && (is_gimple_assign (def_stmt)
2017 || is_gimple_call (def_stmt)
2018 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2019 == vect_induction_def
2020 || (gimple_code (def_stmt) == GIMPLE_PHI
2021 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2022 == vect_internal_def
2023 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2024 {
2025 if (dump_enabled_p ())
2026 {
2027 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2028 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2029 }
2030
2031 swap_tree_operands (next_stmt,
2032 gimple_assign_rhs1_ptr (next_stmt),
2033 gimple_assign_rhs2_ptr (next_stmt));
2034 update_stmt (next_stmt);
2035
2036 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2037 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2038 }
2039 else
2040 return false;
2041 }
2042
2043 lhs = gimple_assign_lhs (next_stmt);
2044 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2045 }
2046
2047 /* Save the chain for further analysis in SLP detection. */
2048 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2049 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2050 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2051
2052 return true;
2053 }
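
For illustration only (not part of this file): a hypothetical scalar source loop whose body produces exactly the chain documented above vect_is_slp_reduction -- each statement consumes the result of the previous one and the last value feeds the loop-header phi:

/* #a1 = phi <a5, a0>; a2 = op (a1); a3 = op (a2); a4 = op (a3); a5 = op (a4).
   The function name is made up; assumes n is a multiple of 4.  */
int
sum_by_four (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i += 4)
    {
      s = s + a[i];      /* a2 = operation (a1) */
      s = s + a[i + 1];  /* a3 = operation (a2) */
      s = s + a[i + 2];  /* a4 = operation (a3) */
      s = s + a[i + 3];  /* a5 = operation (a4) */
    }
  return s;
}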
2054
2055
2056 /* Function vect_is_simple_reduction_1
2057
2058 (1) Detect a cross-iteration def-use cycle that represents a simple
2059 reduction computation. We look for the following pattern:
2060
2061 loop_header:
2062 a1 = phi < a0, a2 >
2063 a3 = ...
2064 a2 = operation (a3, a1)
2065
2066 such that:
2067 1. operation is commutative and associative and it is safe to
2068 change the order of the computation (if CHECK_REDUCTION is true)
2069 2. no uses for a2 in the loop (a2 is used out of the loop)
2070 3. no uses of a1 in the loop besides the reduction operation
2071 4. no uses of a1 outside the loop.
2072
2073 Conditions 1,4 are tested here.
2074 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2075
2076 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2077 nested cycles, if CHECK_REDUCTION is false.
2078
2079 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2080 reductions:
2081
2082 a1 = phi < a0, a2 >
2083 inner loop (def of a3)
2084 a2 = phi < a3 >
2085
2086 If MODIFY is true it also tries to rework the code in-place to enable
2087 detection of more reduction patterns. For the time being we rewrite
2088 "res -= RHS" into "res += -RHS" when it seems worthwhile.
2089 */
2090
2091 static gimple
2092 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2093 bool check_reduction, bool *double_reduc,
2094 bool modify)
2095 {
2096 struct loop *loop = (gimple_bb (phi))->loop_father;
2097 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2098 edge latch_e = loop_latch_edge (loop);
2099 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2100 gimple def_stmt, def1 = NULL, def2 = NULL;
2101 enum tree_code orig_code, code;
2102 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2103 tree type;
2104 int nloop_uses;
2105 tree name;
2106 imm_use_iterator imm_iter;
2107 use_operand_p use_p;
2108 bool phi_def;
2109
2110 *double_reduc = false;
2111
2112 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2113 otherwise, we assume outer loop vectorization. */
2114 gcc_assert ((check_reduction && loop == vect_loop)
2115 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2116
2117 name = PHI_RESULT (phi);
2118 nloop_uses = 0;
2119 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2120 {
2121 gimple use_stmt = USE_STMT (use_p);
2122 if (is_gimple_debug (use_stmt))
2123 continue;
2124
2125 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2126 {
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "intermediate value used outside loop.");
2130
2131 return NULL;
2132 }
2133
2134 if (vinfo_for_stmt (use_stmt)
2135 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2136 nloop_uses++;
2137 if (nloop_uses > 1)
2138 {
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "reduction used in loop.");
2142 return NULL;
2143 }
2144 }
2145
2146 if (TREE_CODE (loop_arg) != SSA_NAME)
2147 {
2148 if (dump_enabled_p ())
2149 {
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "reduction: not ssa_name: ");
2152 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2153 }
2154 return NULL;
2155 }
2156
2157 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2158 if (!def_stmt)
2159 {
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "reduction: no def_stmt.");
2163 return NULL;
2164 }
2165
2166 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2167 {
2168 if (dump_enabled_p ())
2169 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2170 return NULL;
2171 }
2172
2173 if (is_gimple_assign (def_stmt))
2174 {
2175 name = gimple_assign_lhs (def_stmt);
2176 phi_def = false;
2177 }
2178 else
2179 {
2180 name = PHI_RESULT (def_stmt);
2181 phi_def = true;
2182 }
2183
2184 nloop_uses = 0;
2185 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2186 {
2187 gimple use_stmt = USE_STMT (use_p);
2188 if (is_gimple_debug (use_stmt))
2189 continue;
2190 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2191 && vinfo_for_stmt (use_stmt)
2192 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2193 nloop_uses++;
2194 if (nloop_uses > 1)
2195 {
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "reduction used in loop.");
2199 return NULL;
2200 }
2201 }
2202
2203 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2204 defined in the inner loop. */
2205 if (phi_def)
2206 {
2207 op1 = PHI_ARG_DEF (def_stmt, 0);
2208
2209 if (gimple_phi_num_args (def_stmt) != 1
2210 || TREE_CODE (op1) != SSA_NAME)
2211 {
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "unsupported phi node definition.");
2215
2216 return NULL;
2217 }
2218
2219 def1 = SSA_NAME_DEF_STMT (op1);
2220 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2221 && loop->inner
2222 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2223 && is_gimple_assign (def1))
2224 {
2225 if (dump_enabled_p ())
2226 report_vect_op (MSG_NOTE, def_stmt,
2227 "detected double reduction: ");
2228
2229 *double_reduc = true;
2230 return def_stmt;
2231 }
2232
2233 return NULL;
2234 }
2235
2236 code = orig_code = gimple_assign_rhs_code (def_stmt);
2237
2238 /* We can handle "res -= x[i]", which is non-associative, by
2239 simply rewriting it into "res += -x[i]". Avoid changing the
2240 gimple statement during the first simple tests and only do this
2241 if we're allowed to change code at all. */
2242 if (code == MINUS_EXPR
2243 && modify
2244 && (op1 = gimple_assign_rhs1 (def_stmt))
2245 && TREE_CODE (op1) == SSA_NAME
2246 && SSA_NAME_DEF_STMT (op1) == phi)
2247 code = PLUS_EXPR;
2248
2249 if (check_reduction
2250 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2251 {
2252 if (dump_enabled_p ())
2253 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2254 "reduction: not commutative/associative: ");
2255 return NULL;
2256 }
2257
2258 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2259 {
2260 if (code != COND_EXPR)
2261 {
2262 if (dump_enabled_p ())
2263 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2264 "reduction: not binary operation: ");
2265
2266 return NULL;
2267 }
2268
2269 op3 = gimple_assign_rhs1 (def_stmt);
2270 if (COMPARISON_CLASS_P (op3))
2271 {
2272 op4 = TREE_OPERAND (op3, 1);
2273 op3 = TREE_OPERAND (op3, 0);
2274 }
2275
2276 op1 = gimple_assign_rhs2 (def_stmt);
2277 op2 = gimple_assign_rhs3 (def_stmt);
2278
2279 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2280 {
2281 if (dump_enabled_p ())
2282 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2283 "reduction: uses not ssa_names: ");
2284
2285 return NULL;
2286 }
2287 }
2288 else
2289 {
2290 op1 = gimple_assign_rhs1 (def_stmt);
2291 op2 = gimple_assign_rhs2 (def_stmt);
2292
2293 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2294 {
2295 if (dump_enabled_p ())
2296 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2297 "reduction: uses not ssa_names: ");
2298
2299 return NULL;
2300 }
2301 }
2302
2303 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2304 if ((TREE_CODE (op1) == SSA_NAME
2305 && !types_compatible_p (type, TREE_TYPE (op1)))
2306 || (TREE_CODE (op2) == SSA_NAME
2307 && !types_compatible_p (type, TREE_TYPE (op2)))
2308 || (op3 && TREE_CODE (op3) == SSA_NAME
2309 && !types_compatible_p (type, TREE_TYPE (op3)))
2310 || (op4 && TREE_CODE (op4) == SSA_NAME
2311 && !types_compatible_p (type, TREE_TYPE (op4))))
2312 {
2313 if (dump_enabled_p ())
2314 {
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "reduction: multiple types: operation type: ");
2317 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2318 dump_printf (MSG_NOTE, ", operands types: ");
2319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2320 TREE_TYPE (op1));
2321 dump_printf (MSG_NOTE, ",");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323 TREE_TYPE (op2));
2324 if (op3)
2325 {
2326 dump_printf (MSG_NOTE, ",");
2327 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2328 TREE_TYPE (op3));
2329 }
2330
2331 if (op4)
2332 {
2333 dump_printf (MSG_NOTE, ",");
2334 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2335 TREE_TYPE (op4));
2336 }
2337 }
2338
2339 return NULL;
2340 }
2341
2342 /* Check that it's ok to change the order of the computation.
2343 Generally, when vectorizing a reduction we change the order of the
2344 computation. This may change the behavior of the program in some
2345 cases, so we need to check that this is ok. One exception is when
2346 vectorizing an outer-loop: the inner-loop is executed sequentially,
2347 and therefore vectorizing reductions in the inner-loop during
2348 outer-loop vectorization is safe. */
2349
2350 /* CHECKME: check for !flag_finite_math_only too? */
2351 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2352 && check_reduction)
2353 {
2354 /* Changing the order of operations changes the semantics. */
2355 if (dump_enabled_p ())
2356 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2357 "reduction: unsafe fp math optimization: ");
2358 return NULL;
2359 }
2360 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2361 && check_reduction)
2362 {
2363 /* Changing the order of operations changes the semantics. */
2364 if (dump_enabled_p ())
2365 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2366 "reduction: unsafe int math optimization: ");
2367 return NULL;
2368 }
2369 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2370 {
2371 /* Changing the order of operations changes the semantics. */
2372 if (dump_enabled_p ())
2373 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2374 "reduction: unsafe fixed-point math optimization: ");
2375 return NULL;
2376 }
2377
2378 /* If we detected "res -= x[i]" earlier, rewrite it into
2379 "res += -x[i]" now. If this turns out to be useless reassoc
2380 will clean it up again. */
2381 if (orig_code == MINUS_EXPR)
2382 {
2383 tree rhs = gimple_assign_rhs2 (def_stmt);
2384 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2385 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2386 rhs, NULL);
2387 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2388 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2389 loop_info, NULL));
2390 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2391 gimple_assign_set_rhs2 (def_stmt, negrhs);
2392 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2393 update_stmt (def_stmt);
2394 }
2395
2396 /* Reduction is safe. We're dealing with one of the following:
2397 1) integer arithmetic and no trapv
2398 2) floating point arithmetic, and special flags permit this optimization
2399 3) nested cycle (i.e., outer loop vectorization). */
2400 if (TREE_CODE (op1) == SSA_NAME)
2401 def1 = SSA_NAME_DEF_STMT (op1);
2402
2403 if (TREE_CODE (op2) == SSA_NAME)
2404 def2 = SSA_NAME_DEF_STMT (op2);
2405
2406 if (code != COND_EXPR
2407 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2408 {
2409 if (dump_enabled_p ())
2410 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2411 return NULL;
2412 }
2413
2414 /* Check that one def is the reduction def, defined by PHI,
2415 the other def is either defined in the loop ("vect_internal_def"),
2416 or it's an induction (defined by a loop-header phi-node). */
2417
2418 if (def2 && def2 == phi
2419 && (code == COND_EXPR
2420 || !def1 || gimple_nop_p (def1)
2421 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2422 && (is_gimple_assign (def1)
2423 || is_gimple_call (def1)
2424 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2425 == vect_induction_def
2426 || (gimple_code (def1) == GIMPLE_PHI
2427 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2428 == vect_internal_def
2429 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2430 {
2431 if (dump_enabled_p ())
2432 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2433 return def_stmt;
2434 }
2435
2436 if (def1 && def1 == phi
2437 && (code == COND_EXPR
2438 || !def2 || gimple_nop_p (def2)
2439 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2440 && (is_gimple_assign (def2)
2441 || is_gimple_call (def2)
2442 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2443 == vect_induction_def
2444 || (gimple_code (def2) == GIMPLE_PHI
2445 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2446 == vect_internal_def
2447 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2448 {
2449 if (check_reduction)
2450 {
2451 /* Swap operands (just for simplicity - so that the rest of the code
2452 can assume that the reduction variable is always the last (second)
2453 argument). */
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt,
2456 "detected reduction: need to swap operands: ");
2457
2458 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2459 gimple_assign_rhs2_ptr (def_stmt));
2460
2461 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2462 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2463 }
2464 else
2465 {
2466 if (dump_enabled_p ())
2467 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2468 }
2469
2470 return def_stmt;
2471 }
2472
2473 /* Try to find SLP reduction chain. */
2474 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2475 {
2476 if (dump_enabled_p ())
2477 report_vect_op (MSG_NOTE, def_stmt,
2478 "reduction: detected reduction chain: ");
2479
2480 return def_stmt;
2481 }
2482
2483 if (dump_enabled_p ())
2484 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2485 "reduction: unknown pattern: ");
2486
2487 return NULL;
2488 }
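
For illustration only (not part of this file): hypothetical scalar loops matching case (1) of the pattern documented above vect_is_simple_reduction_1. The second loop has the "res -= x[i]" shape that the MODIFY path rewrites into "res += -x[i]":

/* a1 = phi <a0, a2>;  a2 = a3 + a1  -- a plain sum reduction.  */
int
sum (const int *x, int n)
{
  int res = 0;
  for (int i = 0; i < n; i++)
    res = res + x[i];
  return res;
}

/* The same cycle with MINUS_EXPR; with MODIFY the vectorizer rewrites
   the body into res = res + (-x[i]) so that a commutative/associative
   PLUS_EXPR is reduced instead.  */
int
negsum (const int *x, int n)
{
  int res = 0;
  for (int i = 0; i < n; i++)
    res -= x[i];
  return res;
}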
2489
2490 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2491 in-place. Arguments as there. */
2492
2493 static gimple
2494 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2495 bool check_reduction, bool *double_reduc)
2496 {
2497 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498 double_reduc, false);
2499 }
2500
2501 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2502 in-place if it enables detection of more reductions. Arguments
2503 as there. */
2504
2505 gimple
2506 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2507 bool check_reduction, bool *double_reduc)
2508 {
2509 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2510 double_reduc, true);
2511 }
2512
2513 /* Calculate the cost of one scalar iteration of the loop. */
2514 int
2515 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2516 {
2517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2519 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2520 int innerloop_iters, i, stmt_cost;
2521
2522 /* Count statements in scalar loop. Using this as scalar cost for a single
2523 iteration for now.
2524
2525 TODO: Add outer loop support.
2526
2527 TODO: Consider assigning different costs to different scalar
2528 statements. */
2529
2530 /* FORNOW. */
2531 innerloop_iters = 1;
2532 if (loop->inner)
2533 innerloop_iters = 50; /* FIXME */
2534
2535 for (i = 0; i < nbbs; i++)
2536 {
2537 gimple_stmt_iterator si;
2538 basic_block bb = bbs[i];
2539
2540 if (bb->loop_father == loop->inner)
2541 factor = innerloop_iters;
2542 else
2543 factor = 1;
2544
2545 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2546 {
2547 gimple stmt = gsi_stmt (si);
2548 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2549
2550 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2551 continue;
2552
2553 /* Skip stmts that are not vectorized inside the loop. */
2554 if (stmt_info
2555 && !STMT_VINFO_RELEVANT_P (stmt_info)
2556 && (!STMT_VINFO_LIVE_P (stmt_info)
2557 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2558 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2559 continue;
2560
2561 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2562 {
2563 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2564 stmt_cost = vect_get_stmt_cost (scalar_load);
2565 else
2566 stmt_cost = vect_get_stmt_cost (scalar_store);
2567 }
2568 else
2569 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2570
2571 scalar_single_iter_cost += stmt_cost * factor;
2572 }
2573 }
2574 return scalar_single_iter_cost;
2575 }
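
A worked example of the counting above, with assumed unit costs (i.e. vect_get_stmt_cost returning 1 for scalar_stmt, scalar_load and scalar_store; real targets may differ): a loop body with two loads, one store and two arithmetic statements gives scalar_single_iter_cost = 2*1 + 1*1 + 2*1 = 5. Statements sitting in a nested inner loop are additionally multiplied by the FORNOW factor of 50, so the same five statements there would contribute 250.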
2576
2577 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2578 int
2579 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2580 int *peel_iters_epilogue,
2581 int scalar_single_iter_cost,
2582 stmt_vector_for_cost *prologue_cost_vec,
2583 stmt_vector_for_cost *epilogue_cost_vec)
2584 {
2585 int retval = 0;
2586 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2587
2588 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2589 {
2590 *peel_iters_epilogue = vf/2;
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "cost model: epilogue peel iters set to vf/2 "
2594 "because loop iterations are unknown .");
2595
2596 /* If peeled iterations are known but the number of scalar loop
2597 iterations is unknown, count a taken branch per peeled loop. */
2598 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2599 NULL, 0, vect_prologue);
2600 }
2601 else
2602 {
2603 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2604 peel_iters_prologue = niters < peel_iters_prologue ?
2605 niters : peel_iters_prologue;
2606 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2607 /* If we need to peel for gaps but the epilogue peel count came out
2608 as zero, we have to peel VF iterations. */
2609 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2610 *peel_iters_epilogue = vf;
2611 }
2612
2613 if (peel_iters_prologue)
2614 retval += record_stmt_cost (prologue_cost_vec,
2615 peel_iters_prologue * scalar_single_iter_cost,
2616 scalar_stmt, NULL, 0, vect_prologue);
2617 if (*peel_iters_epilogue)
2618 retval += record_stmt_cost (epilogue_cost_vec,
2619 *peel_iters_epilogue * scalar_single_iter_cost,
2620 scalar_stmt, NULL, 0, vect_epilogue);
2621 return retval;
2622 }
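
A worked example of the arithmetic above, with hypothetical numbers: for a known trip count of 100, vf = 4 and peel_iters_prologue = 3, the epilogue peel count is (100 - 3) % 4 = 1; with scalar_single_iter_cost = 5 the prologue cost vector gets a 3 * 5 = 15 scalar_stmt entry and the epilogue cost vector a 1 * 5 = 5 entry. When the trip count is unknown, the epilogue count instead falls back to vf/2 = 2 and two taken branches are charged to the prologue.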
2623
2624 /* Function vect_estimate_min_profitable_iters
2625
2626 Return the number of iterations required for the vector version of the
2627 loop to be profitable relative to the cost of the scalar version of the
2628 loop. */
2629
2630 static void
2631 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2632 int *ret_min_profitable_niters,
2633 int *ret_min_profitable_estimate)
2634 {
2635 int min_profitable_iters;
2636 int min_profitable_estimate;
2637 int peel_iters_prologue;
2638 int peel_iters_epilogue;
2639 unsigned vec_inside_cost = 0;
2640 int vec_outside_cost = 0;
2641 unsigned vec_prologue_cost = 0;
2642 unsigned vec_epilogue_cost = 0;
2643 int scalar_single_iter_cost = 0;
2644 int scalar_outside_cost = 0;
2645 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2646 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2648
2649 /* Cost model disabled. */
2650 if (!flag_vect_cost_model)
2651 {
2652 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2653 *ret_min_profitable_niters = 0;
2654 *ret_min_profitable_estimate = 0;
2655 return;
2656 }
2657
2658 /* Requires loop versioning tests to handle misalignment. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2660 {
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2663 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2664 vect_prologue);
2665 dump_printf (MSG_NOTE,
2666 "cost model: Adding cost of checks for loop "
2667 "versioning to treat misalignment.\n");
2668 }
2669
2670 /* Requires loop versioning with alias checks. */
2671 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2672 {
2673 /* FIXME: Make cost depend on complexity of individual check. */
2674 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2675 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2676 vect_prologue);
2677 dump_printf (MSG_NOTE,
2678 "cost model: Adding cost of checks for loop "
2679 "versioning aliasing.\n");
2680 }
2681
2682 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2683 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2684 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2685 vect_prologue);
2686
2687 /* Count statements in scalar loop. Using this as scalar cost for a single
2688 iteration for now.
2689
2690 TODO: Add outer loop support.
2691
2692 TODO: Consider assigning different costs to different scalar
2693 statements. */
2694
2695 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2696
2697 /* Add additional cost for the peeled instructions in prologue and epilogue
2698 loop.
2699
2700 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2701 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2702
2703 TODO: Build an expression that represents peel_iters for prologue and
2704 epilogue to be used in a run-time test. */
2705
2706 if (npeel < 0)
2707 {
2708 peel_iters_prologue = vf/2;
2709 dump_printf (MSG_NOTE, "cost model: "
2710 "prologue peel iters set to vf/2.");
2711
2712 /* If peeling for alignment is unknown, loop bound of main loop becomes
2713 unknown. */
2714 peel_iters_epilogue = vf/2;
2715 dump_printf (MSG_NOTE, "cost model: "
2716 "epilogue peel iters set to vf/2 because "
2717 "peeling for alignment is unknown.");
2718
2719 /* If peeled iterations are unknown, count a taken branch and a not taken
2720 branch per peeled loop. Even if scalar loop iterations are known,
2721 vector iterations are not known since peeled prologue iterations are
2722 not known. Hence guards remain the same. */
2723 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2724 NULL, 0, vect_prologue);
2725 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2726 NULL, 0, vect_prologue);
2727 /* FORNOW: Don't attempt to pass individual scalar instructions to
2728 the model; just assume linear cost for scalar iterations. */
2729 (void) add_stmt_cost (target_cost_data,
2730 peel_iters_prologue * scalar_single_iter_cost,
2731 scalar_stmt, NULL, 0, vect_prologue);
2732 (void) add_stmt_cost (target_cost_data,
2733 peel_iters_epilogue * scalar_single_iter_cost,
2734 scalar_stmt, NULL, 0, vect_epilogue);
2735 }
2736 else
2737 {
2738 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2739 stmt_info_for_cost *si;
2740 int j;
2741 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2742
2743 prologue_cost_vec.create (2);
2744 epilogue_cost_vec.create (2);
2745 peel_iters_prologue = npeel;
2746
2747 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2748 &peel_iters_epilogue,
2749 scalar_single_iter_cost,
2750 &prologue_cost_vec,
2751 &epilogue_cost_vec);
2752
2753 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2754 {
2755 struct _stmt_vec_info *stmt_info
2756 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2757 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2758 si->misalign, vect_prologue);
2759 }
2760
2761 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2762 {
2763 struct _stmt_vec_info *stmt_info
2764 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2765 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2766 si->misalign, vect_epilogue);
2767 }
2768
2769 prologue_cost_vec.release ();
2770 epilogue_cost_vec.release ();
2771 }
2772
2773 /* FORNOW: The scalar outside cost is incremented in one of the
2774 following ways:
2775
2776 1. The vectorizer checks for alignment and aliasing and generates
2777 a condition that allows dynamic vectorization. A cost model
2778 check is ANDED with the versioning condition. Hence scalar code
2779 path now has the added cost of the versioning check.
2780
2781 if (cost > th & versioning_check)
2782 jmp to vector code
2783
2784 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2785
2786 2. The vectorizer then checks if a prologue is required. If the
2787 cost model check was not done before during versioning, it has to
2788 be done before the prologue check.
2789
2790 if (cost <= th)
2791 prologue = scalar_iters
2792 if (prologue == 0)
2793 jmp to vector code
2794 else
2795 execute prologue
2796 if (prologue == num_iters)
2797 go to exit
2798
2799 Hence the run-time scalar cost is incremented by a taken branch,
2800 plus a not-taken branch, plus a taken branch cost.
2801
2802 3. The vectorizer then checks if an epilogue is required. If the
2803 cost model check was not done before during prologue check, it
2804 has to be done with the epilogue check.
2805
2806 if (prologue == 0)
2807 jmp to vector code
2808 else
2809 execute prologue
2810 if (prologue == num_iters)
2811 go to exit
2812 vector code:
2813 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2814 jmp to epilogue
2815
2816 Hence the run-time scalar cost should be incremented by 2 taken
2817 branches.
2818
2819 TODO: The back end may reorder the BBS's differently and reverse
2820 conditions/branch directions. Change the estimates below to
2821 something more reasonable. */
2822
2823 /* If the number of iterations is known and we do not do versioning, we can
2824 decide whether to vectorize at compile time. Hence the scalar version
2825 does not carry cost model guard costs. */
2826 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2827 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2828 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2829 {
2830 /* Cost model check occurs at versioning. */
2831 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2832 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2833 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2834 else
2835 {
2836 /* Cost model check occurs at prologue generation. */
2837 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2838 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2839 + vect_get_stmt_cost (cond_branch_not_taken);
2840 /* Cost model check occurs at epilogue generation. */
2841 else
2842 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2843 }
2844 }
2845
2846 /* Complete the target-specific cost calculations. */
2847 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2848 &vec_inside_cost, &vec_epilogue_cost);
2849
2850 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2851
2852 /* Calculate number of iterations required to make the vector version
2853 profitable, relative to the loop bodies only. The following condition
2854 must hold true:
2855 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2856 where
2857 SIC = scalar iteration cost, VIC = vector iteration cost,
2858 VOC = vector outside cost, VF = vectorization factor,
2859 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2860 SOC = scalar outside cost for run time cost model check. */
2861
2862 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2863 {
2864 if (vec_outside_cost <= 0)
2865 min_profitable_iters = 1;
2866 else
2867 {
2868 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2869 - vec_inside_cost * peel_iters_prologue
2870 - vec_inside_cost * peel_iters_epilogue)
2871 / ((scalar_single_iter_cost * vf)
2872 - vec_inside_cost);
2873
2874 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2875 <= (((int) vec_inside_cost * min_profitable_iters)
2876 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2877 min_profitable_iters++;
2878 }
2879 }
2880 /* vector version will never be profitable. */
2881 else
2882 {
2883 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885 "cost model: the vector iteration cost = %d "
2886 "divided by the scalar iteration cost = %d "
2887 "is greater or equal to the vectorization factor = %d.",
2888 vec_inside_cost, scalar_single_iter_cost, vf);
2889 *ret_min_profitable_niters = -1;
2890 *ret_min_profitable_estimate = -1;
2891 return;
2892 }
2893
2894 if (dump_enabled_p ())
2895 {
2896 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2897 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2898 vec_inside_cost);
2899 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2900 vec_prologue_cost);
2901 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2902 vec_epilogue_cost);
2903 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2904 scalar_single_iter_cost);
2905 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2906 scalar_outside_cost);
2907 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2908 vec_outside_cost);
2909 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2910 peel_iters_prologue);
2911 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2912 peel_iters_epilogue);
2913 dump_printf (MSG_NOTE,
2914 " Calculated minimum iters for profitability: %d\n",
2915 min_profitable_iters);
2916 }
2917
2918 min_profitable_iters =
2919 min_profitable_iters < vf ? vf : min_profitable_iters;
2920
2921 /* Because the condition we create is:
2922 if (niters <= min_profitable_iters)
2923 then skip the vectorized loop. */
2924 min_profitable_iters--;
2925
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE, vect_location,
2928 " Runtime profitability threshold = %d\n", min_profitable_iters);
2929
2930 *ret_min_profitable_niters = min_profitable_iters;
2931
2932 /* Calculate number of iterations required to make the vector version
2933 profitable, relative to the loop bodies only.
2934
2935 The non-vectorized variant costs SIC * niters and it must win over the vector
2936 variant on the expected loop trip count. The following condition must hold true:
2937 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2938
2939 if (vec_outside_cost <= 0)
2940 min_profitable_estimate = 1;
2941 else
2942 {
2943 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2944 - vec_inside_cost * peel_iters_prologue
2945 - vec_inside_cost * peel_iters_epilogue)
2946 / ((scalar_single_iter_cost * vf)
2947 - vec_inside_cost);
2948 }
2949 min_profitable_estimate--;
2950 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2951 if (dump_enabled_p ())
2952 dump_printf_loc (MSG_NOTE, vect_location,
2953 " Static estimate profitability threshold = %d\n",
2954 min_profitable_estimate);
2955
2956 *ret_min_profitable_estimate = min_profitable_estimate;
2957 }
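
A worked instance of the profitability condition above, with hypothetical costs SIC = 4, VIC = 6, VOC = 20, SOC = 0, VF = 4 and no peeling: min_profitable_iters = ((20 - 0) * 4 - 0 - 0) / (4 * 4 - 6) = 80 / 10 = 8; because 4 * 4 * 8 = 128 is still <= 6 * 8 + 20 * 4 = 128, it is bumped to 9, the clamp to at least VF leaves it at 9, and the final decrement yields a runtime threshold of 8, i.e. the guard branches to the scalar loop whenever niters <= 8.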
2958
2959
2960 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2961 functions. Design better to avoid maintenance issues. */
2962
2963 /* Function vect_model_reduction_cost.
2964
2965 Models cost for a reduction operation, including the vector ops
2966 generated within the strip-mine loop, the initial definition before
2967 the loop, and the epilogue code that must be generated. */
2968
2969 static bool
2970 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2971 int ncopies)
2972 {
2973 int prologue_cost = 0, epilogue_cost = 0;
2974 enum tree_code code;
2975 optab optab;
2976 tree vectype;
2977 gimple stmt, orig_stmt;
2978 tree reduction_op;
2979 enum machine_mode mode;
2980 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2981 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2982 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2983
2984 /* Cost of reduction op inside loop. */
2985 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2986 stmt_info, 0, vect_body);
2987 stmt = STMT_VINFO_STMT (stmt_info);
2988
2989 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2990 {
2991 case GIMPLE_SINGLE_RHS:
2992 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2993 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2994 break;
2995 case GIMPLE_UNARY_RHS:
2996 reduction_op = gimple_assign_rhs1 (stmt);
2997 break;
2998 case GIMPLE_BINARY_RHS:
2999 reduction_op = gimple_assign_rhs2 (stmt);
3000 break;
3001 case GIMPLE_TERNARY_RHS:
3002 reduction_op = gimple_assign_rhs3 (stmt);
3003 break;
3004 default:
3005 gcc_unreachable ();
3006 }
3007
3008 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3009 if (!vectype)
3010 {
3011 if (dump_enabled_p ())
3012 {
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "unsupported data-type ");
3015 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3016 TREE_TYPE (reduction_op));
3017 }
3018 return false;
3019 }
3020
3021 mode = TYPE_MODE (vectype);
3022 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3023
3024 if (!orig_stmt)
3025 orig_stmt = STMT_VINFO_STMT (stmt_info);
3026
3027 code = gimple_assign_rhs_code (orig_stmt);
3028
3029 /* Add in cost for initial definition. */
3030 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3031 stmt_info, 0, vect_prologue);
3032
3033 /* Determine cost of epilogue code.
3034
3035 We have a reduction operator that will reduce the vector in one statement.
3036 Also requires scalar extract. */
3037
3038 if (!nested_in_vect_loop_p (loop, orig_stmt))
3039 {
3040 if (reduc_code != ERROR_MARK)
3041 {
3042 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3043 stmt_info, 0, vect_epilogue);
3044 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3045 stmt_info, 0, vect_epilogue);
3046 }
3047 else
3048 {
3049 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3050 tree bitsize =
3051 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3052 int element_bitsize = tree_low_cst (bitsize, 1);
3053 int nelements = vec_size_in_bits / element_bitsize;
3054
3055 optab = optab_for_tree_code (code, vectype, optab_default);
3056
3057 /* We have a whole vector shift available. */
3058 if (VECTOR_MODE_P (mode)
3059 && optab_handler (optab, mode) != CODE_FOR_nothing
3060 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3061 {
3062 /* Final reduction via vector shifts and the reduction operator.
3063 Also requires scalar extract. */
3064 epilogue_cost += add_stmt_cost (target_cost_data,
3065 exact_log2 (nelements) * 2,
3066 vector_stmt, stmt_info, 0,
3067 vect_epilogue);
3068 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3069 vec_to_scalar, stmt_info, 0,
3070 vect_epilogue);
3071 }
3072 else
3073 /* Use extracts and reduction op for final reduction. For N
3074 elements, we have N extracts and N-1 reduction ops. */
3075 epilogue_cost += add_stmt_cost (target_cost_data,
3076 nelements + nelements - 1,
3077 vector_stmt, stmt_info, 0,
3078 vect_epilogue);
3079 }
3080 }
3081
3082 if (dump_enabled_p ())
3083 dump_printf (MSG_NOTE,
3084 "vect_model_reduction_cost: inside_cost = %d, "
3085 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3086 prologue_cost, epilogue_cost);
3087
3088 return true;
3089 }
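
A worked instance of the epilogue costing above, assuming a unit cost per vector_stmt and vec_to_scalar entry and an 8-element vector (nelements = 8): with a target reduction operation (reduc_code != ERROR_MARK) the epilogue costs 1 vector_stmt + 1 vec_to_scalar = 2; with the whole-vector-shift scheme it costs exact_log2 (8) * 2 = 6 vector stmts plus 1 vec_to_scalar = 7; with plain extracts it costs 8 + 8 - 1 = 15 statements.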
3090
3091
3092 /* Function vect_model_induction_cost.
3093
3094 Models cost for induction operations. */
3095
3096 static void
3097 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3098 {
3099 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3100 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3101 unsigned inside_cost, prologue_cost;
3102
3103 /* loop cost for vec_loop. */
3104 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3105 stmt_info, 0, vect_body);
3106
3107 /* prologue cost for vec_init and vec_step. */
3108 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3109 stmt_info, 0, vect_prologue);
3110
3111 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "vect_model_induction_cost: inside_cost = %d, "
3114 "prologue_cost = %d .", inside_cost, prologue_cost);
3115 }
3116
3117
3118 /* Function get_initial_def_for_induction
3119
3120 Input:
3121 STMT - a stmt that performs an induction operation in the loop.
3122 IV_PHI - the loop-header phi of the induction variable
3123
3124 Output:
3125 Return a vector variable, initialized with the first VF values of
3126 the induction variable. E.g., for an iv with IV_PHI='X' and
3127 evolution S, for a vector of 4 units, we want to return:
3128 [X, X + S, X + 2*S, X + 3*S]. */
3129
3130 static tree
3131 get_initial_def_for_induction (gimple iv_phi)
3132 {
3133 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3134 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3135 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3136 tree scalar_type;
3137 tree vectype;
3138 int nunits;
3139 edge pe = loop_preheader_edge (loop);
3140 struct loop *iv_loop;
3141 basic_block new_bb;
3142 tree new_vec, vec_init, vec_step, t;
3143 tree access_fn;
3144 tree new_var;
3145 tree new_name;
3146 gimple init_stmt, induction_phi, new_stmt;
3147 tree induc_def, vec_def, vec_dest;
3148 tree init_expr, step_expr;
3149 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3150 int i;
3151 bool ok;
3152 int ncopies;
3153 tree expr;
3154 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3155 bool nested_in_vect_loop = false;
3156 gimple_seq stmts = NULL;
3157 imm_use_iterator imm_iter;
3158 use_operand_p use_p;
3159 gimple exit_phi;
3160 edge latch_e;
3161 tree loop_arg;
3162 gimple_stmt_iterator si;
3163 basic_block bb = gimple_bb (iv_phi);
3164 tree stepvectype;
3165 tree resvectype;
3166
3167 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3168 if (nested_in_vect_loop_p (loop, iv_phi))
3169 {
3170 nested_in_vect_loop = true;
3171 iv_loop = loop->inner;
3172 }
3173 else
3174 iv_loop = loop;
3175 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3176
3177 latch_e = loop_latch_edge (iv_loop);
3178 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3179
3180 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3181 gcc_assert (access_fn);
3182 STRIP_NOPS (access_fn);
3183 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3184 &init_expr, &step_expr);
3185 gcc_assert (ok);
3186 pe = loop_preheader_edge (iv_loop);
3187
3188 scalar_type = TREE_TYPE (init_expr);
3189 vectype = get_vectype_for_scalar_type (scalar_type);
3190 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3191 gcc_assert (vectype);
3192 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3193 ncopies = vf / nunits;
3194
3195 gcc_assert (phi_info);
3196 gcc_assert (ncopies >= 1);
3197
3198 /* Find the first insertion point in the BB. */
3199 si = gsi_after_labels (bb);
3200
3201 /* Create the vector that holds the initial_value of the induction. */
3202 if (nested_in_vect_loop)
3203 {
3204 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3205 been created during vectorization of previous stmts. We obtain it
3206 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3207 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3208 loop_preheader_edge (iv_loop));
3209 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3210 /* If the initial value is not of proper type, convert it. */
3211 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3212 {
3213 new_stmt = gimple_build_assign_with_ops
3214 (VIEW_CONVERT_EXPR,
3215 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3216 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3217 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3218 gimple_assign_set_lhs (new_stmt, vec_init);
3219 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3220 new_stmt);
3221 gcc_assert (!new_bb);
3222 set_vinfo_for_stmt (new_stmt,
3223 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3224 }
3225 }
3226 else
3227 {
3228 vec<constructor_elt, va_gc> *v;
3229
3230 /* iv_loop is the loop to be vectorized. Create:
3231 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3232 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3233 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3234 if (stmts)
3235 {
3236 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3237 gcc_assert (!new_bb);
3238 }
3239
3240 vec_alloc (v, nunits);
3241 bool constant_p = is_gimple_min_invariant (new_name);
3242 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3243 for (i = 1; i < nunits; i++)
3244 {
3245 /* Create: new_name_i = new_name + step_expr */
3246 enum tree_code code = POINTER_TYPE_P (scalar_type)
3247 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3248 new_name = fold_build2 (code, scalar_type, new_name, step_expr);
3249 if (!is_gimple_min_invariant (new_name))
3250 {
3251 init_stmt = gimple_build_assign (new_var, new_name);
3252 new_name = make_ssa_name (new_var, init_stmt);
3253 gimple_assign_set_lhs (init_stmt, new_name);
3254 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3255 gcc_assert (!new_bb);
3256 if (dump_enabled_p ())
3257 {
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "created new init_stmt: ");
3260 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3261 }
3262 constant_p = false;
3263 }
3264 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3265 }
3266 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3267 if (constant_p)
3268 new_vec = build_vector_from_ctor (vectype, v);
3269 else
3270 new_vec = build_constructor (vectype, v);
3271 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3272 }
3273
3274
3275 /* Create the vector that holds the step of the induction. */
3276 if (nested_in_vect_loop)
3277 /* iv_loop is nested in the loop to be vectorized. Generate:
3278 vec_step = [S, S, S, S] */
3279 new_name = step_expr;
3280 else
3281 {
3282 /* iv_loop is the loop to be vectorized. Generate:
3283 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3284 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3285 {
3286 expr = build_int_cst (integer_type_node, vf);
3287 expr = fold_convert (TREE_TYPE (step_expr), expr);
3288 }
3289 else
3290 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3291 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3292 expr, step_expr);
3293 if (TREE_CODE (step_expr) == SSA_NAME)
3294 new_name = vect_init_vector (iv_phi, new_name,
3295 TREE_TYPE (step_expr), NULL);
3296 }
3297
3298 t = unshare_expr (new_name);
3299 gcc_assert (CONSTANT_CLASS_P (new_name)
3300 || TREE_CODE (new_name) == SSA_NAME);
3301 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3302 gcc_assert (stepvectype);
3303 new_vec = build_vector_from_val (stepvectype, t);
3304 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3305
3306
3307 /* Create the following def-use cycle:
3308 loop prolog:
3309 vec_init = ...
3310 vec_step = ...
3311 loop:
3312 vec_iv = PHI <vec_init, vec_loop>
3313 ...
3314 STMT
3315 ...
3316 vec_loop = vec_iv + vec_step; */
3317
3318 /* Create the induction-phi that defines the induction-operand. */
3319 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3320 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3321 set_vinfo_for_stmt (induction_phi,
3322 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3323 induc_def = PHI_RESULT (induction_phi);
3324
3325 /* Create the iv update inside the loop */
3326 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3327 induc_def, vec_step);
3328 vec_def = make_ssa_name (vec_dest, new_stmt);
3329 gimple_assign_set_lhs (new_stmt, vec_def);
3330 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3331 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3332 NULL));
3333
3334 /* Set the arguments of the phi node: */
3335 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3336 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3337 UNKNOWN_LOCATION);
3338
3339
3340 /* In case the vectorization factor (VF) is bigger than the number
3341 of elements that we can fit in a vectype (nunits), we have to generate
3342 more than one vector stmt - i.e., we need to "unroll" the
3343 vector stmt by a factor VF/nunits. For more details see the documentation
3344 in vectorizable_operation. */
3345
3346 if (ncopies > 1)
3347 {
3348 stmt_vec_info prev_stmt_vinfo;
3349 /* FORNOW. This restriction should be relaxed. */
3350 gcc_assert (!nested_in_vect_loop);
3351
3352 /* Create the vector that holds the step of the induction. */
3353 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3354 {
3355 expr = build_int_cst (integer_type_node, nunits);
3356 expr = fold_convert (TREE_TYPE (step_expr), expr);
3357 }
3358 else
3359 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3360 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3361 expr, step_expr);
3362 if (TREE_CODE (step_expr) == SSA_NAME)
3363 new_name = vect_init_vector (iv_phi, new_name,
3364 TREE_TYPE (step_expr), NULL);
3365 t = unshare_expr (new_name);
3366 gcc_assert (CONSTANT_CLASS_P (new_name)
3367 || TREE_CODE (new_name) == SSA_NAME);
3368 new_vec = build_vector_from_val (stepvectype, t);
3369 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3370
3371 vec_def = induc_def;
3372 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3373 for (i = 1; i < ncopies; i++)
3374 {
3375 /* vec_i = vec_prev + vec_step */
3376 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3377 vec_def, vec_step);
3378 vec_def = make_ssa_name (vec_dest, new_stmt);
3379 gimple_assign_set_lhs (new_stmt, vec_def);
3380
3381 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3382 if (!useless_type_conversion_p (resvectype, vectype))
3383 {
3384 new_stmt = gimple_build_assign_with_ops
3385 (VIEW_CONVERT_EXPR,
3386 vect_get_new_vect_var (resvectype, vect_simple_var,
3387 "vec_iv_"),
3388 build1 (VIEW_CONVERT_EXPR, resvectype,
3389 gimple_assign_lhs (new_stmt)), NULL_TREE);
3390 gimple_assign_set_lhs (new_stmt,
3391 make_ssa_name
3392 (gimple_assign_lhs (new_stmt), new_stmt));
3393 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3394 }
3395 set_vinfo_for_stmt (new_stmt,
3396 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3397 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3398 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3399 }
3400 }
3401
3402 if (nested_in_vect_loop)
3403 {
3404 /* Find the loop-closed exit-phi of the induction, and record
3405 the final vector of induction results: */
3406 exit_phi = NULL;
3407 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3408 {
3409 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3410 {
3411 exit_phi = USE_STMT (use_p);
3412 break;
3413 }
3414 }
3415 if (exit_phi)
3416 {
3417 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3418 /* FORNOW. Currently not supporting the case that an inner-loop induction
3419 is not used in the outer-loop (i.e. only outside the outer-loop). */
3420 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3421 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3422
3423 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3424 if (dump_enabled_p ())
3425 {
3426 dump_printf_loc (MSG_NOTE, vect_location,
3427 "vector of inductions after inner-loop:");
3428 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3429 }
3430 }
3431 }
3432
3433
3434 if (dump_enabled_p ())
3435 {
3436 dump_printf_loc (MSG_NOTE, vect_location,
3437 "transform induction: created def-use cycle: ");
3438 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3439 dump_printf (MSG_NOTE, "\n");
3440 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3441 SSA_NAME_DEF_STMT (vec_def), 0);
3442 }
3443
3444 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3445 if (!useless_type_conversion_p (resvectype, vectype))
3446 {
3447 new_stmt = gimple_build_assign_with_ops
3448 (VIEW_CONVERT_EXPR,
3449 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3450 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3451 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3452 gimple_assign_set_lhs (new_stmt, induc_def);
3453 si = gsi_after_labels (bb);
3454 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3455 set_vinfo_for_stmt (new_stmt,
3456 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3457 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3458 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3459 }
3460
3461 return induc_def;
3462 }
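
A worked trace of the def-use cycle above, for a hypothetical induction with initial value X = 0, step S = 1, VF = 8 and a 4-element vector type (so nunits = 4 and ncopies = 2): vec_init = [0, 1, 2, 3]; the phi's latch update adds vec_step = [VF*S, ...] = [8, 8, 8, 8], while the second copy adds [nunits*S, ...] = [4, 4, 4, 4] to the phi result. The two copies therefore hold [0, 1, 2, 3] and [4, 5, 6, 7] in the first vector iteration, and [8, 9, 10, 11] and [12, 13, 14, 15] in the second.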
3463
3464
3465 /* Function get_initial_def_for_reduction
3466
3467 Input:
3468 STMT - a stmt that performs a reduction operation in the loop.
3469 INIT_VAL - the initial value of the reduction variable
3470
3471 Output:
3472 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3473 of the reduction (used for adjusting the epilog - see below).
3474 Return a vector variable, initialized according to the operation that STMT
3475 performs. This vector will be used as the initial value of the
3476 vector of partial results.
3477
3478 Option1 (adjust in epilog): Initialize the vector as follows:
3479 add/bit or/xor: [0,0,...,0,0]
3480 mult/bit and: [1,1,...,1,1]
3481 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3482 and when necessary (e.g. add/mult case) let the caller know
3483 that it needs to adjust the result by init_val.
3484
3485 Option2: Initialize the vector as follows:
3486 add/bit or/xor: [init_val,0,0,...,0]
3487 mult/bit and: [init_val,1,1,...,1]
3488 min/max/cond_expr: [init_val,init_val,...,init_val]
3489 and no adjustments are needed.
3490
3491 For example, for the following code:
3492
3493 s = init_val;
3494 for (i=0;i<n;i++)
3495 s = s + a[i];
3496
3497 STMT is 's = s + a[i]', and the reduction variable is 's'.
3498 For a vector of 4 units, we want to return either [0,0,0,init_val],
3499 or [0,0,0,0] and let the caller know that it needs to adjust
3500 the result at the end by 'init_val'.
3501
3502 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3503 is not NULL, because its initialization vector is simpler (the same element
3504 in all entries), and Option2 otherwise.
3505
3506 A cost model should help decide between these two schemes. */
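/* For instance, for a product reduction 'p = p * a[i]' with init_val = 5
   and a vector of 4 units, Option1 returns [1,1,1,1] and sets ADJUSTMENT_DEF
   to 5 (the final result is then multiplied by 5), while Option2 returns
   [5,1,1,1] and needs no further adjustment. */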
3507
3508 tree
3509 get_initial_def_for_reduction (gimple stmt, tree init_val,
3510 tree *adjustment_def)
3511 {
3512 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3513 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3514 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3515 tree scalar_type = TREE_TYPE (init_val);
3516 tree vectype = get_vectype_for_scalar_type (scalar_type);
3517 int nunits;
3518 enum tree_code code = gimple_assign_rhs_code (stmt);
3519 tree def_for_init;
3520 tree init_def;
3521 tree *elts;
3522 int i;
3523 bool nested_in_vect_loop = false;
3524 tree init_value;
3525 REAL_VALUE_TYPE real_init_val = dconst0;
3526 int int_init_val = 0;
3527 gimple def_stmt = NULL;
3528
3529 gcc_assert (vectype);
3530 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3531
3532 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3533 || SCALAR_FLOAT_TYPE_P (scalar_type));
3534
3535 if (nested_in_vect_loop_p (loop, stmt))
3536 nested_in_vect_loop = true;
3537 else
3538 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3539
3540 /* In case of double reduction we only create a vector variable to be put
3541 in the reduction phi node. The actual statement creation is done in
3542 vect_create_epilog_for_reduction. */
3543 if (adjustment_def && nested_in_vect_loop
3544 && TREE_CODE (init_val) == SSA_NAME
3545 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3546 && gimple_code (def_stmt) == GIMPLE_PHI
3547 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3548 && vinfo_for_stmt (def_stmt)
3549 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3550 == vect_double_reduction_def)
3551 {
3552 *adjustment_def = NULL;
3553 return vect_create_destination_var (init_val, vectype);
3554 }
3555
3556 if (TREE_CONSTANT (init_val))
3557 {
3558 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3559 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3560 else
3561 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3562 }
3563 else
3564 init_value = init_val;
3565
3566 switch (code)
3567 {
3568 case WIDEN_SUM_EXPR:
3569 case DOT_PROD_EXPR:
3570 case PLUS_EXPR:
3571 case MINUS_EXPR:
3572 case BIT_IOR_EXPR:
3573 case BIT_XOR_EXPR:
3574 case MULT_EXPR:
3575 case BIT_AND_EXPR:
3576 /* ADJUSTMENT_DEF is NULL when called from
3577 vect_create_epilog_for_reduction to vectorize double reduction. */
3578 if (adjustment_def)
3579 {
3580 if (nested_in_vect_loop)
3581 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3582 NULL);
3583 else
3584 *adjustment_def = init_val;
3585 }
3586
3587 if (code == MULT_EXPR)
3588 {
3589 real_init_val = dconst1;
3590 int_init_val = 1;
3591 }
3592
3593 if (code == BIT_AND_EXPR)
3594 int_init_val = -1;
3595
3596 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3597 def_for_init = build_real (scalar_type, real_init_val);
3598 else
3599 def_for_init = build_int_cst (scalar_type, int_init_val);
3600
3601 /* Fill all elements but the first with '0' or '1'; the first is set below. */
3602 elts = XALLOCAVEC (tree, nunits);
3603 for (i = nunits - 2; i >= 0; --i)
3604 elts[i + 1] = def_for_init;
3605
3606 /* Option1: the first element is '0' or '1' as well. */
3607 if (adjustment_def)
3608 {
3609 elts[0] = def_for_init;
3610 init_def = build_vector (vectype, elts);
3611 break;
3612 }
3613
3614 /* Option2: the first element is INIT_VAL. */
3615 elts[0] = init_val;
3616 if (TREE_CONSTANT (init_val))
3617 init_def = build_vector (vectype, elts);
3618 else
3619 {
3620 vec<constructor_elt, va_gc> *v;
3621 vec_alloc (v, nunits);
3622 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3623 for (i = 1; i < nunits; ++i)
3624 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3625 init_def = build_constructor (vectype, v);
3626 }
3627
3628 break;
3629
3630 case MIN_EXPR:
3631 case MAX_EXPR:
3632 case COND_EXPR:
3633 if (adjustment_def)
3634 {
3635 *adjustment_def = NULL_TREE;
3636 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3637 break;
3638 }
3639
3640 init_def = build_vector_from_val (vectype, init_value);
3641 break;
3642
3643 default:
3644 gcc_unreachable ();
3645 }
3646
3647 return init_def;
3648 }
3649
3650
3651 /* Function vect_create_epilog_for_reduction
3652
3653 Create code at the loop-epilog to finalize the result of a reduction
3654 computation.
3655
3656 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3657 reduction statements.
3658 STMT is the scalar reduction stmt that is being vectorized.
3659 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3660 number of elements that we can fit in a vectype (nunits). In this case
3661 we have to generate more than one vector stmt - i.e - we need to "unroll"
3662 the vector stmt by a factor VF/nunits. For more details see documentation
3663 in vectorizable_operation.
3664 REDUC_CODE is the tree-code for the epilog reduction.
3665 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3666 computation.
3667 REDUC_INDEX is the index of the operand in the right hand side of the
3668 statement that is defined by REDUCTION_PHI.
3669 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3670 SLP_NODE is an SLP node containing a group of reduction statements. The
3671 first one in this group is STMT.
3672
3673 This function:
3674 1. Creates the reduction def-use cycles: sets the arguments for
3675 REDUCTION_PHIS:
3676 The loop-entry argument is the vectorized initial-value of the reduction.
3677 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3678 sums.
3679 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3680 by applying the operation specified by REDUC_CODE if available, or by
3681 other means (whole-vector shifts or a scalar loop).
3682 The function also creates a new phi node at the loop exit to preserve
3683 loop-closed form, as illustrated below.
3684
3685 The flow at the entry to this function:
3686
3687 loop:
3688 vec_def = phi <null, null> # REDUCTION_PHI
3689 VECT_DEF = vector_stmt # vectorized form of STMT
3690 s_loop = scalar_stmt # (scalar) STMT
3691 loop_exit:
3692 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3693 use <s_out0>
3694 use <s_out0>
3695
3696 The above is transformed by this function into:
3697
3698 loop:
3699 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3700 VECT_DEF = vector_stmt # vectorized form of STMT
3701 s_loop = scalar_stmt # (scalar) STMT
3702 loop_exit:
3703 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3704 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3705 v_out2 = reduce <v_out1>
3706 s_out3 = extract_field <v_out2, 0>
3707 s_out4 = adjust_result <s_out3>
3708 use <s_out4>
3709 use <s_out4>
3710 */
3711
3712 static void
3713 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3714 int ncopies, enum tree_code reduc_code,
3715 vec<gimple> reduction_phis,
3716 int reduc_index, bool double_reduc,
3717 slp_tree slp_node)
3718 {
3719 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3720 stmt_vec_info prev_phi_info;
3721 tree vectype;
3722 enum machine_mode mode;
3723 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3724 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3725 basic_block exit_bb;
3726 tree scalar_dest;
3727 tree scalar_type;
3728 gimple new_phi = NULL, phi;
3729 gimple_stmt_iterator exit_gsi;
3730 tree vec_dest;
3731 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3732 gimple epilog_stmt = NULL;
3733 enum tree_code code = gimple_assign_rhs_code (stmt);
3734 gimple exit_phi;
3735 tree bitsize, bitpos;
3736 tree adjustment_def = NULL;
3737 tree vec_initial_def = NULL;
3738 tree reduction_op, expr, def;
3739 tree orig_name, scalar_result;
3740 imm_use_iterator imm_iter, phi_imm_iter;
3741 use_operand_p use_p, phi_use_p;
3742 bool extract_scalar_result = false;
3743 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3744 bool nested_in_vect_loop = false;
3745 vec<gimple> new_phis = vNULL;
3746 vec<gimple> inner_phis = vNULL;
3747 enum vect_def_type dt = vect_unknown_def_type;
3748 int j, i;
3749 vec<tree> scalar_results = vNULL;
3750 unsigned int group_size = 1, k, ratio;
3751 vec<tree> vec_initial_defs = vNULL;
3752 vec<gimple> phis;
3753 bool slp_reduc = false;
3754 tree new_phi_result;
3755 gimple inner_phi = NULL;
3756
3757 if (slp_node)
3758 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3759
3760 if (nested_in_vect_loop_p (loop, stmt))
3761 {
3762 outer_loop = loop;
3763 loop = loop->inner;
3764 nested_in_vect_loop = true;
3765 gcc_assert (!slp_node);
3766 }
3767
3768 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3769 {
3770 case GIMPLE_SINGLE_RHS:
3771 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3772 == ternary_op);
3773 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3774 break;
3775 case GIMPLE_UNARY_RHS:
3776 reduction_op = gimple_assign_rhs1 (stmt);
3777 break;
3778 case GIMPLE_BINARY_RHS:
3779 reduction_op = reduc_index ?
3780 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3781 break;
3782 case GIMPLE_TERNARY_RHS:
3783 reduction_op = gimple_op (stmt, reduc_index + 1);
3784 break;
3785 default:
3786 gcc_unreachable ();
3787 }
3788
3789 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3790 gcc_assert (vectype);
3791 mode = TYPE_MODE (vectype);
3792
3793 /* 1. Create the reduction def-use cycle:
3794 Set the arguments of REDUCTION_PHIS, i.e., transform
3795
3796 loop:
3797 vec_def = phi <null, null> # REDUCTION_PHI
3798 VECT_DEF = vector_stmt # vectorized form of STMT
3799 ...
3800
3801 into:
3802
3803 loop:
3804 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3805 VECT_DEF = vector_stmt # vectorized form of STMT
3806 ...
3807
3808 (in case of SLP, do it for all the phis). */
3809
3810 /* Get the loop-entry arguments. */
3811 if (slp_node)
3812 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3813 NULL, slp_node, reduc_index);
3814 else
3815 {
3816 vec_initial_defs.create (1);
3817 /* For the case of reduction, vect_get_vec_def_for_operand returns
3818 the scalar def before the loop, which defines the initial value
3819 of the reduction variable. */
3820 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3821 &adjustment_def);
3822 vec_initial_defs.quick_push (vec_initial_def);
3823 }
3824
3825 /* Set phi nodes arguments. */
3826 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3827 {
3828 tree vec_init_def = vec_initial_defs[i];
3829 tree def = vect_defs[i];
3830 for (j = 0; j < ncopies; j++)
3831 {
3832 /* Set the loop-entry arg of the reduction-phi. */
3833 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3834 UNKNOWN_LOCATION);
3835
3836 /* Set the loop-latch arg for the reduction-phi. */
3837 if (j > 0)
3838 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3839
3840 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3841
3842 if (dump_enabled_p ())
3843 {
3844 dump_printf_loc (MSG_NOTE, vect_location,
3845 "transform reduction: created def-use cycle: ");
3846 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3847 dump_printf (MSG_NOTE, "\n");
3848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3849 }
3850
3851 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3852 }
3853 }
3854
3855 vec_initial_defs.release ();
3856
3857 /* 2. Create epilog code.
3858 The reduction epilog code operates across the elements of the vector
3859 of partial results computed by the vectorized loop.
3860 The reduction epilog code consists of:
3861
3862 step 1: compute the scalar result in a vector (v_out2)
3863 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3864 step 3: adjust the scalar result (s_out3) if needed.
3865
3866 Step 1 can be accomplished using one of the following three schemes:
3867 (scheme 1) using reduc_code, if available.
3868 (scheme 2) using whole-vector shifts, if available.
3869 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3870 combined.
3871
3872 The overall epilog code looks like this:
3873
3874 s_out0 = phi <s_loop> # original EXIT_PHI
3875 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3876 v_out2 = reduce <v_out1> # step 1
3877 s_out3 = extract_field <v_out2, 0> # step 2
3878 s_out4 = adjust_result <s_out3> # step 3
3879
3880 (step 3 is optional, and steps 1 and 2 may be combined).
3881 Lastly, the uses of s_out0 are replaced by s_out4. */
3882
3883
3884 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3885 v_out1 = phi <VECT_DEF>
3886 Store them in NEW_PHIS. */
3887
3888 exit_bb = single_exit (loop)->dest;
3889 prev_phi_info = NULL;
3890 new_phis.create (vect_defs.length ());
3891 FOR_EACH_VEC_ELT (vect_defs, i, def)
3892 {
3893 for (j = 0; j < ncopies; j++)
3894 {
3895 tree new_def = copy_ssa_name (def, NULL);
3896 phi = create_phi_node (new_def, exit_bb);
3897 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3898 if (j == 0)
3899 new_phis.quick_push (phi);
3900 else
3901 {
3902 def = vect_get_vec_def_for_stmt_copy (dt, def);
3903 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3904 }
3905
3906 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3907 prev_phi_info = vinfo_for_stmt (phi);
3908 }
3909 }
3910
3911 /* The epilogue is created for the outer-loop, i.e., for the loop being
3912 vectorized. Create exit phis for the outer loop. */
3913 if (double_reduc)
3914 {
3915 loop = outer_loop;
3916 exit_bb = single_exit (loop)->dest;
3917 inner_phis.create (vect_defs.length ());
3918 FOR_EACH_VEC_ELT (new_phis, i, phi)
3919 {
3920 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3921 gimple outer_phi = create_phi_node (new_result, exit_bb);
3922 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3923 PHI_RESULT (phi));
3924 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3925 loop_vinfo, NULL));
3926 inner_phis.quick_push (phi);
3927 new_phis[i] = outer_phi;
3928 prev_phi_info = vinfo_for_stmt (outer_phi);
3929 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3930 {
3931 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3932 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3933 outer_phi = create_phi_node (new_result, exit_bb);
3934 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3935 PHI_RESULT (phi));
3936 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3937 loop_vinfo, NULL));
3938 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3939 prev_phi_info = vinfo_for_stmt (outer_phi);
3940 }
3941 }
3942 }
3943
3944 exit_gsi = gsi_after_labels (exit_bb);
3945
3946 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3947 (i.e. when reduc_code is not available) and in the final adjustment
3948 code (if needed). Also get the original scalar reduction variable as
3949 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3950 represents a reduction pattern), the tree-code and scalar-def are
3951 taken from the original stmt that the pattern-stmt (STMT) replaces.
3952 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3953 are taken from STMT. */
3954
3955 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3956 if (!orig_stmt)
3957 {
3958 /* Regular reduction */
3959 orig_stmt = stmt;
3960 }
3961 else
3962 {
3963 /* Reduction pattern */
3964 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3965 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3966 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3967 }
3968
3969 code = gimple_assign_rhs_code (orig_stmt);
3970 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3971 partial results are added and not subtracted. */
3972 if (code == MINUS_EXPR)
3973 code = PLUS_EXPR;
3974
3975 scalar_dest = gimple_assign_lhs (orig_stmt);
3976 scalar_type = TREE_TYPE (scalar_dest);
3977 scalar_results.create (group_size);
3978 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3979 bitsize = TYPE_SIZE (scalar_type);
3980
3981 /* In case this is a reduction in an inner-loop while vectorizing an outer
3982 loop - we don't need to extract a single scalar result at the end of the
3983 inner-loop (unless it is double reduction, i.e., the use of reduction is
3984 outside the outer-loop). The final vector of partial results will be used
3985 in the vectorized outer-loop, or reduced to a scalar result at the end of
3986 the outer-loop. */
3987 if (nested_in_vect_loop && !double_reduc)
3988 goto vect_finalize_reduction;
3989
3990 /* SLP reduction without reduction chain, e.g.,
3991 # a1 = phi <a2, a0>
3992 # b1 = phi <b2, b0>
3993 a2 = operation (a1)
3994 b2 = operation (b1) */
3995 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3996
3997 /* In case of reduction chain, e.g.,
3998 # a1 = phi <a3, a0>
3999 a2 = operation (a1)
4000 a3 = operation (a2),
4001
4002 we may end up with more than one vector result. Here we reduce them to
4003 one vector. */
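/* For example, with two vector results v0 and v1 in NEW_PHIS, the code
   below emits 'v = v0 OP v1' (OP being the reduction operation) and
   continues the epilog with the single combined vector v. */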
4004 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4005 {
4006 tree first_vect = PHI_RESULT (new_phis[0]);
4007 tree tmp;
4008 gimple new_vec_stmt = NULL;
4009
4010 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4011 for (k = 1; k < new_phis.length (); k++)
4012 {
4013 gimple next_phi = new_phis[k];
4014 tree second_vect = PHI_RESULT (next_phi);
4015
4016 tmp = build2 (code, vectype, first_vect, second_vect);
4017 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4018 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4019 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4020 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4021 }
4022
4023 new_phi_result = first_vect;
4024 if (new_vec_stmt)
4025 {
4026 new_phis.truncate (0);
4027 new_phis.safe_push (new_vec_stmt);
4028 }
4029 }
4030 else
4031 new_phi_result = PHI_RESULT (new_phis[0]);
4032
4033 /* 2.3 Create the reduction code, using one of the three schemes described
4034 above. In SLP we simply need to extract all the elements from the
4035 vector (without reducing them), so we use scalar shifts. */
4036 if (reduc_code != ERROR_MARK && !slp_reduc)
4037 {
4038 tree tmp;
4039
4040 /*** Case 1: Create:
4041 v_out2 = reduc_expr <v_out1> */
4042
4043 if (dump_enabled_p ())
4044 dump_printf_loc (MSG_NOTE, vect_location,
4045 "Reduce using direct vector reduction.");
4046
4047 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4048 tmp = build1 (reduc_code, vectype, new_phi_result);
4049 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4050 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4051 gimple_assign_set_lhs (epilog_stmt, new_temp);
4052 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4053
4054 extract_scalar_result = true;
4055 }
4056 else
4057 {
4058 enum tree_code shift_code = ERROR_MARK;
4059 bool have_whole_vector_shift = true;
4060 int bit_offset;
4061 int element_bitsize = tree_low_cst (bitsize, 1);
4062 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4063 tree vec_temp;
4064
4065 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4066 shift_code = VEC_RSHIFT_EXPR;
4067 else
4068 have_whole_vector_shift = false;
4069
4070 /* Regardless of whether we have a whole vector shift, if we're
4071 emulating the operation via tree-vect-generic, we don't want
4072 to use it. Only the first round of the reduction is likely
4073 to still be profitable via emulation. */
4074 /* ??? It might be better to emit a reduction tree code here, so that
4075 tree-vect-generic can expand the first round via bit tricks. */
4076 if (!VECTOR_MODE_P (mode))
4077 have_whole_vector_shift = false;
4078 else
4079 {
4080 optab optab = optab_for_tree_code (code, vectype, optab_default);
4081 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4082 have_whole_vector_shift = false;
4083 }
4084
4085 if (have_whole_vector_shift && !slp_reduc)
4086 {
4087 /*** Case 2: Create:
4088 for (offset = VS/2; offset >= element_size; offset/=2)
4089 {
4090 Create: va' = vec_shift <va, offset>
4091 Create: va = vop <va, va'>
4092 } */
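/* For example, for a 128-bit vector of four ints the loop below iterates
   twice: shift by 64 bits and combine, then shift by 32 bits and combine.
   The element extracted in step 2.4 below then holds the reduction of all
   four original elements. */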
4093
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_NOTE, vect_location,
4096 "Reduce using vector shifts");
4097
4098 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4099 new_temp = new_phi_result;
4100 for (bit_offset = vec_size_in_bits/2;
4101 bit_offset >= element_bitsize;
4102 bit_offset /= 2)
4103 {
4104 tree bitpos = size_int (bit_offset);
4105
4106 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4107 vec_dest, new_temp, bitpos);
4108 new_name = make_ssa_name (vec_dest, epilog_stmt);
4109 gimple_assign_set_lhs (epilog_stmt, new_name);
4110 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4111
4112 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4113 new_name, new_temp);
4114 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4115 gimple_assign_set_lhs (epilog_stmt, new_temp);
4116 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4117 }
4118
4119 extract_scalar_result = true;
4120 }
4121 else
4122 {
4123 tree rhs;
4124
4125 /*** Case 3: Create:
4126 s = extract_field <v_out2, 0>
4127 for (offset = element_size;
4128 offset < vector_size;
4129 offset += element_size;)
4130 {
4131 Create: s' = extract_field <v_out2, offset>
4132 Create: s = op <s, s'> // For non SLP cases
4133 } */
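/* For example, for a vector of four ints this emits (in the non-SLP case):
   s = v[0]; s = op <s, v[1]>; s = op <s, v[2]>; s = op <s, v[3]>,
   where each element is extracted with a BIT_FIELD_REF. */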
4134
4135 if (dump_enabled_p ())
4136 dump_printf_loc (MSG_NOTE, vect_location,
4137 "Reduce using scalar code. ");
4138
4139 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4140 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4141 {
4142 if (gimple_code (new_phi) == GIMPLE_PHI)
4143 vec_temp = PHI_RESULT (new_phi);
4144 else
4145 vec_temp = gimple_assign_lhs (new_phi);
4146 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4147 bitsize_zero_node);
4148 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4149 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4150 gimple_assign_set_lhs (epilog_stmt, new_temp);
4151 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4152
4153 /* In SLP we don't need to apply reduction operation, so we just
4154 collect s' values in SCALAR_RESULTS. */
4155 if (slp_reduc)
4156 scalar_results.safe_push (new_temp);
4157
4158 for (bit_offset = element_bitsize;
4159 bit_offset < vec_size_in_bits;
4160 bit_offset += element_bitsize)
4161 {
4162 tree bitpos = bitsize_int (bit_offset);
4163 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4164 bitsize, bitpos);
4165
4166 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4167 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4168 gimple_assign_set_lhs (epilog_stmt, new_name);
4169 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4170
4171 if (slp_reduc)
4172 {
4173 /* In SLP we don't need to apply reduction operation, so
4174 we just collect s' values in SCALAR_RESULTS. */
4175 new_temp = new_name;
4176 scalar_results.safe_push (new_name);
4177 }
4178 else
4179 {
4180 epilog_stmt = gimple_build_assign_with_ops (code,
4181 new_scalar_dest, new_name, new_temp);
4182 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4183 gimple_assign_set_lhs (epilog_stmt, new_temp);
4184 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4185 }
4186 }
4187 }
4188
4189 /* The only case where we need to reduce scalar results in SLP is
4190 unrolling. If the size of SCALAR_RESULTS is greater than
4191 GROUP_SIZE, we reduce them by combining elements modulo
4192 GROUP_SIZE. */
4193 if (slp_reduc)
4194 {
4195 tree res, first_res, new_res;
4196 gimple new_stmt;
4197
4198 /* Reduce multiple scalar results in case of SLP unrolling. */
4199 for (j = group_size; scalar_results.iterate (j, &res);
4200 j++)
4201 {
4202 first_res = scalar_results[j % group_size];
4203 new_stmt = gimple_build_assign_with_ops (code,
4204 new_scalar_dest, first_res, res);
4205 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4206 gimple_assign_set_lhs (new_stmt, new_res);
4207 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4208 scalar_results[j % group_size] = new_res;
4209 }
4210 }
4211 else
4212 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4213 scalar_results.safe_push (new_temp);
4214
4215 extract_scalar_result = false;
4216 }
4217 }
4218
4219 /* 2.4 Extract the final scalar result. Create:
4220 s_out3 = extract_field <v_out2, bitpos> */
4221
4222 if (extract_scalar_result)
4223 {
4224 tree rhs;
4225
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_NOTE, vect_location,
4228 "extract scalar result");
4229
4230 if (BYTES_BIG_ENDIAN)
4231 bitpos = size_binop (MULT_EXPR,
4232 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4233 TYPE_SIZE (scalar_type));
4234 else
4235 bitpos = bitsize_zero_node;
4236
4237 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4238 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4239 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4240 gimple_assign_set_lhs (epilog_stmt, new_temp);
4241 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4242 scalar_results.safe_push (new_temp);
4243 }
4244
4245 vect_finalize_reduction:
4246
4247 if (double_reduc)
4248 loop = loop->inner;
4249
4250 /* 2.5 Adjust the final result by the initial value of the reduction
4251 variable. (When such adjustment is not needed, then
4252 'adjustment_def' is zero). For example, if code is PLUS we create:
4253 new_temp = loop_exit_def + adjustment_def */
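/* For example, for 's = 5; for (i=0; i<n; i++) s += a[i];' the vector of
   partial sums was initialized to [0,0,0,0] (Option1), so here we emit
   'new_temp = loop_exit_def + 5' to add the initial value back in. */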
4254
4255 if (adjustment_def)
4256 {
4257 gcc_assert (!slp_reduc);
4258 if (nested_in_vect_loop)
4259 {
4260 new_phi = new_phis[0];
4261 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4262 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4263 new_dest = vect_create_destination_var (scalar_dest, vectype);
4264 }
4265 else
4266 {
4267 new_temp = scalar_results[0];
4268 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4269 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4270 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4271 }
4272
4273 epilog_stmt = gimple_build_assign (new_dest, expr);
4274 new_temp = make_ssa_name (new_dest, epilog_stmt);
4275 gimple_assign_set_lhs (epilog_stmt, new_temp);
4276 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4278 if (nested_in_vect_loop)
4279 {
4280 set_vinfo_for_stmt (epilog_stmt,
4281 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4282 NULL));
4283 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4284 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4285
4286 if (!double_reduc)
4287 scalar_results.quick_push (new_temp);
4288 else
4289 scalar_results[0] = new_temp;
4290 }
4291 else
4292 scalar_results[0] = new_temp;
4293
4294 new_phis[0] = epilog_stmt;
4295 }
4296
4297 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4298 phis with new adjusted scalar results, i.e., replace use <s_out0>
4299 with use <s_out4>.
4300
4301 Transform:
4302 loop_exit:
4303 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4304 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4305 v_out2 = reduce <v_out1>
4306 s_out3 = extract_field <v_out2, 0>
4307 s_out4 = adjust_result <s_out3>
4308 use <s_out0>
4309 use <s_out0>
4310
4311 into:
4312
4313 loop_exit:
4314 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4315 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4316 v_out2 = reduce <v_out1>
4317 s_out3 = extract_field <v_out2, 0>
4318 s_out4 = adjust_result <s_out3>
4319 use <s_out4>
4320 use <s_out4> */
4321
4322
4323 /* In SLP reduction chain we reduce vector results into one vector if
4324 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4325 the last stmt in the reduction chain, since we are looking for the loop
4326 exit phi node. */
4327 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4328 {
4329 scalar_dest = gimple_assign_lhs (
4330 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4331 group_size = 1;
4332 }
4333
4334 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4335 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4336 need to match SCALAR_RESULTS with corresponding statements. The first
4337 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4338 the first vector stmt, etc.
4339 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
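/* For example, with GROUP_SIZE 8 and two new vector stmts, RATIO is 4:
   scalar results 0-3 are matched with the first vector stmt and scalar
   results 4-7 with the second. */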
4340 if (group_size > new_phis.length ())
4341 {
4342 ratio = group_size / new_phis.length ();
4343 gcc_assert (!(group_size % new_phis.length ()));
4344 }
4345 else
4346 ratio = 1;
4347
4348 for (k = 0; k < group_size; k++)
4349 {
4350 if (k % ratio == 0)
4351 {
4352 epilog_stmt = new_phis[k / ratio];
4353 reduction_phi = reduction_phis[k / ratio];
4354 if (double_reduc)
4355 inner_phi = inner_phis[k / ratio];
4356 }
4357
4358 if (slp_reduc)
4359 {
4360 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4361
4362 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4363 /* SLP statements can't participate in patterns. */
4364 gcc_assert (!orig_stmt);
4365 scalar_dest = gimple_assign_lhs (current_stmt);
4366 }
4367
4368 phis.create (3);
4369 /* Find the loop-closed-use at the loop exit of the original scalar
4370 result. (The reduction result is expected to have two immediate uses -
4371 one at the latch block, and one at the loop exit). */
4372 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4373 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4374 phis.safe_push (USE_STMT (use_p));
4375
4376 /* While we expect to have found an exit_phi because of loop-closed-ssa
4377 form, we can end up without one if the scalar cycle is dead. */
4378
4379 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4380 {
4381 if (outer_loop)
4382 {
4383 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4384 gimple vect_phi;
4385
4386 /* FORNOW. Currently not supporting the case that an inner-loop
4387 reduction is not used in the outer-loop (but only outside the
4388 outer-loop), unless it is double reduction. */
4389 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4390 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4391 || double_reduc);
4392
4393 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4394 if (!double_reduc
4395 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4396 != vect_double_reduction_def)
4397 continue;
4398
4399 /* Handle double reduction:
4400
4401 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4402 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4403 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4404 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4405
4406 At that point the regular reduction (stmt2 and stmt3) is
4407 already vectorized, as well as the exit phi node, stmt4.
4408 Here we vectorize the phi node of double reduction, stmt1, and
4409 update all relevant statements. */
4410
4411 /* Go through all the uses of s2 to find double reduction phi
4412 node, i.e., stmt1 above. */
4413 orig_name = PHI_RESULT (exit_phi);
4414 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4415 {
4416 stmt_vec_info use_stmt_vinfo;
4417 stmt_vec_info new_phi_vinfo;
4418 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4419 basic_block bb = gimple_bb (use_stmt);
4420 gimple use;
4421
4422 /* Check that USE_STMT is really double reduction phi
4423 node. */
4424 if (gimple_code (use_stmt) != GIMPLE_PHI
4425 || gimple_phi_num_args (use_stmt) != 2
4426 || bb->loop_father != outer_loop)
4427 continue;
4428 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4429 if (!use_stmt_vinfo
4430 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4431 != vect_double_reduction_def)
4432 continue;
4433
4434 /* Create vector phi node for double reduction:
4435 vs1 = phi <vs0, vs2>
4436 vs1 was created previously in this function by a call to
4437 vect_get_vec_def_for_operand and is stored in
4438 vec_initial_def;
4439 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4440 vs0 is created here. */
4441
4442 /* Create vector phi node. */
4443 vect_phi = create_phi_node (vec_initial_def, bb);
4444 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4445 loop_vec_info_for_loop (outer_loop), NULL);
4446 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4447
4448 /* Create vs0 - initial def of the double reduction phi. */
4449 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4450 loop_preheader_edge (outer_loop));
4451 init_def = get_initial_def_for_reduction (stmt,
4452 preheader_arg, NULL);
4453 vect_phi_init = vect_init_vector (use_stmt, init_def,
4454 vectype, NULL);
4455
4456 /* Update phi node arguments with vs0 and vs2. */
4457 add_phi_arg (vect_phi, vect_phi_init,
4458 loop_preheader_edge (outer_loop),
4459 UNKNOWN_LOCATION);
4460 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4461 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4462 if (dump_enabled_p ())
4463 {
4464 dump_printf_loc (MSG_NOTE, vect_location,
4465 "created double reduction phi node: ");
4466 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4467 }
4468
4469 vect_phi_res = PHI_RESULT (vect_phi);
4470
4471 /* Replace the use, i.e., set the correct vs1 in the regular
4472 reduction phi node. FORNOW, NCOPIES is always 1, so the
4473 loop is redundant. */
4474 use = reduction_phi;
4475 for (j = 0; j < ncopies; j++)
4476 {
4477 edge pr_edge = loop_preheader_edge (loop);
4478 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4479 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4480 }
4481 }
4482 }
4483 }
4484
4485 phis.release ();
4486 if (nested_in_vect_loop)
4487 {
4488 if (double_reduc)
4489 loop = outer_loop;
4490 else
4491 continue;
4492 }
4493
4494 phis.create (3);
4495 /* Find the loop-closed-use at the loop exit of the original scalar
4496 result. (The reduction result is expected to have two immediate uses,
4497 one at the latch block, and one at the loop exit). For double
4498 reductions we are looking for exit phis of the outer loop. */
4499 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4500 {
4501 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4502 phis.safe_push (USE_STMT (use_p));
4503 else
4504 {
4505 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4506 {
4507 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4508
4509 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4510 {
4511 if (!flow_bb_inside_loop_p (loop,
4512 gimple_bb (USE_STMT (phi_use_p))))
4513 phis.safe_push (USE_STMT (phi_use_p));
4514 }
4515 }
4516 }
4517 }
4518
4519 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4520 {
4521 /* Replace the uses: */
4522 orig_name = PHI_RESULT (exit_phi);
4523 scalar_result = scalar_results[k];
4524 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4525 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4526 SET_USE (use_p, scalar_result);
4527 }
4528
4529 phis.release ();
4530 }
4531
4532 scalar_results.release ();
4533 inner_phis.release ();
4534 new_phis.release ();
4535 }
4536
4537
4538 /* Function vectorizable_reduction.
4539
4540 Check if STMT performs a reduction operation that can be vectorized.
4541 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4542 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4543 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4544
4545 This function also handles reduction idioms (patterns) that have been
4546 recognized in advance during vect_pattern_recog. In this case, STMT may be
4547 of this form:
4548 X = pattern_expr (arg0, arg1, ..., X)
4549 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4550 sequence that had been detected and replaced by the pattern-stmt (STMT).
4551
4552 In some cases of reduction patterns, the type of the reduction variable X is
4553 different than the type of the other arguments of STMT.
4554 In such cases, the vectype that is used when transforming STMT into a vector
4555 stmt is different than the vectype that is used to determine the
4556 vectorization factor, because it consists of a different number of elements
4557 than the actual number of elements that are being operated upon in parallel.
4558
4559 For example, consider an accumulation of shorts into an int accumulator.
4560 On some targets it's possible to vectorize this pattern operating on 8
4561 shorts at a time (hence, the vectype for purposes of determining the
4562 vectorization factor should be V8HI); on the other hand, the vectype that
4563 is used to create the vector form is actually V4SI (the type of the result).
4564
4565 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4566 indicates what is the actual level of parallelism (V8HI in the example), so
4567 that the right vectorization factor would be derived. This vectype
4568 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4569 be used to create the vectorized stmt. The right vectype for the vectorized
4570 stmt is obtained from the type of the result X:
4571 get_vectype_for_scalar_type (TREE_TYPE (X))
4572
4573 This means that, contrary to "regular" reductions (or "regular" stmts in
4574 general), the following equation:
4575 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4576 does *NOT* necessarily hold for reduction patterns. */
4577
4578 bool
4579 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4580 gimple *vec_stmt, slp_tree slp_node)
4581 {
4582 tree vec_dest;
4583 tree scalar_dest;
4584 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4585 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4586 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4587 tree vectype_in = NULL_TREE;
4588 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4589 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4590 enum tree_code code, orig_code, epilog_reduc_code;
4591 enum machine_mode vec_mode;
4592 int op_type;
4593 optab optab, reduc_optab;
4594 tree new_temp = NULL_TREE;
4595 tree def;
4596 gimple def_stmt;
4597 enum vect_def_type dt;
4598 gimple new_phi = NULL;
4599 tree scalar_type;
4600 bool is_simple_use;
4601 gimple orig_stmt;
4602 stmt_vec_info orig_stmt_info;
4603 tree expr = NULL_TREE;
4604 int i;
4605 int ncopies;
4606 int epilog_copies;
4607 stmt_vec_info prev_stmt_info, prev_phi_info;
4608 bool single_defuse_cycle = false;
4609 tree reduc_def = NULL_TREE;
4610 gimple new_stmt = NULL;
4611 int j;
4612 tree ops[3];
4613 bool nested_cycle = false, found_nested_cycle_def = false;
4614 gimple reduc_def_stmt = NULL;
4615 /* The default is that the reduction variable is the last operand in the statement. */
4616 int reduc_index = 2;
4617 bool double_reduc = false, dummy;
4618 basic_block def_bb;
4619 struct loop * def_stmt_loop, *outer_loop = NULL;
4620 tree def_arg;
4621 gimple def_arg_stmt;
4622 vec<tree> vec_oprnds0 = vNULL;
4623 vec<tree> vec_oprnds1 = vNULL;
4624 vec<tree> vect_defs = vNULL;
4625 vec<gimple> phis = vNULL;
4626 int vec_num;
4627 tree def0, def1, tem, op0, op1 = NULL_TREE;
4628
4629 /* In case of reduction chain we switch to the first stmt in the chain, but
4630 we don't update STMT_INFO, since only the last stmt is marked as reduction
4631 and has reduction properties. */
4632 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4633 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4634
4635 if (nested_in_vect_loop_p (loop, stmt))
4636 {
4637 outer_loop = loop;
4638 loop = loop->inner;
4639 nested_cycle = true;
4640 }
4641
4642 /* 1. Is vectorizable reduction? */
4643 /* Not supportable if the reduction variable is used in the loop, unless
4644 it's a reduction chain. */
4645 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4646 && !GROUP_FIRST_ELEMENT (stmt_info))
4647 return false;
4648
4649 /* Reductions that are not used even in an enclosing outer-loop,
4650 are expected to be "live" (used out of the loop). */
4651 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4652 && !STMT_VINFO_LIVE_P (stmt_info))
4653 return false;
4654
4655 /* Make sure it was already recognized as a reduction computation. */
4656 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4657 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4658 return false;
4659
4660 /* 2. Has this been recognized as a reduction pattern?
4661
4662 Check if STMT represents a pattern that has been recognized
4663 in earlier analysis stages. For stmts that represent a pattern,
4664 the STMT_VINFO_RELATED_STMT field records the last stmt in
4665 the original sequence that constitutes the pattern. */
4666
4667 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4668 if (orig_stmt)
4669 {
4670 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4671 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4672 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4673 }
4674
4675 /* 3. Check the operands of the operation. The first operands are defined
4676 inside the loop body. The last operand is the reduction variable,
4677 which is defined by the loop-header-phi. */
4678
4679 gcc_assert (is_gimple_assign (stmt));
4680
4681 /* Flatten RHS. */
4682 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4683 {
4684 case GIMPLE_SINGLE_RHS:
4685 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4686 if (op_type == ternary_op)
4687 {
4688 tree rhs = gimple_assign_rhs1 (stmt);
4689 ops[0] = TREE_OPERAND (rhs, 0);
4690 ops[1] = TREE_OPERAND (rhs, 1);
4691 ops[2] = TREE_OPERAND (rhs, 2);
4692 code = TREE_CODE (rhs);
4693 }
4694 else
4695 return false;
4696 break;
4697
4698 case GIMPLE_BINARY_RHS:
4699 code = gimple_assign_rhs_code (stmt);
4700 op_type = TREE_CODE_LENGTH (code);
4701 gcc_assert (op_type == binary_op);
4702 ops[0] = gimple_assign_rhs1 (stmt);
4703 ops[1] = gimple_assign_rhs2 (stmt);
4704 break;
4705
4706 case GIMPLE_TERNARY_RHS:
4707 code = gimple_assign_rhs_code (stmt);
4708 op_type = TREE_CODE_LENGTH (code);
4709 gcc_assert (op_type == ternary_op);
4710 ops[0] = gimple_assign_rhs1 (stmt);
4711 ops[1] = gimple_assign_rhs2 (stmt);
4712 ops[2] = gimple_assign_rhs3 (stmt);
4713 break;
4714
4715 case GIMPLE_UNARY_RHS:
4716 return false;
4717
4718 default:
4719 gcc_unreachable ();
4720 }
4721
4722 if (code == COND_EXPR && slp_node)
4723 return false;
4724
4725 scalar_dest = gimple_assign_lhs (stmt);
4726 scalar_type = TREE_TYPE (scalar_dest);
4727 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4728 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4729 return false;
4730
4731 /* Do not try to vectorize bit-precision reductions. */
4732 if ((TYPE_PRECISION (scalar_type)
4733 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4734 return false;
4735
4736 /* All uses but the last are expected to be defined in the loop.
4737 The last use is the reduction variable. In case of nested cycle this
4738 assumption is not true: we use reduc_index to record the index of the
4739 reduction variable. */
4740 for (i = 0; i < op_type - 1; i++)
4741 {
4742 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4743 if (i == 0 && code == COND_EXPR)
4744 continue;
4745
4746 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4747 &def_stmt, &def, &dt, &tem);
4748 if (!vectype_in)
4749 vectype_in = tem;
4750 gcc_assert (is_simple_use);
4751
4752 if (dt != vect_internal_def
4753 && dt != vect_external_def
4754 && dt != vect_constant_def
4755 && dt != vect_induction_def
4756 && !(dt == vect_nested_cycle && nested_cycle))
4757 return false;
4758
4759 if (dt == vect_nested_cycle)
4760 {
4761 found_nested_cycle_def = true;
4762 reduc_def_stmt = def_stmt;
4763 reduc_index = i;
4764 }
4765 }
4766
4767 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4768 &def_stmt, &def, &dt, &tem);
4769 if (!vectype_in)
4770 vectype_in = tem;
4771 gcc_assert (is_simple_use);
4772 if (!(dt == vect_reduction_def
4773 || dt == vect_nested_cycle
4774 || ((dt == vect_internal_def || dt == vect_external_def
4775 || dt == vect_constant_def || dt == vect_induction_def)
4776 && nested_cycle && found_nested_cycle_def)))
4777 {
4778 /* For pattern recognized stmts, orig_stmt might be a reduction,
4779 but some helper statements for the pattern might not, or
4780 might be COND_EXPRs with reduction uses in the condition. */
4781 gcc_assert (orig_stmt);
4782 return false;
4783 }
4784 if (!found_nested_cycle_def)
4785 reduc_def_stmt = def_stmt;
4786
4787 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4788 if (orig_stmt)
4789 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4790 reduc_def_stmt,
4791 !nested_cycle,
4792 &dummy));
4793 else
4794 {
4795 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4796 !nested_cycle, &dummy);
4797 /* We changed STMT to be the first stmt in reduction chain, hence we
4798 check that in this case the first element in the chain is STMT. */
4799 gcc_assert (stmt == tmp
4800 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4801 }
4802
4803 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4804 return false;
4805
4806 if (slp_node || PURE_SLP_STMT (stmt_info))
4807 ncopies = 1;
4808 else
4809 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4810 / TYPE_VECTOR_SUBPARTS (vectype_in));
4811
4812 gcc_assert (ncopies >= 1);
4813
4814 vec_mode = TYPE_MODE (vectype_in);
4815
4816 if (code == COND_EXPR)
4817 {
4818 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4819 {
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4822 "unsupported condition in reduction");
4823
4824 return false;
4825 }
4826 }
4827 else
4828 {
4829 /* 4. Supportable by target? */
4830
4831 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4832 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4833 {
4834 /* Shifts and rotates are only supported by vectorizable_shift,
4835 not vectorizable_reduction. */
4836 if (dump_enabled_p ())
4837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4838 "unsupported shift or rotation.");
4839 return false;
4840 }
4841
4842 /* 4.1. check support for the operation in the loop */
4843 optab = optab_for_tree_code (code, vectype_in, optab_default);
4844 if (!optab)
4845 {
4846 if (dump_enabled_p ())
4847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4848 "no optab.");
4849
4850 return false;
4851 }
4852
4853 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4854 {
4855 if (dump_enabled_p ())
4856 dump_printf (MSG_NOTE, "op not supported by target.");
4857
4858 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4859 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4860 < vect_min_worthwhile_factor (code))
4861 return false;
4862
4863 if (dump_enabled_p ())
4864 dump_printf (MSG_NOTE, "proceeding using word mode.");
4865 }
4866
4867 /* Worthwhile without SIMD support? */
4868 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4869 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4870 < vect_min_worthwhile_factor (code))
4871 {
4872 if (dump_enabled_p ())
4873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4874 "not worthwhile without SIMD support.");
4875
4876 return false;
4877 }
4878 }
4879
4880 /* 4.2. Check support for the epilog operation.
4881
4882 If STMT represents a reduction pattern, then the type of the
4883 reduction variable may be different than the type of the rest
4884 of the arguments. For example, consider the case of accumulation
4885 of shorts into an int accumulator; the original code:
4886 S1: int_a = (int) short_a;
4887 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4888
4889 was replaced with:
4890 STMT: int_acc = widen_sum <short_a, int_acc>
4891
4892 This means that:
4893 1. The tree-code that is used to create the vector operation in the
4894 epilog code (that reduces the partial results) is not the
4895 tree-code of STMT, but is rather the tree-code of the original
4896 stmt from the pattern that STMT is replacing. I.e., in the example
4897 above we want to use 'widen_sum' in the loop, but 'plus' in the
4898 epilog.
4899 2. The type (mode) we use to check available target support
4900 for the vector operation to be created in the *epilog*, is
4901 determined by the type of the reduction variable (in the example
4902 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4903 However the type (mode) we use to check available target support
4904 for the vector operation to be created *inside the loop*, is
4905 determined by the type of the other arguments to STMT (in the
4906 example we'd check this: optab_handler (widen_sum_optab,
4907 vect_short_mode)).
4908
4909 This is contrary to "regular" reductions, in which the types of all
4910 the arguments are the same as the type of the reduction variable.
4911 For "regular" reductions we can therefore use the same vector type
4912 (and also the same tree-code) when generating the epilog code and
4913 when generating the code inside the loop. */
4914
4915 if (orig_stmt)
4916 {
4917 /* This is a reduction pattern: get the vectype from the type of the
4918 reduction variable, and get the tree-code from orig_stmt. */
4919 orig_code = gimple_assign_rhs_code (orig_stmt);
4920 gcc_assert (vectype_out);
4921 vec_mode = TYPE_MODE (vectype_out);
4922 }
4923 else
4924 {
4925 /* Regular reduction: the same vectype and tree-code that are used for
4926 the vector code inside the loop can also be used for the epilog code. */
4927 orig_code = code;
4928 }
4929
4930 if (nested_cycle)
4931 {
4932 def_bb = gimple_bb (reduc_def_stmt);
4933 def_stmt_loop = def_bb->loop_father;
4934 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4935 loop_preheader_edge (def_stmt_loop));
4936 if (TREE_CODE (def_arg) == SSA_NAME
4937 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4938 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4939 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4940 && vinfo_for_stmt (def_arg_stmt)
4941 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4942 == vect_double_reduction_def)
4943 double_reduc = true;
4944 }
4945
4946 epilog_reduc_code = ERROR_MARK;
4947 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4948 {
4949 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4950 optab_default);
4951 if (!reduc_optab)
4952 {
4953 if (dump_enabled_p ())
4954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4955 "no optab for reduction.");
4956
4957 epilog_reduc_code = ERROR_MARK;
4958 }
4959
4960 if (reduc_optab
4961 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4962 {
4963 if (dump_enabled_p ())
4964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4965 "reduc op not supported by target.");
4966
4967 epilog_reduc_code = ERROR_MARK;
4968 }
4969 }
4970 else
4971 {
4972 if (!nested_cycle || double_reduc)
4973 {
4974 if (dump_enabled_p ())
4975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4976 "no reduc code for scalar code.");
4977
4978 return false;
4979 }
4980 }
4981
4982 if (double_reduc && ncopies > 1)
4983 {
4984 if (dump_enabled_p ())
4985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4986 "multiple types in double reduction");
4987
4988 return false;
4989 }
4990
4991 /* In case of widening multiplication by a constant, we update the type
4992 of the constant to be the type of the other operand. We check that the
4993 constant fits the type in the pattern recognition pass. */
4994 if (code == DOT_PROD_EXPR
4995 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4996 {
4997 if (TREE_CODE (ops[0]) == INTEGER_CST)
4998 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4999 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5000 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5001 else
5002 {
5003 if (dump_enabled_p ())
5004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5005 "invalid types in dot-prod");
5006
5007 return false;
5008 }
5009 }
5010
5011 if (!vec_stmt) /* transformation not required. */
5012 {
5013 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5014 return false;
5015 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5016 return true;
5017 }
5018
5019 /** Transform. **/
5020
5021 if (dump_enabled_p ())
5022 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
5023
5024 /* FORNOW: Multiple types are not supported for condition. */
5025 if (code == COND_EXPR)
5026 gcc_assert (ncopies == 1);
5027
5028 /* Create the destination vector */
5029 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5030
5031 /* In case the vectorization factor (VF) is bigger than the number
5032 of elements that we can fit in a vectype (nunits), we have to generate
5033 more than one vector stmt - i.e - we need to "unroll" the
5034 vector stmt by a factor VF/nunits. For more details see documentation
5035 in vectorizable_operation. */
5036
5037 /* If the reduction is used in an outer loop we need to generate
5038 VF intermediate results, like so (e.g. for ncopies=2):
5039 r0 = phi (init, r0)
5040 r1 = phi (init, r1)
5041 r0 = x0 + r0;
5042 r1 = x1 + r1;
5043 (i.e. we generate VF results in 2 registers).
5044 In this case we have a separate def-use cycle for each copy, and therefore
5045 for each copy we get the vector def for the reduction variable from the
5046 respective phi node created for this copy.
5047
5048 Otherwise (the reduction is unused in the loop nest), we can combine
5049 together intermediate results, like so (e.g. for ncopies=2):
5050 r = phi (init, r)
5051 r = x0 + r;
5052 r = x1 + r;
5053 (i.e. we generate VF/2 results in a single register).
5054 In this case for each copy we get the vector def for the reduction variable
5055 from the vectorized reduction operation generated in the previous iteration.
5056 */
5057
5058 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5059 {
5060 single_defuse_cycle = true;
5061 epilog_copies = 1;
5062 }
5063 else
5064 epilog_copies = ncopies;
5065
5066 prev_stmt_info = NULL;
5067 prev_phi_info = NULL;
5068 if (slp_node)
5069 {
5070 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5071 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5072 == TYPE_VECTOR_SUBPARTS (vectype_in));
5073 }
5074 else
5075 {
5076 vec_num = 1;
5077 vec_oprnds0.create (1);
5078 if (op_type == ternary_op)
5079 vec_oprnds1.create (1);
5080 }
5081
5082 phis.create (vec_num);
5083 vect_defs.create (vec_num);
5084 if (!slp_node)
5085 vect_defs.quick_push (NULL_TREE);
5086
5087 for (j = 0; j < ncopies; j++)
5088 {
5089 if (j == 0 || !single_defuse_cycle)
5090 {
5091 for (i = 0; i < vec_num; i++)
5092 {
5093 /* Create the reduction-phi that defines the reduction
5094 operand. */
5095 new_phi = create_phi_node (vec_dest, loop->header);
5096 set_vinfo_for_stmt (new_phi,
5097 new_stmt_vec_info (new_phi, loop_vinfo,
5098 NULL));
5099 if (j == 0 || slp_node)
5100 phis.quick_push (new_phi);
5101 }
5102 }
5103
5104 if (code == COND_EXPR)
5105 {
5106 gcc_assert (!slp_node);
5107 vectorizable_condition (stmt, gsi, vec_stmt,
5108 PHI_RESULT (phis[0]),
5109 reduc_index, NULL);
5110 /* Multiple types are not supported for condition. */
5111 break;
5112 }
5113
5114 /* Handle uses. */
5115 if (j == 0)
5116 {
5117 op0 = ops[!reduc_index];
5118 if (op_type == ternary_op)
5119 {
5120 if (reduc_index == 0)
5121 op1 = ops[2];
5122 else
5123 op1 = ops[1];
5124 }
5125
5126 if (slp_node)
5127 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5128 slp_node, -1);
5129 else
5130 {
5131 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5132 stmt, NULL);
5133 vec_oprnds0.quick_push (loop_vec_def0);
5134 if (op_type == ternary_op)
5135 {
5136 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5137 NULL);
5138 vec_oprnds1.quick_push (loop_vec_def1);
5139 }
5140 }
5141 }
5142 else
5143 {
5144 if (!slp_node)
5145 {
5146 enum vect_def_type dt;
5147 gimple dummy_stmt;
5148 tree dummy;
5149
5150 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5151 &dummy_stmt, &dummy, &dt);
5152 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5153 loop_vec_def0);
5154 vec_oprnds0[0] = loop_vec_def0;
5155 if (op_type == ternary_op)
5156 {
5157 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5158 &dummy, &dt);
5159 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5160 loop_vec_def1);
5161 vec_oprnds1[0] = loop_vec_def1;
5162 }
5163 }
5164
5165 if (single_defuse_cycle)
5166 reduc_def = gimple_assign_lhs (new_stmt);
5167
5168 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5169 }
5170
5171 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5172 {
5173 if (slp_node)
5174 reduc_def = PHI_RESULT (phis[i]);
5175 else
5176 {
5177 if (!single_defuse_cycle || j == 0)
5178 reduc_def = PHI_RESULT (new_phi);
5179 }
5180
5181 def1 = ((op_type == ternary_op)
5182 ? vec_oprnds1[i] : NULL);
5183 if (op_type == binary_op)
5184 {
5185 if (reduc_index == 0)
5186 expr = build2 (code, vectype_out, reduc_def, def0);
5187 else
5188 expr = build2 (code, vectype_out, def0, reduc_def);
5189 }
5190 else
5191 {
5192 if (reduc_index == 0)
5193 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5194 else
5195 {
5196 if (reduc_index == 1)
5197 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5198 else
5199 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5200 }
5201 }
5202
5203 new_stmt = gimple_build_assign (vec_dest, expr);
5204 new_temp = make_ssa_name (vec_dest, new_stmt);
5205 gimple_assign_set_lhs (new_stmt, new_temp);
5206 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5207
5208 if (slp_node)
5209 {
5210 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5211 vect_defs.quick_push (new_temp);
5212 }
5213 else
5214 vect_defs[0] = new_temp;
5215 }
5216
5217 if (slp_node)
5218 continue;
5219
5220 if (j == 0)
5221 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5222 else
5223 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5224
5225 prev_stmt_info = vinfo_for_stmt (new_stmt);
5226 prev_phi_info = vinfo_for_stmt (new_phi);
5227 }
5228
5229 /* Finalize the reduction-phi (set its arguments) and create the
5230 epilog reduction code. */
5231 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5232 {
5233 new_temp = gimple_assign_lhs (*vec_stmt);
5234 vect_defs[0] = new_temp;
5235 }
5236
5237 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5238 epilog_reduc_code, phis, reduc_index,
5239 double_reduc, slp_node);
5240
5241 phis.release ();
5242 vect_defs.release ();
5243 vec_oprnds0.release ();
5244 vec_oprnds1.release ();
5245
5246 return true;
5247 }
5248
5249 /* Function vect_min_worthwhile_factor.
5250
5251 For a loop where we could vectorize the operation indicated by CODE,
5252 return the minimum vectorization factor that makes it worthwhile
5253 to use generic vectors. */
5254 int
5255 vect_min_worthwhile_factor (enum tree_code code)
5256 {
5257 switch (code)
5258 {
5259 case PLUS_EXPR:
5260 case MINUS_EXPR:
5261 case NEGATE_EXPR:
5262 return 4;
5263
5264 case BIT_AND_EXPR:
5265 case BIT_IOR_EXPR:
5266 case BIT_XOR_EXPR:
5267 case BIT_NOT_EXPR:
5268 return 2;
5269
5270 default:
5271 return INT_MAX;
5272 }
5273 }
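/* Illustrative use only (a sketch, not a quote of any particular caller):
   a transform that would otherwise fall back to emulated generic vectors
   might bail out when the vectorization factor is too small to pay for
   the emulation, e.g.

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
            < vect_min_worthwhile_factor (code))
       return false;
*/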
5274
5275
5276 /* Function vectorizable_induction.
5277
5278 Check if PHI performs an induction computation that can be vectorized.
5279 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5280 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5281 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
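/* For illustration only (a sketch; the GIMPLE actually produced by
   get_initial_def_for_induction differs), with VF == 4 the scalar
   induction

     for (i = 0; i < N; i++)
       a[i] = i;

   is conceptually vectorized as

     vi = { 0, 1, 2, 3 };           initial vector value
     vstep = { 4, 4, 4, 4 };        vector of steps
     for (j = 0; j < N/4; j++)
       {
         pa[j] = vi;
         vi = vi + vstep;
       }
*/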
5282
5283 bool
5284 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5285 gimple *vec_stmt)
5286 {
5287 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5288 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5289 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5290 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5291 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5292 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5293 tree vec_def;
5294
5295 gcc_assert (ncopies >= 1);
5296 /* FORNOW. These restrictions should be relaxed. */
5297 if (nested_in_vect_loop_p (loop, phi))
5298 {
5299 imm_use_iterator imm_iter;
5300 use_operand_p use_p;
5301 gimple exit_phi;
5302 edge latch_e;
5303 tree loop_arg;
5304
5305 if (ncopies > 1)
5306 {
5307 if (dump_enabled_p ())
5308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5309 "multiple types in nested loop.");
5310 return false;
5311 }
5312
5313 exit_phi = NULL;
5314 latch_e = loop_latch_edge (loop->inner);
5315 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5316 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5317 {
5318 if (!flow_bb_inside_loop_p (loop->inner,
5319 gimple_bb (USE_STMT (use_p))))
5320 {
5321 exit_phi = USE_STMT (use_p);
5322 break;
5323 }
5324 }
5325 if (exit_phi)
5326 {
5327 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5328 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5329 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5330 {
5331 if (dump_enabled_p ())
5332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5333 "inner-loop induction only used outside "
5334 "of the outer vectorized loop.");
5335 return false;
5336 }
5337 }
5338 }
5339
5340 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5341 return false;
5342
5343 /* FORNOW: SLP not supported. */
5344 if (STMT_SLP_TYPE (stmt_info))
5345 return false;
5346
5347 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5348
5349 if (gimple_code (phi) != GIMPLE_PHI)
5350 return false;
5351
5352 if (!vec_stmt) /* transformation not required. */
5353 {
5354 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5355 if (dump_enabled_p ())
5356 dump_printf_loc (MSG_NOTE, vect_location,
5357 "=== vectorizable_induction ===");
5358 vect_model_induction_cost (stmt_info, ncopies);
5359 return true;
5360 }
5361
5362 /** Transform. **/
5363
5364 if (dump_enabled_p ())
5365 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5366
5367 vec_def = get_initial_def_for_induction (phi);
5368 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5369 return true;
5370 }
5371
5372 /* Function vectorizable_live_operation.
5373
5374 STMT computes a value that is used outside the loop. Check if
5375 it can be supported. */
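/* For example (an illustrative sketch), in

     x = ...;                       defined before the loop
     for (i = 0; i < N; i++)
       {
         ...
         t = x + 5;
       }
     use (t);

   the stmt "t = x + 5" is live: its last scalar value is used after the
   loop.  Because all of its operands are loop-invariant or constant, the
   scalar stmt can simply be left in place; that is the only case handled
   below.  */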
5376
5377 bool
5378 vectorizable_live_operation (gimple stmt,
5379 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5380 gimple *vec_stmt)
5381 {
5382 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5383 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5384 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5385 int i;
5386 int op_type;
5387 tree op;
5388 tree def;
5389 gimple def_stmt;
5390 enum vect_def_type dt;
5391 enum tree_code code;
5392 enum gimple_rhs_class rhs_class;
5393
5394 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5395
5396 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5397 return false;
5398
5399 if (!is_gimple_assign (stmt))
5400 {
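/* A live IFN_GOMP_SIMD_LANE call for this loop's simduid: when its
   result is used by a PHI in the merge block after the loop, the value
   arriving from the loop is the lane of the last scalar iteration, so
   the transform simply sets that PHI argument to VF - 1.  */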
5401 if (gimple_call_internal_p (stmt)
5402 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5403 && gimple_call_lhs (stmt)
5404 && loop->simduid
5405 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5406 && loop->simduid
5407 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5408 {
5409 edge e = single_exit (loop);
5410 basic_block merge_bb = e->dest;
5411 imm_use_iterator imm_iter;
5412 use_operand_p use_p;
5413 tree lhs = gimple_call_lhs (stmt);
5414
5415 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5416 {
5417 gimple use_stmt = USE_STMT (use_p);
5418 if (gimple_code (use_stmt) == GIMPLE_PHI
5419 || gimple_bb (use_stmt) == merge_bb)
5420 {
5421 if (vec_stmt)
5422 {
5423 tree vfm1
5424 = build_int_cst (unsigned_type_node,
5425 loop_vinfo->vectorization_factor - 1);
5426 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5427 }
5428 return true;
5429 }
5430 }
5431 }
5432
5433 return false;
5434 }
5435
5436 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5437 return false;
5438
5439 /* FORNOW. CHECKME. */
5440 if (nested_in_vect_loop_p (loop, stmt))
5441 return false;
5442
5443 code = gimple_assign_rhs_code (stmt);
5444 op_type = TREE_CODE_LENGTH (code);
5445 rhs_class = get_gimple_rhs_class (code);
5446 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5447 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5448
5449 /* FORNOW: support only if all uses are invariant. This means
5450 that the scalar operations can remain in place, unvectorized.
5451 The original last scalar value that they compute will be used. */
5452
5453 for (i = 0; i < op_type; i++)
5454 {
5455 if (rhs_class == GIMPLE_SINGLE_RHS)
5456 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5457 else
5458 op = gimple_op (stmt, i + 1);
5459 if (op
5460 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5461 &dt))
5462 {
5463 if (dump_enabled_p ())
5464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5465 "use not simple.");
5466 return false;
5467 }
5468
5469 if (dt != vect_external_def && dt != vect_constant_def)
5470 return false;
5471 }
5472
5473 /* No transformation is required for the cases we currently support. */
5474 return true;
5475 }
5476
5477 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5478
5479 static void
5480 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5481 {
5482 ssa_op_iter op_iter;
5483 imm_use_iterator imm_iter;
5484 def_operand_p def_p;
5485 gimple ustmt;
5486
5487 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5488 {
5489 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5490 {
5491 basic_block bb;
5492
5493 if (!is_gimple_debug (ustmt))
5494 continue;
5495
5496 bb = gimple_bb (ustmt);
5497
5498 if (!flow_bb_inside_loop_p (loop, bb))
5499 {
5500 if (gimple_debug_bind_p (ustmt))
5501 {
5502 if (dump_enabled_p ())
5503 dump_printf_loc (MSG_NOTE, vect_location,
5504 "killing debug use");
5505
5506 gimple_debug_bind_reset_value (ustmt);
5507 update_stmt (ustmt);
5508 }
5509 else
5510 gcc_unreachable ();
5511 }
5512 }
5513 }
5514 }
5515
5516 /* Function vect_transform_loop.
5517
5518 The analysis phase has determined that the loop is vectorizable.
5519 Vectorize the loop - create vectorized stmts to replace the scalar
5520 stmts in the loop, and update the loop exit condition. */
5521
5522 void
5523 vect_transform_loop (loop_vec_info loop_vinfo)
5524 {
5525 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5526 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5527 int nbbs = loop->num_nodes;
5528 gimple_stmt_iterator si;
5529 int i;
5530 tree ratio = NULL;
5531 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5532 bool grouped_store;
5533 bool slp_scheduled = false;
5534 unsigned int nunits;
5535 gimple stmt, pattern_stmt;
5536 gimple_seq pattern_def_seq = NULL;
5537 gimple_stmt_iterator pattern_def_si = gsi_none ();
5538 bool transform_pattern_stmt = false;
5539 bool check_profitability = false;
5540 int th;
5541 /* Record the number of iterations before we start tampering with the profile. */
5542 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5543
5544 if (dump_enabled_p ())
5545 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5546
5547 /* If the profile is imprecise, we have a chance to fix it up. */
5548 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5549 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5550
5551 /* Use the more conservative vectorization threshold.  If the number
5552 of iterations is constant, assume the cost check has been performed
5553 by our caller.  If the threshold makes all loops that run at least
5554 the vectorization factor number of times profitable, then checking
5555 is pointless, too. */
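/* For example (illustrative numbers only): with --param min-vect-loop-bound=2
   and VF == 4, th = 2 * 4 - 1 = 7; if the cost model requires at least 10
   iterations, th becomes 10, and since 10 >= VF - 1 a runtime profitability
   check is emitted below whenever the iteration count is not known at
   compile time.  */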
5556 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5557 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5558 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5559 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5560 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5561 {
5562 if (dump_enabled_p ())
5563 dump_printf_loc (MSG_NOTE, vect_location,
5564 "Profitability threshold is %d loop iterations.", th);
5565 check_profitability = true;
5566 }
5567
5568 /* Version the loop first, if required, so the profitability check
5569 comes first. */
5570
5571 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5572 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5573 {
5574 vect_loop_versioning (loop_vinfo, th, check_profitability);
5575 check_profitability = false;
5576 }
5577
5578 /* Peel the loop if there are data refs with unknown alignment.
5579 Only one store data-ref with unknown alignment is allowed. */
5580
5581 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5582 {
5583 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5584 check_profitability = false;
5585 }
5586
5587 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5588 compile-time constant), or it is a constant that is not a multiple of the
5589 vectorization factor, then an epilog loop needs to be created.
5590 We therefore duplicate the loop: the original loop will be vectorized,
5591 and will compute the first (n/VF) iterations. The second copy of the loop
5592 will remain scalar and will compute the remaining (n%VF) iterations.
5593 (VF is the vectorization factor). */
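/* Schematically (a rough sketch; the actual CFG surgery is done by
   vect_do_peeling_for_loop_bound):

     main loop, vectorized, executes n/VF iterations:
       for (iv = 0; iv < n/VF; iv++)
         <vector stmts>;

     epilog loop, left scalar, executes the remaining n%VF iterations:
       for (i = (n/VF) * VF; i < n; i++)
         <original scalar stmts>;
*/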
5594
5595 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5596 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5597 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5598 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5599 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5600 th, check_profitability);
5601 else
5602 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5603 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5604
5605 /* 1) Make sure the loop header has exactly two entries
5606 2) Make sure we have a preheader basic block. */
5607
5608 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5609
5610 split_edge (loop_preheader_edge (loop));
5611
5612 /* FORNOW: the vectorizer supports only loops whose body consists
5613 of one basic block (header + empty latch).  When the vectorizer
5614 supports more involved loop forms, the order in which the BBs are
5615 traversed will need to be reconsidered. */
5616
5617 for (i = 0; i < nbbs; i++)
5618 {
5619 basic_block bb = bbs[i];
5620 stmt_vec_info stmt_info;
5621 gimple phi;
5622
5623 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5624 {
5625 phi = gsi_stmt (si);
5626 if (dump_enabled_p ())
5627 {
5628 dump_printf_loc (MSG_NOTE, vect_location,
5629 "------>vectorizing phi: ");
5630 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5631 }
5632 stmt_info = vinfo_for_stmt (phi);
5633 if (!stmt_info)
5634 continue;
5635
5636 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5637 vect_loop_kill_debug_uses (loop, phi);
5638
5639 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5640 && !STMT_VINFO_LIVE_P (stmt_info))
5641 continue;
5642
5643 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5644 != (unsigned HOST_WIDE_INT) vectorization_factor)
5645 && dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5647
5648 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5649 {
5650 if (dump_enabled_p ())
5651 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5652 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5653 }
5654 }
5655
5656 pattern_stmt = NULL;
5657 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5658 {
5659 bool is_store;
5660
5661 if (transform_pattern_stmt)
5662 stmt = pattern_stmt;
5663 else
5664 {
5665 stmt = gsi_stmt (si);
5666 /* During vectorization remove existing clobber stmts. */
5667 if (gimple_clobber_p (stmt))
5668 {
5669 unlink_stmt_vdef (stmt);
5670 gsi_remove (&si, true);
5671 release_defs (stmt);
5672 continue;
5673 }
5674 }
5675
5676 if (dump_enabled_p ())
5677 {
5678 dump_printf_loc (MSG_NOTE, vect_location,
5679 "------>vectorizing statement: ");
5680 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5681 }
5682
5683 stmt_info = vinfo_for_stmt (stmt);
5684
5685 /* vector stmts created in the outer-loop during vectorization of
5686 stmts in an inner-loop may not have a stmt_info, and do not
5687 need to be vectorized. */
5688 if (!stmt_info)
5689 {
5690 gsi_next (&si);
5691 continue;
5692 }
5693
5694 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5695 vect_loop_kill_debug_uses (loop, stmt);
5696
5697 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5698 && !STMT_VINFO_LIVE_P (stmt_info))
5699 {
5700 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5701 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5702 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5703 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5704 {
5705 stmt = pattern_stmt;
5706 stmt_info = vinfo_for_stmt (stmt);
5707 }
5708 else
5709 {
5710 gsi_next (&si);
5711 continue;
5712 }
5713 }
5714 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5715 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5716 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5717 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5718 transform_pattern_stmt = true;
5719
5720 /* If pattern statement has def stmts, vectorize them too. */
5721 if (is_pattern_stmt_p (stmt_info))
5722 {
5723 if (pattern_def_seq == NULL)
5724 {
5725 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5726 pattern_def_si = gsi_start (pattern_def_seq);
5727 }
5728 else if (!gsi_end_p (pattern_def_si))
5729 gsi_next (&pattern_def_si);
5730 if (pattern_def_seq != NULL)
5731 {
5732 gimple pattern_def_stmt = NULL;
5733 stmt_vec_info pattern_def_stmt_info = NULL;
5734
5735 while (!gsi_end_p (pattern_def_si))
5736 {
5737 pattern_def_stmt = gsi_stmt (pattern_def_si);
5738 pattern_def_stmt_info
5739 = vinfo_for_stmt (pattern_def_stmt);
5740 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5741 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5742 break;
5743 gsi_next (&pattern_def_si);
5744 }
5745
5746 if (!gsi_end_p (pattern_def_si))
5747 {
5748 if (dump_enabled_p ())
5749 {
5750 dump_printf_loc (MSG_NOTE, vect_location,
5751 "==> vectorizing pattern def "
5752 "stmt: ");
5753 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5754 pattern_def_stmt, 0);
5755 }
5756
5757 stmt = pattern_def_stmt;
5758 stmt_info = pattern_def_stmt_info;
5759 }
5760 else
5761 {
5762 pattern_def_si = gsi_none ();
5763 transform_pattern_stmt = false;
5764 }
5765 }
5766 else
5767 transform_pattern_stmt = false;
5768 }
5769
5770 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5771 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5772 STMT_VINFO_VECTYPE (stmt_info));
5773 if (!STMT_SLP_TYPE (stmt_info)
5774 && nunits != (unsigned int) vectorization_factor
5775 && dump_enabled_p ())
5776 /* For SLP, VF is set according to the unrolling factor, and not to
5777 the vector size; hence for SLP this print is not valid. */
5778 dump_printf_loc (MSG_NOTE, vect_location,
5779 "multiple-types.");
5780
5781 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5782 reached. */
5783 if (STMT_SLP_TYPE (stmt_info))
5784 {
5785 if (!slp_scheduled)
5786 {
5787 slp_scheduled = true;
5788
5789 if (dump_enabled_p ())
5790 dump_printf_loc (MSG_NOTE, vect_location,
5791 "=== scheduling SLP instances ===");
5792
5793 vect_schedule_slp (loop_vinfo, NULL);
5794 }
5795
5796 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5797 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5798 {
5799 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5800 {
5801 pattern_def_seq = NULL;
5802 gsi_next (&si);
5803 }
5804 continue;
5805 }
5806 }
5807
5808 /* -------- vectorize statement ------------ */
5809 if (dump_enabled_p ())
5810 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5811
5812 grouped_store = false;
5813 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5814 if (is_store)
5815 {
5816 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5817 {
5818 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5819 interleaving chain was completed - free all the stores in
5820 the chain. */
5821 gsi_next (&si);
5822 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5823 continue;
5824 }
5825 else
5826 {
5827 /* Free the attached stmt_vec_info and remove the stmt. */
5828 gimple store = gsi_stmt (si);
5829 free_stmt_vec_info (store);
5830 unlink_stmt_vdef (store);
5831 gsi_remove (&si, true);
5832 release_defs (store);
5833 continue;
5834 }
5835 }
5836
5837 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5838 {
5839 pattern_def_seq = NULL;
5840 gsi_next (&si);
5841 }
5842 } /* stmts in BB */
5843 } /* BBs in loop */
5844
5845 slpeel_make_loop_iterate_ntimes (loop, ratio);
5846
5847 /* Reduce loop iterations by the vectorization factor. */
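/* E.g. (illustrative): if the scalar loop's upper bound was 17 iterations
   and VF == 4, the vector loop's upper bound becomes 17 / 4 == 4 (floor
   division); when peeling for gaps is in effect one further iteration is
   handed to the epilog loop, giving 3.  */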
5848 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5849 expected_iterations / vectorization_factor);
5850 loop->nb_iterations_upper_bound
5851 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5852 FLOOR_DIV_EXPR);
5853 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5854 && loop->nb_iterations_upper_bound != double_int_zero)
5855 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5856 if (loop->any_estimate)
5857 {
5858 loop->nb_iterations_estimate
5859 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5860 FLOOR_DIV_EXPR);
5861 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5862 && loop->nb_iterations_estimate != double_int_zero)
5863 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5864 }
5865
5866 if (dump_enabled_p ())
5867 {
5868 dump_printf_loc (MSG_NOTE, vect_location,
5869 "LOOP VECTORIZED\n");
5870 if (loop->inner)
5871 dump_printf_loc (MSG_NOTE, vect_location,
5872 "OUTER LOOP VECTORIZED\n");
5873 }
5874 }