1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
45
46 /* Loop Vectorization Pass.
47
48 This pass tries to vectorize loops.
49
50 For example, the vectorizer transforms the following simple loop:
51
52 short a[N]; short b[N]; short c[N]; int i;
53
54 for (i=0; i<N; i++){
55 a[i] = b[i] + c[i];
56 }
57
58 as if it were manually vectorized by rewriting the source code into:
59
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 v8hi va, vb, vc;
64
65 for (i=0; i<N/8; i++){
66 vb = pb[i];
67 vc = pc[i];
68 va = vb + vc;
69 pa[i] = va;
70 }
71
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 have successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMEs), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
83
84 Analysis phase:
85 ===============
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
89
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
94
95 Transformation phase:
96 =====================
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
105
106 For example, say stmt S1 was vectorized into stmt VS1:
107
108 VS1: vb = px[i];
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b;
111
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
116
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb;
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
121
122 Operands that are not SSA_NAMEs are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
124
125 Target modeling:
126 =================
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors will, for now, need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
132
133 Since we only vectorize operations whose vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
139
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 */
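/* As a concrete illustration of the optab check described above (a sketch,
   not code taken from this pass): asking whether the target can add two
   V8HI vectors amounts to something like

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   where CODE_FOR_nothing means there is no target support and the stmt
   cannot be vectorized.  */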
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
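/* A worked example (assuming a target with 16-byte vectors): if the
   smallest scalar type operated on in the loop is "short" (2 bytes), the
   corresponding vectype is V8HI, TYPE_VECTOR_SUBPARTS is 8 and the VF
   becomes 8; with 4-byte "int" elements the vectype would be V4SI and the
   VF would be 4.  */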
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL, pattern_def_stmt = NULL;
185 bool analyze_pattern_stmt = false, pattern_def = false;
186
187 if (vect_print_dump_info (REPORT_DETAILS))
188 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
189
190 for (i = 0; i < nbbs; i++)
191 {
192 basic_block bb = bbs[i];
193
194 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
195 {
196 phi = gsi_stmt (si);
197 stmt_info = vinfo_for_stmt (phi);
198 if (vect_print_dump_info (REPORT_DETAILS))
199 {
200 fprintf (vect_dump, "==> examining phi: ");
201 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
202 }
203
204 gcc_assert (stmt_info);
205
206 if (STMT_VINFO_RELEVANT_P (stmt_info))
207 {
208 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
209 scalar_type = TREE_TYPE (PHI_RESULT (phi));
210
211 if (vect_print_dump_info (REPORT_DETAILS))
212 {
213 fprintf (vect_dump, "get vectype for scalar type: ");
214 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
215 }
216
217 vectype = get_vectype_for_scalar_type (scalar_type);
218 if (!vectype)
219 {
220 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
221 {
222 fprintf (vect_dump,
223 "not vectorized: unsupported data-type ");
224 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
225 }
226 return false;
227 }
228 STMT_VINFO_VECTYPE (stmt_info) = vectype;
229
230 if (vect_print_dump_info (REPORT_DETAILS))
231 {
232 fprintf (vect_dump, "vectype: ");
233 print_generic_expr (vect_dump, vectype, TDF_SLIM);
234 }
235
236 nunits = TYPE_VECTOR_SUBPARTS (vectype);
237 if (vect_print_dump_info (REPORT_DETAILS))
238 fprintf (vect_dump, "nunits = %d", nunits);
239
240 if (!vectorization_factor
241 || (nunits > vectorization_factor))
242 vectorization_factor = nunits;
243 }
244 }
245
246 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
247 {
248 tree vf_vectype;
249
250 if (analyze_pattern_stmt)
251 {
252 stmt = pattern_stmt;
253 analyze_pattern_stmt = false;
254 }
255 else
256 stmt = gsi_stmt (si);
257
258 stmt_info = vinfo_for_stmt (stmt);
259
260 if (vect_print_dump_info (REPORT_DETAILS))
261 {
262 fprintf (vect_dump, "==> examining statement: ");
263 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
264 }
265
266 gcc_assert (stmt_info);
267
268 /* Skip stmts which do not need to be vectorized. */
269 if (!STMT_VINFO_RELEVANT_P (stmt_info)
270 && !STMT_VINFO_LIVE_P (stmt_info))
271 {
272 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
273 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
274 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
275 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
276 {
277 stmt = pattern_stmt;
278 stmt_info = vinfo_for_stmt (pattern_stmt);
279 if (vect_print_dump_info (REPORT_DETAILS))
280 {
281 fprintf (vect_dump, "==> examining pattern statement: ");
282 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
283 }
284 }
285 else
286 {
287 if (vect_print_dump_info (REPORT_DETAILS))
288 fprintf (vect_dump, "skip.");
289 gsi_next (&si);
290 continue;
291 }
292 }
293 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
297 analyze_pattern_stmt = true;
298
299 /* If a pattern statement has a def stmt, analyze it too. */
300 if (is_pattern_stmt_p (stmt_info)
301 && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
302 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
303 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
304 {
305 if (pattern_def)
306 pattern_def = false;
307 else
308 {
309 if (vect_print_dump_info (REPORT_DETAILS))
310 {
311 fprintf (vect_dump, "==> examining pattern def stmt: ");
312 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
313 TDF_SLIM);
314 }
315
316 pattern_def = true;
317 stmt = pattern_def_stmt;
318 stmt_info = vinfo_for_stmt (stmt);
319 }
320 }
321
322 if (gimple_get_lhs (stmt) == NULL_TREE)
323 {
324 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
325 {
326 fprintf (vect_dump, "not vectorized: irregular stmt.");
327 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
328 }
329 return false;
330 }
331
332 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
333 {
334 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
335 {
336 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
337 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
338 }
339 return false;
340 }
341
342 if (STMT_VINFO_VECTYPE (stmt_info))
343 {
344 /* The only case when a vectype had already been set is for stmts
345 that contain a dataref, or for "pattern-stmts" (stmts
346 generated by the vectorizer to represent/replace a certain
347 idiom). */
348 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
349 || is_pattern_stmt_p (stmt_info)
350 || pattern_def);
351 vectype = STMT_VINFO_VECTYPE (stmt_info);
352 }
353 else
354 {
355 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
356 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
357 if (vect_print_dump_info (REPORT_DETAILS))
358 {
359 fprintf (vect_dump, "get vectype for scalar type: ");
360 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
361 }
362 vectype = get_vectype_for_scalar_type (scalar_type);
363 if (!vectype)
364 {
365 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
366 {
367 fprintf (vect_dump,
368 "not vectorized: unsupported data-type ");
369 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
370 }
371 return false;
372 }
373
374 STMT_VINFO_VECTYPE (stmt_info) = vectype;
375 }
376
377 /* The vectorization factor is set according to the smallest
378 scalar type (or the largest vector size, but we only
379 support one vector size per loop). */
380 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
381 &dummy);
382 if (vect_print_dump_info (REPORT_DETAILS))
383 {
384 fprintf (vect_dump, "get vectype for scalar type: ");
385 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
386 }
387 vf_vectype = get_vectype_for_scalar_type (scalar_type);
388 if (!vf_vectype)
389 {
390 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
391 {
392 fprintf (vect_dump,
393 "not vectorized: unsupported data-type ");
394 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
395 }
396 return false;
397 }
398
399 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
400 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
401 {
402 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
403 {
404 fprintf (vect_dump,
405 "not vectorized: different sized vector "
406 "types in statement, ");
407 print_generic_expr (vect_dump, vectype, TDF_SLIM);
408 fprintf (vect_dump, " and ");
409 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
410 }
411 return false;
412 }
413
414 if (vect_print_dump_info (REPORT_DETAILS))
415 {
416 fprintf (vect_dump, "vectype: ");
417 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
418 }
419
420 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
421 if (vect_print_dump_info (REPORT_DETAILS))
422 fprintf (vect_dump, "nunits = %d", nunits);
423
424 if (!vectorization_factor
425 || (nunits > vectorization_factor))
426 vectorization_factor = nunits;
427
428 if (!analyze_pattern_stmt && !pattern_def)
429 gsi_next (&si);
430 }
431 }
432
433 /* TODO: Analyze cost. Decide if worth while to vectorize. */
434 if (vect_print_dump_info (REPORT_DETAILS))
435 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
436 if (vectorization_factor <= 1)
437 {
438 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
439 fprintf (vect_dump, "not vectorized: unsupported data-type");
440 return false;
441 }
442 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
443
444 return true;
445 }
446
447
448 /* Function vect_is_simple_iv_evolution.
449
450 FORNOW: A simple evolution of an induction variable in the loop is
451 considered a polynomial evolution with constant step. */
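/* For illustration: scev represents such an evolution as the chrec
   {init, +, step}_n.  For the IV of "for (i = 0; i < N; i++)" the access
   function is {0, +, 1}_1 (loop number 1 assumed here); its evolution part
   is the INTEGER_CST 1, so it is accepted.  An evolution part that is
   itself a chrec, e.g. {0, +, {1, +, 1}_1}_1, denotes a degree-2
   polynomial and is rejected below.  */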
452
453 static bool
454 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
455 tree * step)
456 {
457 tree init_expr;
458 tree step_expr;
459 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
460
461 /* When there is no evolution in this loop, the evolution function
462 is not "simple". */
463 if (evolution_part == NULL_TREE)
464 return false;
465
466 /* When the evolution is a polynomial of degree >= 2
467 the evolution function is not "simple". */
468 if (tree_is_chrec (evolution_part))
469 return false;
470
471 step_expr = evolution_part;
472 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
473
474 if (vect_print_dump_info (REPORT_DETAILS))
475 {
476 fprintf (vect_dump, "step: ");
477 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
478 fprintf (vect_dump, ", init: ");
479 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
480 }
481
482 *init = init_expr;
483 *step = step_expr;
484
485 if (TREE_CODE (step_expr) != INTEGER_CST)
486 {
487 if (vect_print_dump_info (REPORT_DETAILS))
488 fprintf (vect_dump, "step unknown.");
489 return false;
490 }
491
492 return true;
493 }
494
495 /* Function vect_analyze_scalar_cycles_1.
496
497 Examine the cross iteration def-use cycles of scalar variables
498 in LOOP. LOOP_VINFO represents the loop that is now being
499 considered for vectorization (can be LOOP, or an outer-loop
500 enclosing LOOP). */
501
502 static void
503 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
504 {
505 basic_block bb = loop->header;
506 tree dummy;
507 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
508 gimple_stmt_iterator gsi;
509 bool double_reduc;
510
511 if (vect_print_dump_info (REPORT_DETAILS))
512 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
513
514 /* First - identify all inductions. Reduction detection assumes that all the
515 inductions have been identified; therefore, this order must not be
516 changed. */
517 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
518 {
519 gimple phi = gsi_stmt (gsi);
520 tree access_fn = NULL;
521 tree def = PHI_RESULT (phi);
522 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
523
524 if (vect_print_dump_info (REPORT_DETAILS))
525 {
526 fprintf (vect_dump, "Analyze phi: ");
527 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
528 }
529
530 /* Skip virtual phis. The data dependences that are associated with
531 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
532 if (!is_gimple_reg (SSA_NAME_VAR (def)))
533 continue;
534
535 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
536
537 /* Analyze the evolution function. */
538 access_fn = analyze_scalar_evolution (loop, def);
539 if (access_fn)
540 STRIP_NOPS (access_fn);
541 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
542 {
543 fprintf (vect_dump, "Access function of PHI: ");
544 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
545 }
546
547 if (!access_fn
548 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dummy, &dummy))
549 {
550 VEC_safe_push (gimple, heap, worklist, phi);
551 continue;
552 }
553
554 if (vect_print_dump_info (REPORT_DETAILS))
555 fprintf (vect_dump, "Detected induction.");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
557 }
558
559
560 /* Second - identify all reductions and nested cycles. */
561 while (VEC_length (gimple, worklist) > 0)
562 {
563 gimple phi = VEC_pop (gimple, worklist);
564 tree def = PHI_RESULT (phi);
565 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
566 gimple reduc_stmt;
567 bool nested_cycle;
568
569 if (vect_print_dump_info (REPORT_DETAILS))
570 {
571 fprintf (vect_dump, "Analyze phi: ");
572 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
573 }
574
575 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
576 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
577
578 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
579 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
580 &double_reduc);
581 if (reduc_stmt)
582 {
583 if (double_reduc)
584 {
585 if (vect_print_dump_info (REPORT_DETAILS))
586 fprintf (vect_dump, "Detected double reduction.");
587
588 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
589 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
590 vect_double_reduction_def;
591 }
592 else
593 {
594 if (nested_cycle)
595 {
596 if (vect_print_dump_info (REPORT_DETAILS))
597 fprintf (vect_dump, "Detected vectorizable nested cycle.");
598
599 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
600 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
601 vect_nested_cycle;
602 }
603 else
604 {
605 if (vect_print_dump_info (REPORT_DETAILS))
606 fprintf (vect_dump, "Detected reduction.");
607
608 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
609 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
610 vect_reduction_def;
611 /* Store the reduction cycles for possible vectorization in
612 loop-aware SLP. */
613 VEC_safe_push (gimple, heap,
614 LOOP_VINFO_REDUCTIONS (loop_vinfo),
615 reduc_stmt);
616 }
617 }
618 }
619 else
620 if (vect_print_dump_info (REPORT_DETAILS))
621 fprintf (vect_dump, "Unknown def-use cycle pattern.");
622 }
623
624 VEC_free (gimple, heap, worklist);
625 }
626
627
628 /* Function vect_analyze_scalar_cycles.
629
630 Examine the cross iteration def-use cycles of scalar variables, by
631 analyzing the loop-header PHIs of scalar variables. Classify each
632 cycle as one of the following: invariant, induction, reduction, unknown.
633 We do that for the loop represented by LOOP_VINFO, and also for its
634 inner-loop, if it exists.
635 Examples for scalar cycles:
636
637 Example1: reduction:
638
639 loop1:
640 for (i=0; i<N; i++)
641 sum += a[i];
642
643 Example2: induction:
644
645 loop2:
646 for (i=0; i<N; i++)
647 a[i] = i; */
648
649 static void
650 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
651 {
652 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
653
654 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
655
656 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
657 Reductions in such an inner-loop therefore have different properties than
658 the reductions in the nest that gets vectorized:
659 1. When vectorized, they are executed in the same order as in the original
660 scalar loop, so we can't change the order of computation when
661 vectorizing them.
662 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
663 current checks are too strict. */
664
665 if (loop->inner)
666 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
667 }
668
669 /* Function vect_get_loop_niters.
670
671 Determine how many iterations the loop is executed.
672 If an expression that represents the number of iterations
673 can be constructed, place it in NUMBER_OF_ITERATIONS.
674 Return the loop exit condition. */
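/* For instance (a sketch, assuming the do-while shape this pass requires
   and a guard guaranteeing at least one iteration): for

     i = 0;
     do { a[i] = b[i] + c[i]; i++; } while (i < n);

   the expression stored in NUMBER_OF_ITERATIONS is (essentially) 'n', and
   the returned statement is the GIMPLE_COND for 'i < n'.  */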
675
676 static gimple
677 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
678 {
679 tree niters;
680
681 if (vect_print_dump_info (REPORT_DETAILS))
682 fprintf (vect_dump, "=== get_loop_niters ===");
683
684 niters = number_of_exit_cond_executions (loop);
685
686 if (niters != NULL_TREE
687 && niters != chrec_dont_know)
688 {
689 *number_of_iterations = niters;
690
691 if (vect_print_dump_info (REPORT_DETAILS))
692 {
693 fprintf (vect_dump, "==> get_loop_niters:" );
694 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
695 }
696 }
697
698 return get_loop_exit_condition (loop);
699 }
700
701
702 /* Function bb_in_loop_p
703
704 Used as predicate for dfs order traversal of the loop bbs. */
705
706 static bool
707 bb_in_loop_p (const_basic_block bb, const void *data)
708 {
709 const struct loop *const loop = (const struct loop *)data;
710 if (flow_bb_inside_loop_p (loop, bb))
711 return true;
712 return false;
713 }
714
715
716 /* Function new_loop_vec_info.
717
718 Create and initialize a new loop_vec_info struct for LOOP, as well as
719 stmt_vec_info structs for all the stmts in LOOP. */
720
721 static loop_vec_info
722 new_loop_vec_info (struct loop *loop)
723 {
724 loop_vec_info res;
725 basic_block *bbs;
726 gimple_stmt_iterator si;
727 unsigned int i, nbbs;
728
729 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
730 LOOP_VINFO_LOOP (res) = loop;
731
732 bbs = get_loop_body (loop);
733
734 /* Create/Update stmt_info for all stmts in the loop. */
735 for (i = 0; i < loop->num_nodes; i++)
736 {
737 basic_block bb = bbs[i];
738
739 /* BBs in a nested inner-loop will have been already processed (because
740 we will have called vect_analyze_loop_form for any nested inner-loop).
741 Therefore, for stmts in an inner-loop we just want to update the
742 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
743 loop_info of the outer-loop we are currently considering to vectorize
744 (instead of the loop_info of the inner-loop).
745 For stmts in other BBs we need to create a stmt_info from scratch. */
746 if (bb->loop_father != loop)
747 {
748 /* Inner-loop bb. */
749 gcc_assert (loop->inner && bb->loop_father == loop->inner);
750 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
751 {
752 gimple phi = gsi_stmt (si);
753 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
754 loop_vec_info inner_loop_vinfo =
755 STMT_VINFO_LOOP_VINFO (stmt_info);
756 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
757 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
758 }
759 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
760 {
761 gimple stmt = gsi_stmt (si);
762 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
763 loop_vec_info inner_loop_vinfo =
764 STMT_VINFO_LOOP_VINFO (stmt_info);
765 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
766 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
767 }
768 }
769 else
770 {
771 /* bb in current nest. */
772 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
773 {
774 gimple phi = gsi_stmt (si);
775 gimple_set_uid (phi, 0);
776 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
777 }
778
779 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
780 {
781 gimple stmt = gsi_stmt (si);
782 gimple_set_uid (stmt, 0);
783 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
784 }
785 }
786 }
787
788 /* CHECKME: We want to visit all BBs before their successors (except for
789 latch blocks, for which this assertion wouldn't hold). In the simple
790 case of the loop forms we allow, a dfs order of the BBs would be the same
791 as reversed postorder traversal, so we are safe. */
792
793 free (bbs);
794 bbs = XCNEWVEC (basic_block, loop->num_nodes);
795 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
796 bbs, loop->num_nodes, loop);
797 gcc_assert (nbbs == loop->num_nodes);
798
799 LOOP_VINFO_BBS (res) = bbs;
800 LOOP_VINFO_NITERS (res) = NULL;
801 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
802 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
803 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
804 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
805 LOOP_VINFO_VECT_FACTOR (res) = 0;
806 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
807 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
808 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
809 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
810 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
811 VEC_alloc (gimple, heap,
812 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
813 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
814 VEC_alloc (ddr_p, heap,
815 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
816 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
817 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
818 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
819 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
820 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
821 LOOP_VINFO_PEELING_HTAB (res) = NULL;
822 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
823
824 return res;
825 }
826
827
828 /* Function destroy_loop_vec_info.
829
830 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
831 stmts in the loop. */
832
833 void
834 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
835 {
836 struct loop *loop;
837 basic_block *bbs;
838 int nbbs;
839 gimple_stmt_iterator si;
840 int j;
841 VEC (slp_instance, heap) *slp_instances;
842 slp_instance instance;
843
844 if (!loop_vinfo)
845 return;
846
847 loop = LOOP_VINFO_LOOP (loop_vinfo);
848
849 bbs = LOOP_VINFO_BBS (loop_vinfo);
850 nbbs = loop->num_nodes;
851
852 if (!clean_stmts)
853 {
854 free (LOOP_VINFO_BBS (loop_vinfo));
855 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
856 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
857 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
858 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
859 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
860
861 free (loop_vinfo);
862 loop->aux = NULL;
863 return;
864 }
865
866 for (j = 0; j < nbbs; j++)
867 {
868 basic_block bb = bbs[j];
869 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
870 free_stmt_vec_info (gsi_stmt (si));
871
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
873 {
874 gimple stmt = gsi_stmt (si);
875 /* Free stmt_vec_info. */
876 free_stmt_vec_info (stmt);
877 gsi_next (&si);
878 }
879 }
880
881 free (LOOP_VINFO_BBS (loop_vinfo));
882 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
883 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
884 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
885 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
886 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
887 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
888 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
889 vect_free_slp_instance (instance);
890
891 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
892 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
893 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
894 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
895
896 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
897 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
898
899 free (loop_vinfo);
900 loop->aux = NULL;
901 }
902
903
904 /* Function vect_analyze_loop_1.
905
906 Apply a set of analyses on LOOP, and create a loop_vec_info struct
907 for it. The different analyses will record information in the
908 loop_vec_info struct. This is a subset of the analyses applied in
909 vect_analyze_loop, to be applied on an inner-loop nested in the loop
910 that is now considered for (outer-loop) vectorization. */
911
912 static loop_vec_info
913 vect_analyze_loop_1 (struct loop *loop)
914 {
915 loop_vec_info loop_vinfo;
916
917 if (vect_print_dump_info (REPORT_DETAILS))
918 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
919
920 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
921
922 loop_vinfo = vect_analyze_loop_form (loop);
923 if (!loop_vinfo)
924 {
925 if (vect_print_dump_info (REPORT_DETAILS))
926 fprintf (vect_dump, "bad inner-loop form.");
927 return NULL;
928 }
929
930 return loop_vinfo;
931 }
932
933
934 /* Function vect_analyze_loop_form.
935
936 Verify that certain CFG restrictions hold, including:
937 - the loop has a pre-header
938 - the loop has a single entry and exit
939 - the loop exit condition is simple enough, and the number of iterations
940 can be analyzed (a countable loop). */
941
942 loop_vec_info
943 vect_analyze_loop_form (struct loop *loop)
944 {
945 loop_vec_info loop_vinfo;
946 gimple loop_cond;
947 tree number_of_iterations = NULL;
948 loop_vec_info inner_loop_vinfo = NULL;
949
950 if (vect_print_dump_info (REPORT_DETAILS))
951 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
952
953 /* Different restrictions apply when we are considering an inner-most loop,
954 vs. an outer (nested) loop.
955 (FORNOW. May want to relax some of these restrictions in the future). */
956
957 if (!loop->inner)
958 {
959 /* Inner-most loop. We currently require that the number of BBs is
960 exactly 2 (the header and latch). Vectorizable inner-most loops
961 look like this:
962
963 (pre-header)
964 |
965 header <--------+
966 | | |
967 | +--> latch --+
968 |
969 (exit-bb) */
970
971 if (loop->num_nodes != 2)
972 {
973 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
974 fprintf (vect_dump, "not vectorized: control flow in loop.");
975 return NULL;
976 }
977
978 if (empty_block_p (loop->header))
979 {
980 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
981 fprintf (vect_dump, "not vectorized: empty loop.");
982 return NULL;
983 }
984 }
985 else
986 {
987 struct loop *innerloop = loop->inner;
988 edge entryedge;
989
990 /* Nested loop. We currently require that the loop is doubly-nested,
991 contains a single inner loop, and the number of BBs is exactly 5.
992 Vectorizable outer-loops look like this:
993
994 (pre-header)
995 |
996 header <---+
997 | |
998 inner-loop |
999 | |
1000 tail ------+
1001 |
1002 (exit-bb)
1003
1004 The inner-loop has the properties expected of inner-most loops
1005 as described above. */
1006
1007 if ((loop->inner)->inner || (loop->inner)->next)
1008 {
1009 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1010 fprintf (vect_dump, "not vectorized: multiple nested loops.");
1011 return NULL;
1012 }
1013
1014 /* Analyze the inner-loop. */
1015 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1016 if (!inner_loop_vinfo)
1017 {
1018 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1019 fprintf (vect_dump, "not vectorized: Bad inner loop.");
1020 return NULL;
1021 }
1022
1023 if (!expr_invariant_in_loop_p (loop,
1024 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1025 {
1026 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1027 fprintf (vect_dump,
1028 "not vectorized: inner-loop count not invariant.");
1029 destroy_loop_vec_info (inner_loop_vinfo, true);
1030 return NULL;
1031 }
1032
1033 if (loop->num_nodes != 5)
1034 {
1035 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036 fprintf (vect_dump, "not vectorized: control flow in loop.");
1037 destroy_loop_vec_info (inner_loop_vinfo, true);
1038 return NULL;
1039 }
1040
1041 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1042 entryedge = EDGE_PRED (innerloop->header, 0);
1043 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1044 entryedge = EDGE_PRED (innerloop->header, 1);
1045
1046 if (entryedge->src != loop->header
1047 || !single_exit (innerloop)
1048 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1049 {
1050 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1051 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1052 destroy_loop_vec_info (inner_loop_vinfo, true);
1053 return NULL;
1054 }
1055
1056 if (vect_print_dump_info (REPORT_DETAILS))
1057 fprintf (vect_dump, "Considering outer-loop vectorization.");
1058 }
1059
1060 if (!single_exit (loop)
1061 || EDGE_COUNT (loop->header->preds) != 2)
1062 {
1063 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1064 {
1065 if (!single_exit (loop))
1066 fprintf (vect_dump, "not vectorized: multiple exits.");
1067 else if (EDGE_COUNT (loop->header->preds) != 2)
1068 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1069 }
1070 if (inner_loop_vinfo)
1071 destroy_loop_vec_info (inner_loop_vinfo, true);
1072 return NULL;
1073 }
1074
1075 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1076 that the loop is represented as a do-while (with a proper if-guard
1077 before the loop if needed), where the loop header contains all the
1078 executable statements, and the latch is empty. */
1079 if (!empty_block_p (loop->latch)
1080 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1081 {
1082 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1083 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1084 if (inner_loop_vinfo)
1085 destroy_loop_vec_info (inner_loop_vinfo, true);
1086 return NULL;
1087 }
1088
1089 /* Make sure there exists a single-predecessor exit bb: */
1090 if (!single_pred_p (single_exit (loop)->dest))
1091 {
1092 edge e = single_exit (loop);
1093 if (!(e->flags & EDGE_ABNORMAL))
1094 {
1095 split_loop_exit_edge (e);
1096 if (vect_print_dump_info (REPORT_DETAILS))
1097 fprintf (vect_dump, "split exit edge.");
1098 }
1099 else
1100 {
1101 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1102 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1103 if (inner_loop_vinfo)
1104 destroy_loop_vec_info (inner_loop_vinfo, true);
1105 return NULL;
1106 }
1107 }
1108
1109 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1110 if (!loop_cond)
1111 {
1112 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1113 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1114 if (inner_loop_vinfo)
1115 destroy_loop_vec_info (inner_loop_vinfo, true);
1116 return NULL;
1117 }
1118
1119 if (!number_of_iterations)
1120 {
1121 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1122 fprintf (vect_dump,
1123 "not vectorized: number of iterations cannot be computed.");
1124 if (inner_loop_vinfo)
1125 destroy_loop_vec_info (inner_loop_vinfo, true);
1126 return NULL;
1127 }
1128
1129 if (chrec_contains_undetermined (number_of_iterations))
1130 {
1131 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1132 fprintf (vect_dump, "Infinite number of iterations.");
1133 if (inner_loop_vinfo)
1134 destroy_loop_vec_info (inner_loop_vinfo, true);
1135 return NULL;
1136 }
1137
1138 if (!NITERS_KNOWN_P (number_of_iterations))
1139 {
1140 if (vect_print_dump_info (REPORT_DETAILS))
1141 {
1142 fprintf (vect_dump, "Symbolic number of iterations is ");
1143 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1144 }
1145 }
1146 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1147 {
1148 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1149 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1150 if (inner_loop_vinfo)
1151 destroy_loop_vec_info (inner_loop_vinfo, false);
1152 return NULL;
1153 }
1154
1155 loop_vinfo = new_loop_vec_info (loop);
1156 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1157 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1158
1159 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1160
1161 /* CHECKME: May want to keep it around in the future. */
1162 if (inner_loop_vinfo)
1163 destroy_loop_vec_info (inner_loop_vinfo, false);
1164
1165 gcc_assert (!loop->aux);
1166 loop->aux = loop_vinfo;
1167 return loop_vinfo;
1168 }
1169
1170
1171 /* Get cost by calling cost target builtin. */
1172
1173 static inline int
1174 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1175 {
1176 tree dummy_type = NULL;
1177 int dummy = 0;
1178
1179 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1180 dummy_type, dummy);
1181 }
1182
1183
1184 /* Function vect_analyze_loop_operations.
1185
1186 Scan the loop stmts and make sure they are all vectorizable. */
1187
1188 static bool
1189 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1190 {
1191 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1192 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1193 int nbbs = loop->num_nodes;
1194 gimple_stmt_iterator si;
1195 unsigned int vectorization_factor = 0;
1196 int i;
1197 gimple phi;
1198 stmt_vec_info stmt_info;
1199 bool need_to_vectorize = false;
1200 int min_profitable_iters;
1201 int min_scalar_loop_bound;
1202 unsigned int th;
1203 bool only_slp_in_loop = true, ok;
1204
1205 if (vect_print_dump_info (REPORT_DETAILS))
1206 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1207
1208 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1209 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1210 if (slp)
1211 {
1212 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1213 the vectorization factor of the loop is the unrolling factor required by
1214 the SLP instances. If that unrolling factor is 1, we say that we
1215 perform pure SLP on the loop - cross-iteration parallelism is not
1216 exploited. */
1217 for (i = 0; i < nbbs; i++)
1218 {
1219 basic_block bb = bbs[i];
1220 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1221 {
1222 gimple stmt = gsi_stmt (si);
1223 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1224 gcc_assert (stmt_info);
1225 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1226 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1227 && !PURE_SLP_STMT (stmt_info))
1228 /* STMT needs both SLP and loop-based vectorization. */
1229 only_slp_in_loop = false;
1230 }
1231 }
1232
1233 if (only_slp_in_loop)
1234 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1235 else
1236 vectorization_factor = least_common_multiple (vectorization_factor,
1237 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1238
1239 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1240 if (vect_print_dump_info (REPORT_DETAILS))
1241 fprintf (vect_dump, "Updating vectorization factor to %d ",
1242 vectorization_factor);
1243 }
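/* For instance, in the hybrid case with a loop VF of 4: an SLP unrolling
   factor of 2 leaves the factor at least_common_multiple (4, 2) = 4, while
   an unrolling factor of 3 would raise it to 12.  */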
1244
1245 for (i = 0; i < nbbs; i++)
1246 {
1247 basic_block bb = bbs[i];
1248
1249 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1250 {
1251 phi = gsi_stmt (si);
1252 ok = true;
1253
1254 stmt_info = vinfo_for_stmt (phi);
1255 if (vect_print_dump_info (REPORT_DETAILS))
1256 {
1257 fprintf (vect_dump, "examining phi: ");
1258 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1259 }
1260
1261 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1262 (i.e., a phi in the tail of the outer-loop). */
1263 if (! is_loop_header_bb_p (bb))
1264 {
1265 /* FORNOW: we currently don't support the case that these phis
1266 are not used in the outer-loop (unless it is a double reduction,
1267 i.e., this phi is vect_reduction_def), because this case
1268 requires us to actually do something here.
1269 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1270 || STMT_VINFO_LIVE_P (stmt_info))
1271 && STMT_VINFO_DEF_TYPE (stmt_info)
1272 != vect_double_reduction_def)
1273 {
1274 if (vect_print_dump_info (REPORT_DETAILS))
1275 fprintf (vect_dump,
1276 "Unsupported loop-closed phi in outer-loop.");
1277 return false;
1278 }
1279
1280 /* If PHI is used in the outer loop, we check that its operand
1281 is defined in the inner loop. */
1282 if (STMT_VINFO_RELEVANT_P (stmt_info))
1283 {
1284 tree phi_op;
1285 gimple op_def_stmt;
1286
1287 if (gimple_phi_num_args (phi) != 1)
1288 return false;
1289
1290 phi_op = PHI_ARG_DEF (phi, 0);
1291 if (TREE_CODE (phi_op) != SSA_NAME)
1292 return false;
1293
1294 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1295 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1296 return false;
1297
1298 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1299 != vect_used_in_outer
1300 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1301 != vect_used_in_outer_by_reduction)
1302 return false;
1303 }
1304
1305 continue;
1306 }
1307
1308 gcc_assert (stmt_info);
1309
1310 if (STMT_VINFO_LIVE_P (stmt_info))
1311 {
1312 /* FORNOW: not yet supported. */
1313 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1314 fprintf (vect_dump, "not vectorized: value used after loop.");
1315 return false;
1316 }
1317
1318 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1319 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1320 {
1321 /* A scalar-dependence cycle that we don't support. */
1322 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1323 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1324 return false;
1325 }
1326
1327 if (STMT_VINFO_RELEVANT_P (stmt_info))
1328 {
1329 need_to_vectorize = true;
1330 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1331 ok = vectorizable_induction (phi, NULL, NULL);
1332 }
1333
1334 if (!ok)
1335 {
1336 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1337 {
1338 fprintf (vect_dump,
1339 "not vectorized: relevant phi not supported: ");
1340 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1341 }
1342 return false;
1343 }
1344 }
1345
1346 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1347 {
1348 gimple stmt = gsi_stmt (si);
1349 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1350 return false;
1351 }
1352 } /* bbs */
1353
1354 /* All operations in the loop are either irrelevant (deal with loop
1355 control, or dead), or only used outside the loop and can be moved
1356 out of the loop (e.g. invariants, inductions). The loop can be
1357 optimized away by scalar optimizations. We're better off not
1358 touching this loop. */
1359 if (!need_to_vectorize)
1360 {
1361 if (vect_print_dump_info (REPORT_DETAILS))
1362 fprintf (vect_dump,
1363 "All the computation can be taken out of the loop.");
1364 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1365 fprintf (vect_dump,
1366 "not vectorized: redundant loop. no profit to vectorize.");
1367 return false;
1368 }
1369
1370 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1371 && vect_print_dump_info (REPORT_DETAILS))
1372 fprintf (vect_dump,
1373 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1374 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1375
1376 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1377 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1378 {
1379 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1380 fprintf (vect_dump, "not vectorized: iteration count too small.");
1381 if (vect_print_dump_info (REPORT_DETAILS))
1382 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1383 "vectorization factor.");
1384 return false;
1385 }
1386
1387 /* Analyze cost. Decide if worth while to vectorize. */
1388
1389 /* Once VF is set, SLP costs should be updated since the number of created
1390 vector stmts depends on VF. */
1391 vect_update_slp_costs_according_to_vf (loop_vinfo);
1392
1393 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1394 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1395
1396 if (min_profitable_iters < 0)
1397 {
1398 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1399 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1400 if (vect_print_dump_info (REPORT_DETAILS))
1401 fprintf (vect_dump, "not vectorized: vector version will never be "
1402 "profitable.");
1403 return false;
1404 }
1405
1406 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1407 * vectorization_factor) - 1);
1408
1409 /* Use the cost model only if it is more conservative than the user-specified
1410 threshold. */
1411
1412 th = (unsigned) min_scalar_loop_bound;
1413 if (min_profitable_iters
1414 && (!min_scalar_loop_bound
1415 || min_profitable_iters > min_scalar_loop_bound))
1416 th = (unsigned) min_profitable_iters;
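/* A worked example (hypothetical numbers): with --param min-vect-loop-bound=2
   and a VF of 4, min_scalar_loop_bound is 2 * 4 - 1 = 7; if the cost model
   returned min_profitable_iters = 10, TH becomes 10, and a known iteration
   count of 10 or less makes us give up below.  */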
1417
1418 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1419 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1420 {
1421 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1422 fprintf (vect_dump, "not vectorized: vectorization not "
1423 "profitable.");
1424 if (vect_print_dump_info (REPORT_DETAILS))
1425 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1426 "user specified loop bound parameter or minimum "
1427 "profitable iterations (whichever is more conservative).");
1428 return false;
1429 }
1430
1431 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1432 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1433 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1434 {
1435 if (vect_print_dump_info (REPORT_DETAILS))
1436 fprintf (vect_dump, "epilog loop required.");
1437 if (!vect_can_advance_ivs_p (loop_vinfo))
1438 {
1439 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1440 fprintf (vect_dump,
1441 "not vectorized: can't create epilog loop 1.");
1442 return false;
1443 }
1444 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1445 {
1446 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1447 fprintf (vect_dump,
1448 "not vectorized: can't create epilog loop 2.");
1449 return false;
1450 }
1451 }
1452
1453 return true;
1454 }
1455
1456
1457 /* Function vect_analyze_loop_2.
1458
1459 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1460 for it. The different analyses will record information in the
1461 loop_vec_info struct. */
1462 static bool
1463 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1464 {
1465 bool ok, slp = false;
1466 int max_vf = MAX_VECTORIZATION_FACTOR;
1467 int min_vf = 2;
1468
1469 /* Find all data references in the loop (which correspond to vdefs/vuses)
1470 and analyze their evolution in the loop. Also adjust the minimal
1471 vectorization factor according to the loads and stores.
1472
1473 FORNOW: Handle only simple array references, whose
1474 alignment can be forced, and aligned pointer-references. */
1475
1476 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1477 if (!ok)
1478 {
1479 if (vect_print_dump_info (REPORT_DETAILS))
1480 fprintf (vect_dump, "bad data references.");
1481 return false;
1482 }
1483
1484 /* Classify all cross-iteration scalar data-flow cycles.
1485 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1486
1487 vect_analyze_scalar_cycles (loop_vinfo);
1488
1489 vect_pattern_recog (loop_vinfo);
1490
1491 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1492
1493 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1494 if (!ok)
1495 {
1496 if (vect_print_dump_info (REPORT_DETAILS))
1497 fprintf (vect_dump, "unexpected pattern.");
1498 return false;
1499 }
1500
1501 /* Analyze data dependences between the data-refs in the loop
1502 and adjust the maximum vectorization factor according to
1503 the dependences.
1504 FORNOW: fail at the first data dependence that we encounter. */
1505
1506 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1507 if (!ok
1508 || max_vf < min_vf)
1509 {
1510 if (vect_print_dump_info (REPORT_DETAILS))
1511 fprintf (vect_dump, "bad data dependence.");
1512 return false;
1513 }
1514
1515 ok = vect_determine_vectorization_factor (loop_vinfo);
1516 if (!ok)
1517 {
1518 if (vect_print_dump_info (REPORT_DETAILS))
1519 fprintf (vect_dump, "can't determine vectorization factor.");
1520 return false;
1521 }
1522 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1523 {
1524 if (vect_print_dump_info (REPORT_DETAILS))
1525 fprintf (vect_dump, "bad data dependence.");
1526 return false;
1527 }
1528
1529 /* Analyze the alignment of the data-refs in the loop.
1530 Fail if a data reference is found that cannot be vectorized. */
1531
1532 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1533 if (!ok)
1534 {
1535 if (vect_print_dump_info (REPORT_DETAILS))
1536 fprintf (vect_dump, "bad data alignment.");
1537 return false;
1538 }
1539
1540 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1541 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1542
1543 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1544 if (!ok)
1545 {
1546 if (vect_print_dump_info (REPORT_DETAILS))
1547 fprintf (vect_dump, "bad data access.");
1548 return false;
1549 }
1550
1551 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1552 It is important to call pruning after vect_analyze_data_ref_accesses,
1553 since we use grouping information gathered by interleaving analysis. */
1554 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1555 if (!ok)
1556 {
1557 if (vect_print_dump_info (REPORT_DETAILS))
1558 fprintf (vect_dump, "too long list of versioning for alias "
1559 "run-time tests.");
1560 return false;
1561 }
1562
1563 /* This pass will decide on using loop versioning and/or loop peeling in
1564 order to enhance the alignment of data references in the loop. */
1565
1566 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1567 if (!ok)
1568 {
1569 if (vect_print_dump_info (REPORT_DETAILS))
1570 fprintf (vect_dump, "bad data alignment.");
1571 return false;
1572 }
1573
1574 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1575 ok = vect_analyze_slp (loop_vinfo, NULL);
1576 if (ok)
1577 {
1578 /* Decide which possible SLP instances to SLP. */
1579 slp = vect_make_slp_decision (loop_vinfo);
1580
1581 /* Find stmts that need to be both vectorized and SLPed. */
1582 vect_detect_hybrid_slp (loop_vinfo);
1583 }
1584 else
1585 return false;
1586
1587 /* Scan all the operations in the loop and make sure they are
1588 vectorizable. */
1589
1590 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1591 if (!ok)
1592 {
1593 if (vect_print_dump_info (REPORT_DETAILS))
1594 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1595 return false;
1596 }
1597
1598 return true;
1599 }
1600
1601 /* Function vect_analyze_loop.
1602
1603 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1604 for it. The different analyses will record information in the
1605 loop_vec_info struct. */
1606 loop_vec_info
1607 vect_analyze_loop (struct loop *loop)
1608 {
1609 loop_vec_info loop_vinfo;
1610 unsigned int vector_sizes;
1611
1612 /* Autodetect first vector size we try. */
1613 current_vector_size = 0;
1614 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1615
1616 if (vect_print_dump_info (REPORT_DETAILS))
1617 fprintf (vect_dump, "===== analyze_loop_nest =====");
1618
1619 if (loop_outer (loop)
1620 && loop_vec_info_for_loop (loop_outer (loop))
1621 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1622 {
1623 if (vect_print_dump_info (REPORT_DETAILS))
1624 fprintf (vect_dump, "outer-loop already vectorized.");
1625 return NULL;
1626 }
1627
1628 while (1)
1629 {
1630 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1631 loop_vinfo = vect_analyze_loop_form (loop);
1632 if (!loop_vinfo)
1633 {
1634 if (vect_print_dump_info (REPORT_DETAILS))
1635 fprintf (vect_dump, "bad loop form.");
1636 return NULL;
1637 }
1638
1639 if (vect_analyze_loop_2 (loop_vinfo))
1640 {
1641 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1642
1643 return loop_vinfo;
1644 }
1645
1646 destroy_loop_vec_info (loop_vinfo, true);
1647
1648 vector_sizes &= ~current_vector_size;
1649 if (vector_sizes == 0
1650 || current_vector_size == 0)
1651 return NULL;
1652
1653 /* Try the next biggest vector size. */
1654 current_vector_size = 1 << floor_log2 (vector_sizes);
1655 if (vect_print_dump_info (REPORT_DETAILS))
1656 fprintf (vect_dump, "***** Re-trying analysis with "
1657 "vector size %d\n", current_vector_size);
1658 }
1659 }
1660
1661
1662 /* Function reduction_code_for_scalar_code
1663
1664 Input:
1665 CODE - tree_code of a reduction operation.
1666
1667 Output:
1668 REDUC_CODE - the corresponding tree-code to be used to reduce the
1669 vector of partial results into a single scalar result (which
1670 will also reside in a vector) or ERROR_MARK if the operation is
1671 a supported reduction operation, but does not have such a tree-code.
1672
1673 Return FALSE if CODE currently cannot be vectorized as reduction. */
1674
1675 static bool
1676 reduction_code_for_scalar_code (enum tree_code code,
1677 enum tree_code *reduc_code)
1678 {
1679 switch (code)
1680 {
1681 case MAX_EXPR:
1682 *reduc_code = REDUC_MAX_EXPR;
1683 return true;
1684
1685 case MIN_EXPR:
1686 *reduc_code = REDUC_MIN_EXPR;
1687 return true;
1688
1689 case PLUS_EXPR:
1690 *reduc_code = REDUC_PLUS_EXPR;
1691 return true;
1692
1693 case MULT_EXPR:
1694 case MINUS_EXPR:
1695 case BIT_IOR_EXPR:
1696 case BIT_XOR_EXPR:
1697 case BIT_AND_EXPR:
1698 *reduc_code = ERROR_MARK;
1699 return true;
1700
1701 default:
1702 return false;
1703 }
1704 }
1705
1706
1707 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1708 STMT is printed with a message MSG. */
1709
1710 static void
1711 report_vect_op (gimple stmt, const char *msg)
1712 {
1713 fprintf (vect_dump, "%s", msg);
1714 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1715 }
1716
1717
1718 /* Detect SLP reduction of the form:
1719
1720 #a1 = phi <a5, a0>
1721 a2 = operation (a1)
1722 a3 = operation (a2)
1723 a4 = operation (a3)
1724 a5 = operation (a4)
1725
1726 #a = phi <a5>
1727
1728 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1729 FIRST_STMT is the first reduction stmt in the chain
1730 (a2 = operation (a1)).
1731
1732 Return TRUE if a reduction chain was detected. */
1733
1734 static bool
1735 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1736 {
1737 struct loop *loop = (gimple_bb (phi))->loop_father;
1738 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1739 enum tree_code code;
1740 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1741 stmt_vec_info use_stmt_info, current_stmt_info;
1742 tree lhs;
1743 imm_use_iterator imm_iter;
1744 use_operand_p use_p;
1745 int nloop_uses, size = 0, n_out_of_loop_uses;
1746 bool found = false;
1747
1748 if (loop != vect_loop)
1749 return false;
1750
1751 lhs = PHI_RESULT (phi);
1752 code = gimple_assign_rhs_code (first_stmt);
1753 while (1)
1754 {
1755 nloop_uses = 0;
1756 n_out_of_loop_uses = 0;
1757 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1758 {
1759 gimple use_stmt = USE_STMT (use_p);
1760 if (is_gimple_debug (use_stmt))
1761 continue;
1762
1763 use_stmt = USE_STMT (use_p);
1764
1765 /* Check if we got back to the reduction phi. */
1766 if (use_stmt == phi)
1767 {
1768 loop_use_stmt = use_stmt;
1769 found = true;
1770 break;
1771 }
1772
1773 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1774 {
1775 if (vinfo_for_stmt (use_stmt)
1776 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1777 {
1778 loop_use_stmt = use_stmt;
1779 nloop_uses++;
1780 }
1781 }
1782 else
1783 n_out_of_loop_uses++;
1784
1785 /* There can be either a single use in the loop or two uses in
1786 phi nodes. */
1787 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1788 return false;
1789 }
1790
1791 if (found)
1792 break;
1793
1794 /* We reached a statement with no loop uses. */
1795 if (nloop_uses == 0)
1796 return false;
1797
1798 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1799 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1800 return false;
1801
1802 if (!is_gimple_assign (loop_use_stmt)
1803 || code != gimple_assign_rhs_code (loop_use_stmt)
1804 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1805 return false;
1806
1807 /* Insert USE_STMT into reduction chain. */
1808 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1809 if (current_stmt)
1810 {
1811 current_stmt_info = vinfo_for_stmt (current_stmt);
1812 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1813 GROUP_FIRST_ELEMENT (use_stmt_info)
1814 = GROUP_FIRST_ELEMENT (current_stmt_info);
1815 }
1816 else
1817 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1818
1819 lhs = gimple_assign_lhs (loop_use_stmt);
1820 current_stmt = loop_use_stmt;
1821 size++;
1822 }
1823
1824 if (!found || loop_use_stmt != phi || size < 2)
1825 return false;
1826
1827 /* Swap the operands, if needed, to make the reduction operand be the second
1828 operand. */
1829 lhs = PHI_RESULT (phi);
1830 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1831 while (next_stmt)
1832 {
1833 if (gimple_assign_rhs2 (next_stmt) == lhs)
1834 {
1835 tree op = gimple_assign_rhs1 (next_stmt);
1836 gimple def_stmt = NULL;
1837
1838 if (TREE_CODE (op) == SSA_NAME)
1839 def_stmt = SSA_NAME_DEF_STMT (op);
1840
1841 /* Check that the other def is either defined in the loop
1842 ("vect_internal_def"), or it's an induction (defined by a
1843 loop-header phi-node). */
1844 if (def_stmt
1845 && gimple_bb (def_stmt)
1846 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1847 && (is_gimple_assign (def_stmt)
1848 || is_gimple_call (def_stmt)
1849 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1850 == vect_induction_def
1851 || (gimple_code (def_stmt) == GIMPLE_PHI
1852 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1853 == vect_internal_def
1854 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1855 {
1856 lhs = gimple_assign_lhs (next_stmt);
1857 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1858 continue;
1859 }
1860
1861 return false;
1862 }
1863 else
1864 {
1865 tree op = gimple_assign_rhs2 (next_stmt);
1866 gimple def_stmt = NULL;
1867
1868 if (TREE_CODE (op) == SSA_NAME)
1869 def_stmt = SSA_NAME_DEF_STMT (op);
1870
1871 /* Check that the other def is either defined in the loop
1872 ("vect_internal_def"), or it's an induction (defined by a
1873 loop-header phi-node). */
1874 if (def_stmt
1875 && gimple_bb (def_stmt)
1876 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1877 && (is_gimple_assign (def_stmt)
1878 || is_gimple_call (def_stmt)
1879 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1880 == vect_induction_def
1881 || (gimple_code (def_stmt) == GIMPLE_PHI
1882 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1883 == vect_internal_def
1884 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1885 {
1886 if (vect_print_dump_info (REPORT_DETAILS))
1887 {
1888 fprintf (vect_dump, "swapping oprnds: ");
1889 print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
1890 }
1891
1892 swap_tree_operands (next_stmt,
1893 gimple_assign_rhs1_ptr (next_stmt),
1894 gimple_assign_rhs2_ptr (next_stmt));
1895 mark_symbols_for_renaming (next_stmt);
1896 }
1897 else
1898 return false;
1899 }
1900
1901 lhs = gimple_assign_lhs (next_stmt);
1902 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1903 }
1904
1905 /* Save the chain for further analysis in SLP detection. */
1906 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1907 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1908 GROUP_SIZE (vinfo_for_stmt (first)) = size;
1909
1910 return true;
1911 }
1912
1913
1914 /* Function vect_is_simple_reduction_1
1915
1916 (1) Detect a cross-iteration def-use cycle that represents a simple
1917 reduction computation. We look for the following pattern:
1918
1919 loop_header:
1920 a1 = phi < a0, a2 >
1921 a3 = ...
1922 a2 = operation (a3, a1)
1923
1924 such that:
1925 1. operation is commutative and associative and it is safe to
1926 change the order of the computation (if CHECK_REDUCTION is true)
1927 2. no uses for a2 in the loop (a2 is used out of the loop)
1928 3. no uses of a1 in the loop besides the reduction operation
1929 4. no uses of a1 outside the loop.
1930
1931 Conditions 1,4 are tested here.
1932 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1933
1934 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1935 nested cycles, if CHECK_REDUCTION is false.
1936
1937 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1938 reductions:
1939
1940 a1 = phi < a0, a2 >
1941 inner loop (def of a3)
1942 a2 = phi < a3 >
1943
1944 If MODIFY is true it also tries to rework the code in-place to enable
1945 detection of more reduction patterns. For the time being we rewrite
1946 "res -= RHS" into "res += -RHS" when it seems worthwhile.
1947 */
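
/* An illustrative source form of case (3), a double reduction (the
   variable names are made up for this sketch):

     s = 0;
     for (j = 0; j < m; j++)      <-- outer loop, being vectorized
       for (i = 0; i < n; i++)    <-- inner loop
         s += a[j][i];

   Here the outer-loop header phi for s corresponds to a1 above, the
   inner-loop exit phi to a2, and the inner-loop addition defines a3.  */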
1948
1949 static gimple
1950 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1951 bool check_reduction, bool *double_reduc,
1952 bool modify)
1953 {
1954 struct loop *loop = (gimple_bb (phi))->loop_father;
1955 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1956 edge latch_e = loop_latch_edge (loop);
1957 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1958 gimple def_stmt, def1 = NULL, def2 = NULL;
1959 enum tree_code orig_code, code;
1960 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1961 tree type;
1962 int nloop_uses;
1963 tree name;
1964 imm_use_iterator imm_iter;
1965 use_operand_p use_p;
1966 bool phi_def;
1967
1968 *double_reduc = false;
1969
1970 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1971 otherwise, we assume outer loop vectorization. */
1972 gcc_assert ((check_reduction && loop == vect_loop)
1973 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1974
1975 name = PHI_RESULT (phi);
1976 nloop_uses = 0;
1977 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1978 {
1979 gimple use_stmt = USE_STMT (use_p);
1980 if (is_gimple_debug (use_stmt))
1981 continue;
1982
1983 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1984 {
1985 if (vect_print_dump_info (REPORT_DETAILS))
1986 fprintf (vect_dump, "intermediate value used outside loop.");
1987
1988 return NULL;
1989 }
1990
1991 if (vinfo_for_stmt (use_stmt)
1992 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1993 nloop_uses++;
1994 if (nloop_uses > 1)
1995 {
1996 if (vect_print_dump_info (REPORT_DETAILS))
1997 fprintf (vect_dump, "reduction used in loop.");
1998 return NULL;
1999 }
2000 }
2001
2002 if (TREE_CODE (loop_arg) != SSA_NAME)
2003 {
2004 if (vect_print_dump_info (REPORT_DETAILS))
2005 {
2006 fprintf (vect_dump, "reduction: not ssa_name: ");
2007 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2008 }
2009 return NULL;
2010 }
2011
2012 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2013 if (!def_stmt)
2014 {
2015 if (vect_print_dump_info (REPORT_DETAILS))
2016 fprintf (vect_dump, "reduction: no def_stmt.");
2017 return NULL;
2018 }
2019
2020 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2021 {
2022 if (vect_print_dump_info (REPORT_DETAILS))
2023 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2024 return NULL;
2025 }
2026
2027 if (is_gimple_assign (def_stmt))
2028 {
2029 name = gimple_assign_lhs (def_stmt);
2030 phi_def = false;
2031 }
2032 else
2033 {
2034 name = PHI_RESULT (def_stmt);
2035 phi_def = true;
2036 }
2037
2038 nloop_uses = 0;
2039 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2040 {
2041 gimple use_stmt = USE_STMT (use_p);
2042 if (is_gimple_debug (use_stmt))
2043 continue;
2044 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2045 && vinfo_for_stmt (use_stmt)
2046 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2047 nloop_uses++;
2048 if (nloop_uses > 1)
2049 {
2050 if (vect_print_dump_info (REPORT_DETAILS))
2051 fprintf (vect_dump, "reduction used in loop.");
2052 return NULL;
2053 }
2054 }
2055
2056 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2057 defined in the inner loop. */
2058 if (phi_def)
2059 {
2060 op1 = PHI_ARG_DEF (def_stmt, 0);
2061
2062 if (gimple_phi_num_args (def_stmt) != 1
2063 || TREE_CODE (op1) != SSA_NAME)
2064 {
2065 if (vect_print_dump_info (REPORT_DETAILS))
2066 fprintf (vect_dump, "unsupported phi node definition.");
2067
2068 return NULL;
2069 }
2070
2071 def1 = SSA_NAME_DEF_STMT (op1);
2072 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2073 && loop->inner
2074 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2075 && is_gimple_assign (def1))
2076 {
2077 if (vect_print_dump_info (REPORT_DETAILS))
2078 report_vect_op (def_stmt, "detected double reduction: ");
2079
2080 *double_reduc = true;
2081 return def_stmt;
2082 }
2083
2084 return NULL;
2085 }
2086
2087 code = orig_code = gimple_assign_rhs_code (def_stmt);
2088
2089 /* We can handle "res -= x[i]", which is non-associative, by
2090 simply rewriting it into "res += -x[i]". Avoid changing the
2091 gimple instruction for the first simple tests and only do this
2092 if we're allowed to change code at all. */
2093 if (code == MINUS_EXPR
2094 && modify
2095 && (op1 = gimple_assign_rhs1 (def_stmt))
2096 && TREE_CODE (op1) == SSA_NAME
2097 && SSA_NAME_DEF_STMT (op1) == phi)
2098 code = PLUS_EXPR;
2099
2100 if (check_reduction
2101 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2102 {
2103 if (vect_print_dump_info (REPORT_DETAILS))
2104 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2105 return NULL;
2106 }
2107
2108 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2109 {
2110 if (code != COND_EXPR)
2111 {
2112 if (vect_print_dump_info (REPORT_DETAILS))
2113 report_vect_op (def_stmt, "reduction: not binary operation: ");
2114
2115 return NULL;
2116 }
2117
2118 op3 = gimple_assign_rhs1 (def_stmt);
2119 if (COMPARISON_CLASS_P (op3))
2120 {
2121 op4 = TREE_OPERAND (op3, 1);
2122 op3 = TREE_OPERAND (op3, 0);
2123 }
2124
2125 op1 = gimple_assign_rhs2 (def_stmt);
2126 op2 = gimple_assign_rhs3 (def_stmt);
2127
2128 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2129 {
2130 if (vect_print_dump_info (REPORT_DETAILS))
2131 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2132
2133 return NULL;
2134 }
2135 }
2136 else
2137 {
2138 op1 = gimple_assign_rhs1 (def_stmt);
2139 op2 = gimple_assign_rhs2 (def_stmt);
2140
2141 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2142 {
2143 if (vect_print_dump_info (REPORT_DETAILS))
2144 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2145
2146 return NULL;
2147 }
2148 }
2149
2150 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2151 if ((TREE_CODE (op1) == SSA_NAME
2152 && !types_compatible_p (type, TREE_TYPE (op1)))
2153 || (TREE_CODE (op2) == SSA_NAME
2154 && !types_compatible_p (type, TREE_TYPE (op2)))
2155 || (op3 && TREE_CODE (op3) == SSA_NAME
2156 && !types_compatible_p (type, TREE_TYPE (op3)))
2157 || (op4 && TREE_CODE (op4) == SSA_NAME
2158 && !types_compatible_p (type, TREE_TYPE (op4))))
2159 {
2160 if (vect_print_dump_info (REPORT_DETAILS))
2161 {
2162 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2163 print_generic_expr (vect_dump, type, TDF_SLIM);
2164 fprintf (vect_dump, ", operands types: ");
2165 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2166 fprintf (vect_dump, ",");
2167 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2168 if (op3)
2169 {
2170 fprintf (vect_dump, ",");
2171 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2172 }
2173
2174 if (op4)
2175 {
2176 fprintf (vect_dump, ",");
2177 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2178 }
2179 }
2180
2181 return NULL;
2182 }
2183
2184 /* Check that it's ok to change the order of the computation.
2185 Generally, when vectorizing a reduction we change the order of the
2186 computation. This may change the behavior of the program in some
2187 cases, so we need to check that this is ok. One exception is when
2188 vectorizing an outer-loop: the inner-loop is executed sequentially,
2189 and therefore vectorizing reductions in the inner-loop during
2190 outer-loop vectorization is safe. */
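
/* For example (a sketch with made-up numbers): with a 4-wide vector sum
   the scalar order a0+a1+a2+a3+... becomes
     (a0+a4+...) + (a1+a5+...) + (a2+a6+...) + (a3+a7+...)
   which can round differently for floating point, can saturate at a
   different point for saturating fixed-point types, and can move or
   remove a signed-overflow trap when -ftrapv is in effect; hence the
   three bail-outs below when CHECK_REDUCTION is set.  */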
2191
2192 /* CHECKME: check for !flag_finite_math_only too? */
2193 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2194 && check_reduction)
2195 {
2196 /* Changing the order of operations changes the semantics. */
2197 if (vect_print_dump_info (REPORT_DETAILS))
2198 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2199 return NULL;
2200 }
2201 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2202 && check_reduction)
2203 {
2204 /* Changing the order of operations changes the semantics. */
2205 if (vect_print_dump_info (REPORT_DETAILS))
2206 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2207 return NULL;
2208 }
2209 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2210 {
2211 /* Changing the order of operations changes the semantics. */
2212 if (vect_print_dump_info (REPORT_DETAILS))
2213 report_vect_op (def_stmt,
2214 "reduction: unsafe fixed-point math optimization: ");
2215 return NULL;
2216 }
2217
2218 /* If we detected "res -= x[i]" earlier, rewrite it into
2219 "res += -x[i]" now. If this turns out to be useless reassoc
2220 will clean it up again. */
2221 if (orig_code == MINUS_EXPR)
2222 {
2223 tree rhs = gimple_assign_rhs2 (def_stmt);
2224 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2225 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2226 rhs, NULL);
2227 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2228 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2229 loop_info, NULL));
2230 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2231 gimple_assign_set_rhs2 (def_stmt, negrhs);
2232 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2233 update_stmt (def_stmt);
2234 }
2235
2236 /* Reduction is safe. We're dealing with one of the following:
2237 1) integer arithmetic and no trapv
2238 2) floating point arithmetic, and special flags permit this optimization
2239 3) nested cycle (i.e., outer loop vectorization). */
2240 if (TREE_CODE (op1) == SSA_NAME)
2241 def1 = SSA_NAME_DEF_STMT (op1);
2242
2243 if (TREE_CODE (op2) == SSA_NAME)
2244 def2 = SSA_NAME_DEF_STMT (op2);
2245
2246 if (code != COND_EXPR
2247 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2248 {
2249 if (vect_print_dump_info (REPORT_DETAILS))
2250 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2251 return NULL;
2252 }
2253
2254 /* Check that one def is the reduction def, defined by PHI,
2255 the other def is either defined in the loop ("vect_internal_def"),
2256 or it's an induction (defined by a loop-header phi-node). */
2257
2258 if (def2 && def2 == phi
2259 && (code == COND_EXPR
2260 || !def1 || gimple_nop_p (def1)
2261 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2262 && (is_gimple_assign (def1)
2263 || is_gimple_call (def1)
2264 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2265 == vect_induction_def
2266 || (gimple_code (def1) == GIMPLE_PHI
2267 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2268 == vect_internal_def
2269 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2270 {
2271 if (vect_print_dump_info (REPORT_DETAILS))
2272 report_vect_op (def_stmt, "detected reduction: ");
2273 return def_stmt;
2274 }
2275
2276 if (def1 && def1 == phi
2277 && (code == COND_EXPR
2278 || !def2 || gimple_nop_p (def2)
2279 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2280 && (is_gimple_assign (def2)
2281 || is_gimple_call (def2)
2282 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2283 == vect_induction_def
2284 || (gimple_code (def2) == GIMPLE_PHI
2285 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2286 == vect_internal_def
2287 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2288 {
2289 if (check_reduction)
2290 {
2291 /* Swap operands (just for simplicity - so that the rest of the code
2292 can assume that the reduction variable is always the last (second)
2293 argument). */
2294 if (vect_print_dump_info (REPORT_DETAILS))
2295 report_vect_op (def_stmt,
2296 "detected reduction: need to swap operands: ");
2297
2298 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2299 gimple_assign_rhs2_ptr (def_stmt));
2300 }
2301 else
2302 {
2303 if (vect_print_dump_info (REPORT_DETAILS))
2304 report_vect_op (def_stmt, "detected reduction: ");
2305 }
2306
2307 return def_stmt;
2308 }
2309
2310 /* Try to find SLP reduction chain. */
2311 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2312 {
2313 if (vect_print_dump_info (REPORT_DETAILS))
2314 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2315
2316 return def_stmt;
2317 }
2318
2319 if (vect_print_dump_info (REPORT_DETAILS))
2320 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2321
2322 return NULL;
2323 }
2324
2325 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2326 in-place. Arguments as there. */
2327
2328 static gimple
2329 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2330 bool check_reduction, bool *double_reduc)
2331 {
2332 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2333 double_reduc, false);
2334 }
2335
2336 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2337 in-place if it enables detection of more reductions. Arguments
2338 as there. */
2339
2340 gimple
2341 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2342 bool check_reduction, bool *double_reduc)
2343 {
2344 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2345 double_reduc, true);
2346 }
2347
2348 /* Calculate the cost of one scalar iteration of the loop. */
2349 int
2350 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2351 {
2352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2353 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2354 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2355 int innerloop_iters, i, stmt_cost;
2356
2357 /* Count statements in scalar loop. Using this as scalar cost for a single
2358 iteration for now.
2359
2360 TODO: Add outer loop support.
2361
2362 TODO: Consider assigning different costs to different scalar
2363 statements. */
2364
2365 /* FORNOW. */
2366 innerloop_iters = 1;
2367 if (loop->inner)
2368 innerloop_iters = 50; /* FIXME */
2369
2370 for (i = 0; i < nbbs; i++)
2371 {
2372 gimple_stmt_iterator si;
2373 basic_block bb = bbs[i];
2374
2375 if (bb->loop_father == loop->inner)
2376 factor = innerloop_iters;
2377 else
2378 factor = 1;
2379
2380 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2381 {
2382 gimple stmt = gsi_stmt (si);
2383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2384
2385 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2386 continue;
2387
2388 /* Skip stmts that are not vectorized inside the loop. */
2389 if (stmt_info
2390 && !STMT_VINFO_RELEVANT_P (stmt_info)
2391 && (!STMT_VINFO_LIVE_P (stmt_info)
2392 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2393 continue;
2394
2395 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2396 {
2397 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2398 stmt_cost = vect_get_cost (scalar_load);
2399 else
2400 stmt_cost = vect_get_cost (scalar_store);
2401 }
2402 else
2403 stmt_cost = vect_get_cost (scalar_stmt);
2404
2405 scalar_single_iter_cost += stmt_cost * factor;
2406 }
2407 }
2408 return scalar_single_iter_cost;
2409 }
2410
2411 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
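/* For example (illustrative numbers only): with VF = 4, 3 prologue
   iterations, a known trip count of 10 and a scalar iteration cost of 2,
   the epilogue gets (10 - 3) % 4 = 3 iterations and the function returns
   (3 + 3) * 2 = 12, with no guard cost since the counts are known at
   compile time.  */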
2412 int
2413 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2414 int *peel_iters_epilogue,
2415 int scalar_single_iter_cost)
2416 {
2417 int peel_guard_costs = 0;
2418 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2419
2420 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2421 {
2422 *peel_iters_epilogue = vf/2;
2423 if (vect_print_dump_info (REPORT_COST))
2424 fprintf (vect_dump, "cost model: "
2425 "epilogue peel iters set to vf/2 because "
2426 "loop iterations are unknown .");
2427
2428 /* If peeled iterations are known but the number of scalar loop
2429 iterations is unknown, count a taken branch per peeled loop. */
2430 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2431 }
2432 else
2433 {
2434 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2435 peel_iters_prologue = niters < peel_iters_prologue ?
2436 niters : peel_iters_prologue;
2437 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2438 /* If we need to peel for gaps, but no peeling is required, we have to
2439 peel VF iterations. */
2440 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2441 *peel_iters_epilogue = vf;
2442 }
2443
2444 return (peel_iters_prologue * scalar_single_iter_cost)
2445 + (*peel_iters_epilogue * scalar_single_iter_cost)
2446 + peel_guard_costs;
2447 }
2448
2449 /* Function vect_estimate_min_profitable_iters
2450
2451 Return the number of iterations required for the vector version of the
2452 loop to be profitable relative to the cost of the scalar version of the
2453 loop.
2454
2455 TODO: Take profile info into account before making vectorization
2456 decisions, if available. */
2457
2458 int
2459 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2460 {
2461 int i;
2462 int min_profitable_iters;
2463 int peel_iters_prologue;
2464 int peel_iters_epilogue;
2465 int vec_inside_cost = 0;
2466 int vec_outside_cost = 0;
2467 int scalar_single_iter_cost = 0;
2468 int scalar_outside_cost = 0;
2469 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2470 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2471 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2472 int nbbs = loop->num_nodes;
2473 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2474 int peel_guard_costs = 0;
2475 int innerloop_iters = 0, factor;
2476 VEC (slp_instance, heap) *slp_instances;
2477 slp_instance instance;
2478
2479 /* Cost model disabled. */
2480 if (!flag_vect_cost_model)
2481 {
2482 if (vect_print_dump_info (REPORT_COST))
2483 fprintf (vect_dump, "cost model disabled.");
2484 return 0;
2485 }
2486
2487 /* Requires loop versioning tests to handle misalignment. */
2488 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2489 {
2490 /* FIXME: Make cost depend on complexity of individual check. */
2491 vec_outside_cost +=
2492 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2493 if (vect_print_dump_info (REPORT_COST))
2494 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2495 "versioning to treat misalignment.\n");
2496 }
2497
2498 /* Requires loop versioning with alias checks. */
2499 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2500 {
2501 /* FIXME: Make cost depend on complexity of individual check. */
2502 vec_outside_cost +=
2503 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2504 if (vect_print_dump_info (REPORT_COST))
2505 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2506 "versioning aliasing.\n");
2507 }
2508
2509 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2510 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2511 vec_outside_cost += vect_get_cost (cond_branch_taken);
2512
2513 /* Count statements in scalar loop. Using this as scalar cost for a single
2514 iteration for now.
2515
2516 TODO: Add outer loop support.
2517
2518 TODO: Consider assigning different costs to different scalar
2519 statements. */
2520
2521 /* FORNOW. */
2522 if (loop->inner)
2523 innerloop_iters = 50; /* FIXME */
2524
2525 for (i = 0; i < nbbs; i++)
2526 {
2527 gimple_stmt_iterator si;
2528 basic_block bb = bbs[i];
2529
2530 if (bb->loop_father == loop->inner)
2531 factor = innerloop_iters;
2532 else
2533 factor = 1;
2534
2535 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2536 {
2537 gimple stmt = gsi_stmt (si);
2538 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2539 /* Skip stmts that are not vectorized inside the loop. */
2540 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2541 && (!STMT_VINFO_LIVE_P (stmt_info)
2542 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2543 continue;
2544 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2545 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2546 some of the "outside" costs are generated inside the outer-loop. */
2547 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2548 }
2549 }
2550
2551 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2552
2553 /* Add additional cost for the peeled instructions in prologue and epilogue
2554 loop.
2555
2556 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2557 at compile time, we assume it's vf/2 (the worst would be vf-1).
2558
2559 TODO: Build an expression that represents peel_iters for prologue and
2560 epilogue to be used in a run-time test. */
2561
2562 if (npeel < 0)
2563 {
2564 peel_iters_prologue = vf/2;
2565 if (vect_print_dump_info (REPORT_COST))
2566 fprintf (vect_dump, "cost model: "
2567 "prologue peel iters set to vf/2.");
2568
2569 /* If peeling for alignment is unknown, loop bound of main loop becomes
2570 unknown. */
2571 peel_iters_epilogue = vf/2;
2572 if (vect_print_dump_info (REPORT_COST))
2573 fprintf (vect_dump, "cost model: "
2574 "epilogue peel iters set to vf/2 because "
2575 "peeling for alignment is unknown .");
2576
2577 /* If peeled iterations are unknown, count a taken branch and a not taken
2578 branch per peeled loop. Even if scalar loop iterations are known,
2579 vector iterations are not known since peeled prologue iterations are
2580 not known. Hence guards remain the same. */
2581 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2582 + vect_get_cost (cond_branch_not_taken));
2583 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2584 + (peel_iters_epilogue * scalar_single_iter_cost)
2585 + peel_guard_costs;
2586 }
2587 else
2588 {
2589 peel_iters_prologue = npeel;
2590 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2591 peel_iters_prologue, &peel_iters_epilogue,
2592 scalar_single_iter_cost);
2593 }
2594
2595 /* FORNOW: The scalar outside cost is incremented in one of the
2596 following ways:
2597
2598 1. The vectorizer checks for alignment and aliasing and generates
2599 a condition that allows dynamic vectorization. A cost model
2600 check is ANDED with the versioning condition. Hence scalar code
2601 path now has the added cost of the versioning check.
2602
2603 if (cost > th & versioning_check)
2604 jmp to vector code
2605
2606 Hence run-time scalar is incremented by not-taken branch cost.
2607
2608 2. The vectorizer then checks if a prologue is required. If the
2609 cost model check was not done before during versioning, it has to
2610 be done before the prologue check.
2611
2612 if (cost <= th)
2613 prologue = scalar_iters
2614 if (prologue == 0)
2615 jmp to vector code
2616 else
2617 execute prologue
2618 if (prologue == num_iters)
2619 go to exit
2620
2621 Hence the run-time scalar cost is incremented by a taken branch,
2622 plus a not-taken branch, plus a taken branch cost.
2623
2624 3. The vectorizer then checks if an epilogue is required. If the
2625 cost model check was not done before during prologue check, it
2626 has to be done with the epilogue check.
2627
2628 if (prologue == 0)
2629 jmp to vector code
2630 else
2631 execute prologue
2632 if (prologue == num_iters)
2633 go to exit
2634 vector code:
2635 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2636 jmp to epilogue
2637
2638 Hence the run-time scalar cost should be incremented by 2 taken
2639 branches.
2640
2641 TODO: The back end may reorder the BBs differently and reverse
2642 conditions/branch directions. Change the estimates below to
2643 something more reasonable. */
2644
2645 /* If the number of iterations is known and we do not do versioning, we can
2646 decide whether to vectorize at compile time. Hence the scalar version
2647 does not carry cost model guard costs. */
2648 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2649 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2650 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2651 {
2652 /* Cost model check occurs at versioning. */
2653 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2654 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2655 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2656 else
2657 {
2658 /* Cost model check occurs at prologue generation. */
2659 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2660 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2661 + vect_get_cost (cond_branch_not_taken);
2662 /* Cost model check occurs at epilogue generation. */
2663 else
2664 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2665 }
2666 }
2667
2668 /* Add SLP costs. */
2669 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2670 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2671 {
2672 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2673 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2674 }
2675
2676 /* Calculate number of iterations required to make the vector version
2677 profitable, relative to the loop bodies only. The following condition
2678 must hold true:
2679 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2680 where
2681 SIC = scalar iteration cost, VIC = vector iteration cost,
2682 VOC = vector outside cost, VF = vectorization factor,
2683 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2684 SOC = scalar outside cost for run time cost model check. */
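
  /* For example (illustrative numbers only): with SIC = 1, VIC = 2, VF = 4,
     VOC = 10, SOC = 0 and no peeling, the condition becomes
     niters > 2 * niters / 4 + 10, i.e. niters > 20, so the computation
     below yields about 20; the result is then forced to be at least VF
     and decremented by one because the runtime guard uses "<=".  */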
2685
2686 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2687 {
2688 if (vec_outside_cost <= 0)
2689 min_profitable_iters = 1;
2690 else
2691 {
2692 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2693 - vec_inside_cost * peel_iters_prologue
2694 - vec_inside_cost * peel_iters_epilogue)
2695 / ((scalar_single_iter_cost * vf)
2696 - vec_inside_cost);
2697
2698 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2699 <= ((vec_inside_cost * min_profitable_iters)
2700 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2701 min_profitable_iters++;
2702 }
2703 }
2704 /* vector version will never be profitable. */
2705 else
2706 {
2707 if (vect_print_dump_info (REPORT_COST))
2708 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2709 "divided by the scalar iteration cost = %d "
2710 "is greater or equal to the vectorization factor = %d.",
2711 vec_inside_cost, scalar_single_iter_cost, vf);
2712 return -1;
2713 }
2714
2715 if (vect_print_dump_info (REPORT_COST))
2716 {
2717 fprintf (vect_dump, "Cost model analysis: \n");
2718 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2719 vec_inside_cost);
2720 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2721 vec_outside_cost);
2722 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2723 scalar_single_iter_cost);
2724 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2725 fprintf (vect_dump, " prologue iterations: %d\n",
2726 peel_iters_prologue);
2727 fprintf (vect_dump, " epilogue iterations: %d\n",
2728 peel_iters_epilogue);
2729 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2730 min_profitable_iters);
2731 }
2732
2733 min_profitable_iters =
2734 min_profitable_iters < vf ? vf : min_profitable_iters;
2735
2736 /* Because the condition we create is:
2737 if (niters <= min_profitable_iters)
2738 then skip the vectorized loop. */
2739 min_profitable_iters--;
2740
2741 if (vect_print_dump_info (REPORT_COST))
2742 fprintf (vect_dump, " Profitability threshold = %d\n",
2743 min_profitable_iters);
2744
2745 return min_profitable_iters;
2746 }
2747
2748
2749 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2750 functions. Design better to avoid maintenance issues. */
2751
2752 /* Function vect_model_reduction_cost.
2753
2754 Models cost for a reduction operation, including the vector ops
2755 generated within the strip-mine loop, the initial definition before
2756 the loop, and the epilogue code that must be generated. */
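
/* For example (a sketch, not a measured cost): reducing a 4-element
   vector sum when no REDUC_* handler is available costs either
   2 * log2(4) = 4 vector shift/add statements plus one element extract
   (the whole-vector-shift scheme) or 4 extracts plus 3 scalar adds
   (the extract scheme); the two branches below model exactly this
   choice.  */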
2757
2758 static bool
2759 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2760 int ncopies)
2761 {
2762 int outer_cost = 0;
2763 enum tree_code code;
2764 optab optab;
2765 tree vectype;
2766 gimple stmt, orig_stmt;
2767 tree reduction_op;
2768 enum machine_mode mode;
2769 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2770 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2771
2772
2773 /* Cost of reduction op inside loop. */
2774 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2775 += ncopies * vect_get_cost (vector_stmt);
2776
2777 stmt = STMT_VINFO_STMT (stmt_info);
2778
2779 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2780 {
2781 case GIMPLE_SINGLE_RHS:
2782 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2783 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2784 break;
2785 case GIMPLE_UNARY_RHS:
2786 reduction_op = gimple_assign_rhs1 (stmt);
2787 break;
2788 case GIMPLE_BINARY_RHS:
2789 reduction_op = gimple_assign_rhs2 (stmt);
2790 break;
2791 case GIMPLE_TERNARY_RHS:
2792 reduction_op = gimple_assign_rhs3 (stmt);
2793 break;
2794 default:
2795 gcc_unreachable ();
2796 }
2797
2798 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2799 if (!vectype)
2800 {
2801 if (vect_print_dump_info (REPORT_COST))
2802 {
2803 fprintf (vect_dump, "unsupported data-type ");
2804 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2805 }
2806 return false;
2807 }
2808
2809 mode = TYPE_MODE (vectype);
2810 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2811
2812 if (!orig_stmt)
2813 orig_stmt = STMT_VINFO_STMT (stmt_info);
2814
2815 code = gimple_assign_rhs_code (orig_stmt);
2816
2817 /* Add in cost for initial definition. */
2818 outer_cost += vect_get_cost (scalar_to_vec);
2819
2820 /* Determine cost of epilogue code.
2821
2822 We have a reduction operator that will reduce the vector in one statement.
2823 Also requires scalar extract. */
2824
2825 if (!nested_in_vect_loop_p (loop, orig_stmt))
2826 {
2827 if (reduc_code != ERROR_MARK)
2828 outer_cost += vect_get_cost (vector_stmt)
2829 + vect_get_cost (vec_to_scalar);
2830 else
2831 {
2832 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2833 tree bitsize =
2834 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2835 int element_bitsize = tree_low_cst (bitsize, 1);
2836 int nelements = vec_size_in_bits / element_bitsize;
2837
2838 optab = optab_for_tree_code (code, vectype, optab_default);
2839
2840 /* We have a whole vector shift available. */
2841 if (VECTOR_MODE_P (mode)
2842 && optab_handler (optab, mode) != CODE_FOR_nothing
2843 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2844 /* Final reduction via vector shifts and the reduction operator. Also
2845 requires scalar extract. */
2846 outer_cost += ((exact_log2 (nelements) * 2)
2847 * vect_get_cost (vector_stmt)
2848 + vect_get_cost (vec_to_scalar));
2849 else
2850 /* Use extracts and reduction op for final reduction. For N elements,
2851 we have N extracts and N-1 reduction ops. */
2852 outer_cost += ((nelements + nelements - 1)
2853 * vect_get_cost (vector_stmt));
2854 }
2855 }
2856
2857 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2858
2859 if (vect_print_dump_info (REPORT_COST))
2860 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2861 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2862 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2863
2864 return true;
2865 }
2866
2867
2868 /* Function vect_model_induction_cost.
2869
2870 Models cost for induction operations. */
2871
2872 static void
2873 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2874 {
2875 /* loop cost for vec_loop. */
2876 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2877 = ncopies * vect_get_cost (vector_stmt);
2878 /* prologue cost for vec_init and vec_step. */
2879 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2880 = 2 * vect_get_cost (scalar_to_vec);
2881
2882 if (vect_print_dump_info (REPORT_COST))
2883 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2884 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2885 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2886 }
2887
2888
2889 /* Function get_initial_def_for_induction
2890
2891 Input:
2892 STMT - a stmt that performs an induction operation in the loop.
2893 IV_PHI - the initial value of the induction variable
2894
2895 Output:
2896 Return a vector variable, initialized with the first VF values of
2897 the induction variable. E.g., for an iv with IV_PHI='X' and
2898 evolution S, for a vector of 4 units, we want to return:
2899 [X, X + S, X + 2*S, X + 3*S]. */
2900
2901 static tree
2902 get_initial_def_for_induction (gimple iv_phi)
2903 {
2904 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2906 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2907 tree scalar_type;
2908 tree vectype;
2909 int nunits;
2910 edge pe = loop_preheader_edge (loop);
2911 struct loop *iv_loop;
2912 basic_block new_bb;
2913 tree vec, vec_init, vec_step, t;
2914 tree access_fn;
2915 tree new_var;
2916 tree new_name;
2917 gimple init_stmt, induction_phi, new_stmt;
2918 tree induc_def, vec_def, vec_dest;
2919 tree init_expr, step_expr;
2920 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2921 int i;
2922 bool ok;
2923 int ncopies;
2924 tree expr;
2925 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2926 bool nested_in_vect_loop = false;
2927 gimple_seq stmts = NULL;
2928 imm_use_iterator imm_iter;
2929 use_operand_p use_p;
2930 gimple exit_phi;
2931 edge latch_e;
2932 tree loop_arg;
2933 gimple_stmt_iterator si;
2934 basic_block bb = gimple_bb (iv_phi);
2935 tree stepvectype;
2936 tree resvectype;
2937
2938 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2939 if (nested_in_vect_loop_p (loop, iv_phi))
2940 {
2941 nested_in_vect_loop = true;
2942 iv_loop = loop->inner;
2943 }
2944 else
2945 iv_loop = loop;
2946 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2947
2948 latch_e = loop_latch_edge (iv_loop);
2949 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2950
2951 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2952 gcc_assert (access_fn);
2953 STRIP_NOPS (access_fn);
2954 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2955 &init_expr, &step_expr);
2956 gcc_assert (ok);
2957 pe = loop_preheader_edge (iv_loop);
2958
2959 scalar_type = TREE_TYPE (init_expr);
2960 vectype = get_vectype_for_scalar_type (scalar_type);
2961 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2962 gcc_assert (vectype);
2963 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2964 ncopies = vf / nunits;
2965
2966 gcc_assert (phi_info);
2967 gcc_assert (ncopies >= 1);
2968
2969 /* Find the first insertion point in the BB. */
2970 si = gsi_after_labels (bb);
2971
2972 /* Create the vector that holds the initial_value of the induction. */
2973 if (nested_in_vect_loop)
2974 {
2975 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2976 been created during vectorization of previous stmts. We obtain it
2977 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2978 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2979 loop_preheader_edge (iv_loop));
2980 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2981 }
2982 else
2983 {
2984 /* iv_loop is the loop to be vectorized. Create:
2985 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2986 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2987 add_referenced_var (new_var);
2988
2989 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2990 if (stmts)
2991 {
2992 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2993 gcc_assert (!new_bb);
2994 }
2995
2996 t = NULL_TREE;
2997 t = tree_cons (NULL_TREE, new_name, t);
2998 for (i = 1; i < nunits; i++)
2999 {
3000 /* Create: new_name_i = new_name + step_expr */
3001 enum tree_code code = POINTER_TYPE_P (scalar_type)
3002 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3003 init_stmt = gimple_build_assign_with_ops (code, new_var,
3004 new_name, step_expr);
3005 new_name = make_ssa_name (new_var, init_stmt);
3006 gimple_assign_set_lhs (init_stmt, new_name);
3007
3008 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3009 gcc_assert (!new_bb);
3010
3011 if (vect_print_dump_info (REPORT_DETAILS))
3012 {
3013 fprintf (vect_dump, "created new init_stmt: ");
3014 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
3015 }
3016 t = tree_cons (NULL_TREE, new_name, t);
3017 }
3018 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3019 vec = build_constructor_from_list (vectype, nreverse (t));
3020 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3021 }
3022
3023
3024 /* Create the vector that holds the step of the induction. */
3025 if (nested_in_vect_loop)
3026 /* iv_loop is nested in the loop to be vectorized. Generate:
3027 vec_step = [S, S, S, S] */
3028 new_name = step_expr;
3029 else
3030 {
3031 /* iv_loop is the loop to be vectorized. Generate:
3032 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3033 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3034 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3035 expr, step_expr);
3036 }
3037
3038 t = unshare_expr (new_name);
3039 gcc_assert (CONSTANT_CLASS_P (new_name));
3040 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3041 gcc_assert (stepvectype);
3042 vec = build_vector_from_val (stepvectype, t);
3043 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3044
3045
3046 /* Create the following def-use cycle:
3047 loop prolog:
3048 vec_init = ...
3049 vec_step = ...
3050 loop:
3051 vec_iv = PHI <vec_init, vec_loop>
3052 ...
3053 STMT
3054 ...
3055 vec_loop = vec_iv + vec_step; */
3056
3057 /* Create the induction-phi that defines the induction-operand. */
3058 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3059 add_referenced_var (vec_dest);
3060 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3061 set_vinfo_for_stmt (induction_phi,
3062 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3063 induc_def = PHI_RESULT (induction_phi);
3064
3065 /* Create the iv update inside the loop */
3066 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3067 induc_def, vec_step);
3068 vec_def = make_ssa_name (vec_dest, new_stmt);
3069 gimple_assign_set_lhs (new_stmt, vec_def);
3070 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3071 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3072 NULL));
3073
3074 /* Set the arguments of the phi node: */
3075 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3076 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3077 UNKNOWN_LOCATION);
3078
3079
3080 /* In case the vectorization factor (VF) is bigger than the number
3081 of elements that we can fit in a vectype (nunits), we have to generate
3082 more than one vector stmt, i.e., we need to "unroll" the
3083 vector stmt by a factor VF/nunits. For more details see documentation
3084 in vectorizable_operation. */
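
  /* E.g. (an illustrative sketch): with VF = 8 and a 4-element vector
     type, ncopies = 2; vec_init is [X, X+S, X+2*S, X+3*S], the loop-carried
     step created above is [8*S, 8*S, 8*S, 8*S], and the second copy built
     below is the first plus [4*S, 4*S, 4*S, 4*S], i.e.
     [X+4*S, X+5*S, X+6*S, X+7*S].  */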
3085
3086 if (ncopies > 1)
3087 {
3088 stmt_vec_info prev_stmt_vinfo;
3089 /* FORNOW. This restriction should be relaxed. */
3090 gcc_assert (!nested_in_vect_loop);
3091
3092 /* Create the vector that holds the step of the induction. */
3093 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3094 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3095 expr, step_expr);
3096 t = unshare_expr (new_name);
3097 gcc_assert (CONSTANT_CLASS_P (new_name));
3098 vec = build_vector_from_val (stepvectype, t);
3099 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3100
3101 vec_def = induc_def;
3102 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3103 for (i = 1; i < ncopies; i++)
3104 {
3105 /* vec_i = vec_prev + vec_step */
3106 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3107 vec_def, vec_step);
3108 vec_def = make_ssa_name (vec_dest, new_stmt);
3109 gimple_assign_set_lhs (new_stmt, vec_def);
3110
3111 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3112 if (!useless_type_conversion_p (resvectype, vectype))
3113 {
3114 new_stmt = gimple_build_assign_with_ops
3115 (VIEW_CONVERT_EXPR,
3116 vect_get_new_vect_var (resvectype, vect_simple_var,
3117 "vec_iv_"),
3118 build1 (VIEW_CONVERT_EXPR, resvectype,
3119 gimple_assign_lhs (new_stmt)), NULL_TREE);
3120 gimple_assign_set_lhs (new_stmt,
3121 make_ssa_name
3122 (gimple_assign_lhs (new_stmt), new_stmt));
3123 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3124 }
3125 set_vinfo_for_stmt (new_stmt,
3126 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3127 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3128 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3129 }
3130 }
3131
3132 if (nested_in_vect_loop)
3133 {
3134 /* Find the loop-closed exit-phi of the induction, and record
3135 the final vector of induction results: */
3136 exit_phi = NULL;
3137 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3138 {
3139 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3140 {
3141 exit_phi = USE_STMT (use_p);
3142 break;
3143 }
3144 }
3145 if (exit_phi)
3146 {
3147 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3148 /* FORNOW. Currently not supporting the case that an inner-loop induction
3149 is not used in the outer-loop (i.e. it is used only outside the outer-loop). */
3150 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3151 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3152
3153 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3154 if (vect_print_dump_info (REPORT_DETAILS))
3155 {
3156 fprintf (vect_dump, "vector of inductions after inner-loop:");
3157 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3158 }
3159 }
3160 }
3161
3162
3163 if (vect_print_dump_info (REPORT_DETAILS))
3164 {
3165 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3166 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3167 fprintf (vect_dump, "\n");
3168 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3169 }
3170
3171 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3172 if (!useless_type_conversion_p (resvectype, vectype))
3173 {
3174 new_stmt = gimple_build_assign_with_ops
3175 (VIEW_CONVERT_EXPR,
3176 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3177 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3178 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3179 gimple_assign_set_lhs (new_stmt, induc_def);
3180 si = gsi_start_bb (bb);
3181 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3182 set_vinfo_for_stmt (new_stmt,
3183 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3184 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3185 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3186 }
3187
3188 return induc_def;
3189 }
3190
3191
3192 /* Function get_initial_def_for_reduction
3193
3194 Input:
3195 STMT - a stmt that performs a reduction operation in the loop.
3196 INIT_VAL - the initial value of the reduction variable
3197
3198 Output:
3199 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3200 of the reduction (used for adjusting the epilog - see below).
3201 Return a vector variable, initialized according to the operation that STMT
3202 performs. This vector will be used as the initial value of the
3203 vector of partial results.
3204
3205 Option1 (adjust in epilog): Initialize the vector as follows:
3206 add/bit or/xor: [0,0,...,0,0]
3207 mult/bit and: [1,1,...,1,1]
3208 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3209 and when necessary (e.g. add/mult case) let the caller know
3210 that it needs to adjust the result by init_val.
3211
3212 Option2: Initialize the vector as follows:
3213 add/bit or/xor: [init_val,0,0,...,0]
3214 mult/bit and: [init_val,1,1,...,1]
3215 min/max/cond_expr: [init_val,init_val,...,init_val]
3216 and no adjustments are needed.
3217
3218 For example, for the following code:
3219
3220 s = init_val;
3221 for (i=0;i<n;i++)
3222 s = s + a[i];
3223
3224 STMT is 's = s + a[i]', and the reduction variable is 's'.
3225 For a vector of 4 units, we want to return either [0,0,0,init_val],
3226 or [0,0,0,0] and let the caller know that it needs to adjust
3227 the result at the end by 'init_val'.
3228
3229 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3230 is not NULL, because this way the initialization vector is simpler (same
3231 element in all entries), and Option2 otherwise.
3232
3233 A cost model should help decide between these two schemes. */
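
/* A small concrete instance (illustrative only): for
     s = 3;  for (i = 0; i < n; i++)  s += a[i];
   with a 4-element vector, Option1 returns [0,0,0,0] and sets
   ADJUSTMENT_DEF to 3 (added back after the epilogue reduction), while
   Option2 returns [3,0,0,0] and needs no adjustment.  */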
3234
3235 tree
3236 get_initial_def_for_reduction (gimple stmt, tree init_val,
3237 tree *adjustment_def)
3238 {
3239 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3240 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3242 tree scalar_type = TREE_TYPE (init_val);
3243 tree vectype = get_vectype_for_scalar_type (scalar_type);
3244 int nunits;
3245 enum tree_code code = gimple_assign_rhs_code (stmt);
3246 tree def_for_init;
3247 tree init_def;
3248 tree t = NULL_TREE;
3249 int i;
3250 bool nested_in_vect_loop = false;
3251 tree init_value;
3252 REAL_VALUE_TYPE real_init_val = dconst0;
3253 int int_init_val = 0;
3254 gimple def_stmt = NULL;
3255
3256 gcc_assert (vectype);
3257 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3258
3259 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3260 || SCALAR_FLOAT_TYPE_P (scalar_type));
3261
3262 if (nested_in_vect_loop_p (loop, stmt))
3263 nested_in_vect_loop = true;
3264 else
3265 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3266
3267 /* In case of double reduction we only create a vector variable to be put
3268 in the reduction phi node. The actual statement creation is done in
3269 vect_create_epilog_for_reduction. */
3270 if (adjustment_def && nested_in_vect_loop
3271 && TREE_CODE (init_val) == SSA_NAME
3272 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3273 && gimple_code (def_stmt) == GIMPLE_PHI
3274 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3275 && vinfo_for_stmt (def_stmt)
3276 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3277 == vect_double_reduction_def)
3278 {
3279 *adjustment_def = NULL;
3280 return vect_create_destination_var (init_val, vectype);
3281 }
3282
3283 if (TREE_CONSTANT (init_val))
3284 {
3285 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3286 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3287 else
3288 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3289 }
3290 else
3291 init_value = init_val;
3292
3293 switch (code)
3294 {
3295 case WIDEN_SUM_EXPR:
3296 case DOT_PROD_EXPR:
3297 case PLUS_EXPR:
3298 case MINUS_EXPR:
3299 case BIT_IOR_EXPR:
3300 case BIT_XOR_EXPR:
3301 case MULT_EXPR:
3302 case BIT_AND_EXPR:
3303 /* ADJUSTMENT_DEF is NULL when called from
3304 vect_create_epilog_for_reduction to vectorize double reduction. */
3305 if (adjustment_def)
3306 {
3307 if (nested_in_vect_loop)
3308 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3309 NULL);
3310 else
3311 *adjustment_def = init_val;
3312 }
3313
3314 if (code == MULT_EXPR)
3315 {
3316 real_init_val = dconst1;
3317 int_init_val = 1;
3318 }
3319
3320 if (code == BIT_AND_EXPR)
3321 int_init_val = -1;
3322
3323 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3324 def_for_init = build_real (scalar_type, real_init_val);
3325 else
3326 def_for_init = build_int_cst (scalar_type, int_init_val);
3327
3328 /* Create a vector of '0' or '1' in all elements except the first. */
3329 for (i = nunits - 2; i >= 0; --i)
3330 t = tree_cons (NULL_TREE, def_for_init, t);
3331
3332 /* Option1: the first element is '0' or '1' as well. */
3333 if (adjustment_def)
3334 {
3335 t = tree_cons (NULL_TREE, def_for_init, t);
3336 init_def = build_vector (vectype, t);
3337 break;
3338 }
3339
3340 /* Option2: the first element is INIT_VAL. */
3341 t = tree_cons (NULL_TREE, init_value, t);
3342 if (TREE_CONSTANT (init_val))
3343 init_def = build_vector (vectype, t);
3344 else
3345 init_def = build_constructor_from_list (vectype, t);
3346
3347 break;
3348
3349 case MIN_EXPR:
3350 case MAX_EXPR:
3351 case COND_EXPR:
3352 if (adjustment_def)
3353 {
3354 *adjustment_def = NULL_TREE;
3355 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3356 break;
3357 }
3358
3359 init_def = build_vector_from_val (vectype, init_value);
3360 break;
3361
3362 default:
3363 gcc_unreachable ();
3364 }
3365
3366 return init_def;
3367 }
3368
3369
3370 /* Function vect_create_epilog_for_reduction
3371
3372 Create code at the loop-epilog to finalize the result of a reduction
3373 computation.
3374
3375 VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
3376 reduction statements.
3377 STMT is the scalar reduction stmt that is being vectorized.
3378 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3379 number of elements that we can fit in a vectype (nunits). In this case
3380 we have to generate more than one vector stmt - i.e - we need to "unroll"
3381 the vector stmt by a factor VF/nunits. For more details see documentation
3382 in vectorizable_operation.
3383 REDUC_CODE is the tree-code for the epilog reduction.
3384 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3385 computation.
3386 REDUC_INDEX is the index of the operand in the right hand side of the
3387 statement that is defined by REDUCTION_PHI.
3388 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3389 SLP_NODE is an SLP node containing a group of reduction statements. The
3390 first one in this group is STMT.
3391
3392 This function:
3393 1. Creates the reduction def-use cycles: sets the arguments for
3394 REDUCTION_PHIS:
3395 The loop-entry argument is the vectorized initial-value of the reduction.
3396 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3397 sums.
3398 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3399 by applying the operation specified by REDUC_CODE if available, or by
3400 other means (whole-vector shifts or a scalar loop).
3401 The function also creates a new phi node at the loop exit to preserve
3402 loop-closed form, as illustrated below.
3403
3404 The flow at the entry to this function:
3405
3406 loop:
3407 vec_def = phi <null, null> # REDUCTION_PHI
3408 VECT_DEF = vector_stmt # vectorized form of STMT
3409 s_loop = scalar_stmt # (scalar) STMT
3410 loop_exit:
3411 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3412 use <s_out0>
3413 use <s_out0>
3414
3415 The above is transformed by this function into:
3416
3417 loop:
3418 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3419 VECT_DEF = vector_stmt # vectorized form of STMT
3420 s_loop = scalar_stmt # (scalar) STMT
3421 loop_exit:
3422 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3423 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3424 v_out2 = reduce <v_out1>
3425 s_out3 = extract_field <v_out2, 0>
3426 s_out4 = adjust_result <s_out3>
3427 use <s_out4>
3428 use <s_out4>
3429 */
3430
3431 static void
3432 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3433 int ncopies, enum tree_code reduc_code,
3434 VEC (gimple, heap) *reduction_phis,
3435 int reduc_index, bool double_reduc,
3436 slp_tree slp_node)
3437 {
3438 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3439 stmt_vec_info prev_phi_info;
3440 tree vectype;
3441 enum machine_mode mode;
3442 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3443 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3444 basic_block exit_bb;
3445 tree scalar_dest;
3446 tree scalar_type;
3447 gimple new_phi = NULL, phi;
3448 gimple_stmt_iterator exit_gsi;
3449 tree vec_dest;
3450 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3451 gimple epilog_stmt = NULL;
3452 enum tree_code code = gimple_assign_rhs_code (stmt);
3453 gimple exit_phi;
3454 tree bitsize, bitpos;
3455 tree adjustment_def = NULL;
3456 tree vec_initial_def = NULL;
3457 tree reduction_op, expr, def;
3458 tree orig_name, scalar_result;
3459 imm_use_iterator imm_iter, phi_imm_iter;
3460 use_operand_p use_p, phi_use_p;
3461 bool extract_scalar_result = false;
3462 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3463 bool nested_in_vect_loop = false;
3464 VEC (gimple, heap) *new_phis = NULL;
3465 VEC (gimple, heap) *inner_phis = NULL;
3466 enum vect_def_type dt = vect_unknown_def_type;
3467 int j, i;
3468 VEC (tree, heap) *scalar_results = NULL;
3469 unsigned int group_size = 1, k, ratio;
3470 VEC (tree, heap) *vec_initial_defs = NULL;
3471 VEC (gimple, heap) *phis;
3472 bool slp_reduc = false;
3473 tree new_phi_result;
3474 gimple inner_phi = NULL;
3475
3476 if (slp_node)
3477 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3478
3479 if (nested_in_vect_loop_p (loop, stmt))
3480 {
3481 outer_loop = loop;
3482 loop = loop->inner;
3483 nested_in_vect_loop = true;
3484 gcc_assert (!slp_node);
3485 }
3486
3487 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3488 {
3489 case GIMPLE_SINGLE_RHS:
3490 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3491 == ternary_op);
3492 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3493 break;
3494 case GIMPLE_UNARY_RHS:
3495 reduction_op = gimple_assign_rhs1 (stmt);
3496 break;
3497 case GIMPLE_BINARY_RHS:
3498 reduction_op = reduc_index ?
3499 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3500 break;
3501 case GIMPLE_TERNARY_RHS:
3502 reduction_op = gimple_op (stmt, reduc_index + 1);
3503 break;
3504 default:
3505 gcc_unreachable ();
3506 }
3507
3508 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3509 gcc_assert (vectype);
3510 mode = TYPE_MODE (vectype);
3511
3512 /* 1. Create the reduction def-use cycle:
3513 Set the arguments of REDUCTION_PHIS, i.e., transform
3514
3515 loop:
3516 vec_def = phi <null, null> # REDUCTION_PHI
3517 VECT_DEF = vector_stmt # vectorized form of STMT
3518 ...
3519
3520 into:
3521
3522 loop:
3523 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3524 VECT_DEF = vector_stmt # vectorized form of STMT
3525 ...
3526
3527 (in case of SLP, do it for all the phis). */
3528
3529 /* Get the loop-entry arguments. */
3530 if (slp_node)
3531 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3532 NULL, slp_node, reduc_index);
3533 else
3534 {
3535 vec_initial_defs = VEC_alloc (tree, heap, 1);
3536 /* For the case of reduction, vect_get_vec_def_for_operand returns
3537         the scalar def before the loop that defines the initial value
3538 of the reduction variable. */
3539 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3540 &adjustment_def);
3541 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3542 }
3543
3544 /* Set phi nodes arguments. */
3545 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3546 {
3547 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3548 tree def = VEC_index (tree, vect_defs, i);
3549 for (j = 0; j < ncopies; j++)
3550 {
3551 /* Set the loop-entry arg of the reduction-phi. */
3552 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3553 UNKNOWN_LOCATION);
3554
3555 /* Set the loop-latch arg for the reduction-phi. */
3556 if (j > 0)
3557 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3558
3559 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3560
3561 if (vect_print_dump_info (REPORT_DETAILS))
3562 {
3563 fprintf (vect_dump, "transform reduction: created def-use"
3564 " cycle: ");
3565 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3566 fprintf (vect_dump, "\n");
3567 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3568 TDF_SLIM);
3569 }
3570
3571 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3572 }
3573 }
3574
3575 VEC_free (tree, heap, vec_initial_defs);
3576
3577 /* 2. Create epilog code.
3578 The reduction epilog code operates across the elements of the vector
3579 of partial results computed by the vectorized loop.
3580 The reduction epilog code consists of:
3581
3582 step 1: compute the scalar result in a vector (v_out2)
3583 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3584 step 3: adjust the scalar result (s_out3) if needed.
3585
3586      Step 1 can be accomplished using one of the following three schemes:
3587 (scheme 1) using reduc_code, if available.
3588 (scheme 2) using whole-vector shifts, if available.
3589 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3590 combined.
3591
3592 The overall epilog code looks like this:
3593
3594 s_out0 = phi <s_loop> # original EXIT_PHI
3595 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3596 v_out2 = reduce <v_out1> # step 1
3597 s_out3 = extract_field <v_out2, 0> # step 2
3598 s_out4 = adjust_result <s_out3> # step 3
3599
3600 (step 3 is optional, and steps 1 and 2 may be combined).
3601 Lastly, the uses of s_out0 are replaced by s_out4. */
3602
3603
3604 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3605 v_out1 = phi <VECT_DEF>
3606 Store them in NEW_PHIS. */
3607
3608 exit_bb = single_exit (loop)->dest;
3609 prev_phi_info = NULL;
3610 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3611 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3612 {
3613 for (j = 0; j < ncopies; j++)
3614 {
3615 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3616 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3617 if (j == 0)
3618 VEC_quick_push (gimple, new_phis, phi);
3619 else
3620 {
3621 def = vect_get_vec_def_for_stmt_copy (dt, def);
3622 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3623 }
3624
3625 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3626 prev_phi_info = vinfo_for_stmt (phi);
3627 }
3628 }
3629
3630 /* The epilogue is created for the outer-loop, i.e., for the loop being
3631 vectorized. Create exit phis for the outer loop. */
3632 if (double_reduc)
3633 {
3634 loop = outer_loop;
3635 exit_bb = single_exit (loop)->dest;
3636 inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3637 FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
3638 {
3639 gimple outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3640 exit_bb);
3641 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3642 PHI_RESULT (phi));
3643 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3644 loop_vinfo, NULL));
3645 VEC_quick_push (gimple, inner_phis, phi);
3646 VEC_replace (gimple, new_phis, i, outer_phi);
3647 prev_phi_info = vinfo_for_stmt (outer_phi);
3648 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3649 {
3650 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3651 outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3652 exit_bb);
3653 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3654 PHI_RESULT (phi));
3655 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3656 loop_vinfo, NULL));
3657 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3658 prev_phi_info = vinfo_for_stmt (outer_phi);
3659 }
3660 }
3661 }
3662
3663 exit_gsi = gsi_after_labels (exit_bb);
3664
3665 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3666 (i.e. when reduc_code is not available) and in the final adjustment
3667 code (if needed). Also get the original scalar reduction variable as
3668 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3669 represents a reduction pattern), the tree-code and scalar-def are
3670 taken from the original stmt that the pattern-stmt (STMT) replaces.
3671 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3672 are taken from STMT. */
3673
3674 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3675 if (!orig_stmt)
3676 {
3677 /* Regular reduction */
3678 orig_stmt = stmt;
3679 }
3680 else
3681 {
3682 /* Reduction pattern */
3683 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3684 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3685 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3686 }
3687
3688 code = gimple_assign_rhs_code (orig_stmt);
3689 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3690 partial results are added and not subtracted. */
3691 if (code == MINUS_EXPR)
3692 code = PLUS_EXPR;
3693
3694 scalar_dest = gimple_assign_lhs (orig_stmt);
3695 scalar_type = TREE_TYPE (scalar_dest);
3696 scalar_results = VEC_alloc (tree, heap, group_size);
3697 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3698 bitsize = TYPE_SIZE (scalar_type);
3699
3700 /* In case this is a reduction in an inner-loop while vectorizing an outer
3701 loop - we don't need to extract a single scalar result at the end of the
3702 inner-loop (unless it is double reduction, i.e., the use of reduction is
3703 outside the outer-loop). The final vector of partial results will be used
3704 in the vectorized outer-loop, or reduced to a scalar result at the end of
3705 the outer-loop. */
3706 if (nested_in_vect_loop && !double_reduc)
3707 goto vect_finalize_reduction;
3708
3709 /* SLP reduction without reduction chain, e.g.,
3710 # a1 = phi <a2, a0>
3711 # b1 = phi <b2, b0>
3712 a2 = operation (a1)
3713 b2 = operation (b1) */
3714 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3715
3716 /* In case of reduction chain, e.g.,
3717 # a1 = phi <a3, a0>
3718 a2 = operation (a1)
3719 a3 = operation (a2),
3720
3721 we may end up with more than one vector result. Here we reduce them to
3722 one vector. */
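  /* Illustration (a sketch, assuming two partial-result vectors of four ints
     and a PLUS reduction): the code below first combines them lane-wise,

       int v0[4], v1[4], combined[4];
       for (int k = 0; k < 4; k++)
         combined[k] = v0[k] + v1[k];

     and only the combined vector is handed to the epilog reduction.  */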
3723 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3724 {
3725 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3726 tree tmp;
3727 gimple new_vec_stmt = NULL;
3728
3729 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3730 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3731 {
3732 gimple next_phi = VEC_index (gimple, new_phis, k);
3733 tree second_vect = PHI_RESULT (next_phi);
3734
3735 tmp = build2 (code, vectype, first_vect, second_vect);
3736 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3737 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3738 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3739 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3740 }
3741
3742 new_phi_result = first_vect;
3743 if (new_vec_stmt)
3744 {
3745 VEC_truncate (gimple, new_phis, 0);
3746 VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3747 }
3748 }
3749 else
3750 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3751
3752 /* 2.3 Create the reduction code, using one of the three schemes described
3753 above. In SLP we simply need to extract all the elements from the
3754 vector (without reducing them), so we use scalar shifts. */
3755 if (reduc_code != ERROR_MARK && !slp_reduc)
3756 {
3757 tree tmp;
3758
3759 /*** Case 1: Create:
3760 v_out2 = reduc_expr <v_out1> */
3761
3762 if (vect_print_dump_info (REPORT_DETAILS))
3763 fprintf (vect_dump, "Reduce using direct vector reduction.");
3764
3765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3766 tmp = build1 (reduc_code, vectype, new_phi_result);
3767 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3768 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3769 gimple_assign_set_lhs (epilog_stmt, new_temp);
3770 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3771
3772 extract_scalar_result = true;
3773 }
3774 else
3775 {
3776 enum tree_code shift_code = ERROR_MARK;
3777 bool have_whole_vector_shift = true;
3778 int bit_offset;
3779 int element_bitsize = tree_low_cst (bitsize, 1);
3780 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3781 tree vec_temp;
3782
3783 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3784 shift_code = VEC_RSHIFT_EXPR;
3785 else
3786 have_whole_vector_shift = false;
3787
3788 /* Regardless of whether we have a whole vector shift, if we're
3789 emulating the operation via tree-vect-generic, we don't want
3790 to use it. Only the first round of the reduction is likely
3791 to still be profitable via emulation. */
3792 /* ??? It might be better to emit a reduction tree code here, so that
3793 tree-vect-generic can expand the first round via bit tricks. */
3794 if (!VECTOR_MODE_P (mode))
3795 have_whole_vector_shift = false;
3796 else
3797 {
3798 optab optab = optab_for_tree_code (code, vectype, optab_default);
3799 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3800 have_whole_vector_shift = false;
3801 }
3802
3803 if (have_whole_vector_shift && !slp_reduc)
3804 {
3805 /*** Case 2: Create:
3806 for (offset = VS/2; offset >= element_size; offset/=2)
3807 {
3808 Create: va' = vec_shift <va, offset>
3809 Create: va = vop <va, va'>
3810 } */
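          /* Hedged illustration of the shift scheme in scalar terms (assuming
             an 8-element vector and a PLUS reduction; zero lanes are shifted
             in, so out-of-range lanes can simply be skipped):

               int va[8];                          // partial sums
               for (int off = 4; off >= 1; off /= 2)
                 for (int k = 0; k + off < 8; k++)
                   va[k] += va[k + off];
               // va[0] now holds the sum of all eight lanes

             i.e. log2(8) = 3 shift/add pairs instead of 7 scalar adds.  */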
3811
3812 if (vect_print_dump_info (REPORT_DETAILS))
3813 fprintf (vect_dump, "Reduce using vector shifts");
3814
3815 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3816 new_temp = new_phi_result;
3817 for (bit_offset = vec_size_in_bits/2;
3818 bit_offset >= element_bitsize;
3819 bit_offset /= 2)
3820 {
3821 tree bitpos = size_int (bit_offset);
3822
3823 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3824 vec_dest, new_temp, bitpos);
3825 new_name = make_ssa_name (vec_dest, epilog_stmt);
3826 gimple_assign_set_lhs (epilog_stmt, new_name);
3827 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3828
3829 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3830 new_name, new_temp);
3831 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3832 gimple_assign_set_lhs (epilog_stmt, new_temp);
3833 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3834 }
3835
3836 extract_scalar_result = true;
3837 }
3838 else
3839 {
3840 tree rhs;
3841
3842 /*** Case 3: Create:
3843 s = extract_field <v_out2, 0>
3844 for (offset = element_size;
3845 offset < vector_size;
3846 offset += element_size;)
3847 {
3848 Create: s' = extract_field <v_out2, offset>
3849 Create: s = op <s, s'> // For non SLP cases
3850 } */
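          /* Hedged scalar illustration (one partial-result vector of four
             ints, PLUS reduction, non-SLP case):

               int v[4];                  // the vector of partial sums
               int s = v[0];              // s = extract_field <v_out2, 0>
               for (int off = 1; off < 4; off++)
                 s = s + v[off];          // s = op <s, s'>

             In the SLP case the additions are skipped and the extracted
             elements are only collected in SCALAR_RESULTS.  */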
3851
3852 if (vect_print_dump_info (REPORT_DETAILS))
3853 fprintf (vect_dump, "Reduce using scalar code. ");
3854
3855 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3856 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3857 {
3858 if (gimple_code (new_phi) == GIMPLE_PHI)
3859 vec_temp = PHI_RESULT (new_phi);
3860 else
3861 vec_temp = gimple_assign_lhs (new_phi);
3862 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3863 bitsize_zero_node);
3864 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3865 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3866 gimple_assign_set_lhs (epilog_stmt, new_temp);
3867 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3868
3869 /* In SLP we don't need to apply reduction operation, so we just
3870 collect s' values in SCALAR_RESULTS. */
3871 if (slp_reduc)
3872 VEC_safe_push (tree, heap, scalar_results, new_temp);
3873
3874 for (bit_offset = element_bitsize;
3875 bit_offset < vec_size_in_bits;
3876 bit_offset += element_bitsize)
3877 {
3878 tree bitpos = bitsize_int (bit_offset);
3879 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3880 bitsize, bitpos);
3881
3882 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3883 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3884 gimple_assign_set_lhs (epilog_stmt, new_name);
3885 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3886
3887 if (slp_reduc)
3888 {
3889 /* In SLP we don't need to apply reduction operation, so
3890 we just collect s' values in SCALAR_RESULTS. */
3891 new_temp = new_name;
3892 VEC_safe_push (tree, heap, scalar_results, new_name);
3893 }
3894 else
3895 {
3896 epilog_stmt = gimple_build_assign_with_ops (code,
3897 new_scalar_dest, new_name, new_temp);
3898 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3899 gimple_assign_set_lhs (epilog_stmt, new_temp);
3900 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3901 }
3902 }
3903 }
3904
3905           /* The only case where we need to reduce scalar results in SLP is
3906              unrolling. If the size of SCALAR_RESULTS is greater than
3907              GROUP_SIZE, we reduce them by combining elements modulo
3908              GROUP_SIZE. */
3909 if (slp_reduc)
3910 {
3911 tree res, first_res, new_res;
3912 gimple new_stmt;
3913
3914 /* Reduce multiple scalar results in case of SLP unrolling. */
3915 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3916 j++)
3917 {
3918 first_res = VEC_index (tree, scalar_results, j % group_size);
3919 new_stmt = gimple_build_assign_with_ops (code,
3920 new_scalar_dest, first_res, res);
3921 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3922 gimple_assign_set_lhs (new_stmt, new_res);
3923 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3924 VEC_replace (tree, scalar_results, j % group_size, new_res);
3925 }
3926 }
3927 else
3928 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3929 VEC_safe_push (tree, heap, scalar_results, new_temp);
3930
3931 extract_scalar_result = false;
3932 }
3933 }
3934
3935 /* 2.4 Extract the final scalar result. Create:
3936 s_out3 = extract_field <v_out2, bitpos> */
3937
3938 if (extract_scalar_result)
3939 {
3940 tree rhs;
3941
3942 if (vect_print_dump_info (REPORT_DETAILS))
3943 fprintf (vect_dump, "extract scalar result");
3944
3945 if (BYTES_BIG_ENDIAN)
3946 bitpos = size_binop (MULT_EXPR,
3947 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3948 TYPE_SIZE (scalar_type));
3949 else
3950 bitpos = bitsize_zero_node;
3951
3952 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3953 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3954 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3955 gimple_assign_set_lhs (epilog_stmt, new_temp);
3956 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3957 VEC_safe_push (tree, heap, scalar_results, new_temp);
3958 }
3959
3960 vect_finalize_reduction:
3961
3962 if (double_reduc)
3963 loop = loop->inner;
3964
3965 /* 2.5 Adjust the final result by the initial value of the reduction
3966 variable. (When such adjustment is not needed, then
3967 'adjustment_def' is zero). For example, if code is PLUS we create:
3968 new_temp = loop_exit_def + adjustment_def */
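  /* For illustration (a sketch, not taken from this function): if the scalar
     source was

       int sum = 5;
       for (i = 0; i < N; i++)
         sum += a[i];

     and the vector loop was started from an all-zero accumulator, then
     ADJUSTMENT_DEF is 5 and the epilog computes  sum = reduced_lanes + 5.  */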
3969
3970 if (adjustment_def)
3971 {
3972 gcc_assert (!slp_reduc);
3973 if (nested_in_vect_loop)
3974 {
3975 new_phi = VEC_index (gimple, new_phis, 0);
3976 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3977 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3978 new_dest = vect_create_destination_var (scalar_dest, vectype);
3979 }
3980 else
3981 {
3982 new_temp = VEC_index (tree, scalar_results, 0);
3983 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3984 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3985 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3986 }
3987
3988 epilog_stmt = gimple_build_assign (new_dest, expr);
3989 new_temp = make_ssa_name (new_dest, epilog_stmt);
3990 gimple_assign_set_lhs (epilog_stmt, new_temp);
3991 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3992 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3993 if (nested_in_vect_loop)
3994 {
3995 set_vinfo_for_stmt (epilog_stmt,
3996 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3997 NULL));
3998 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3999 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4000
4001 if (!double_reduc)
4002 VEC_quick_push (tree, scalar_results, new_temp);
4003 else
4004 VEC_replace (tree, scalar_results, 0, new_temp);
4005 }
4006 else
4007 VEC_replace (tree, scalar_results, 0, new_temp);
4008
4009 VEC_replace (gimple, new_phis, 0, epilog_stmt);
4010 }
4011
4012 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4013 phis with new adjusted scalar results, i.e., replace use <s_out0>
4014 with use <s_out4>.
4015
4016 Transform:
4017 loop_exit:
4018 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4019 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4020 v_out2 = reduce <v_out1>
4021 s_out3 = extract_field <v_out2, 0>
4022 s_out4 = adjust_result <s_out3>
4023 use <s_out0>
4024 use <s_out0>
4025
4026 into:
4027
4028 loop_exit:
4029 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4030 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4031 v_out2 = reduce <v_out1>
4032 s_out3 = extract_field <v_out2, 0>
4033 s_out4 = adjust_result <s_out3>
4034 use <s_out4>
4035 use <s_out4> */
4036
4037
4038   /* In an SLP reduction chain we reduce vector results into one vector if
4039      necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4040 the last stmt in the reduction chain, since we are looking for the loop
4041 exit phi node. */
4042 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4043 {
4044 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4045 SLP_TREE_SCALAR_STMTS (slp_node),
4046 group_size - 1));
4047 group_size = 1;
4048 }
4049
4050 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4051      case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4052 need to match SCALAR_RESULTS with corresponding statements. The first
4053 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4054 the first vector stmt, etc.
4055 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
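  /* Worked example (illustrative only): with GROUP_SIZE = 4 and two new
     vector stmts, RATIO = 2, so scalar results 0 and 1 are matched with the
     first vector stmt and scalar results 2 and 3 with the second.  */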
4056 if (group_size > VEC_length (gimple, new_phis))
4057 {
4058 ratio = group_size / VEC_length (gimple, new_phis);
4059 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4060 }
4061 else
4062 ratio = 1;
4063
4064 for (k = 0; k < group_size; k++)
4065 {
4066 if (k % ratio == 0)
4067 {
4068 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4069 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4070 if (double_reduc)
4071 inner_phi = VEC_index (gimple, inner_phis, k / ratio);
4072 }
4073
4074 if (slp_reduc)
4075 {
4076 gimple current_stmt = VEC_index (gimple,
4077 SLP_TREE_SCALAR_STMTS (slp_node), k);
4078
4079 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4080 /* SLP statements can't participate in patterns. */
4081 gcc_assert (!orig_stmt);
4082 scalar_dest = gimple_assign_lhs (current_stmt);
4083 }
4084
4085 phis = VEC_alloc (gimple, heap, 3);
4086 /* Find the loop-closed-use at the loop exit of the original scalar
4087 result. (The reduction result is expected to have two immediate uses -
4088 one at the latch block, and one at the loop exit). */
4089 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4090 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4091 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4092
4093 /* We expect to have found an exit_phi because of loop-closed-ssa
4094 form. */
4095 gcc_assert (!VEC_empty (gimple, phis));
4096
4097 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4098 {
4099 if (outer_loop)
4100 {
4101 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4102 gimple vect_phi;
4103
4104 /* FORNOW. Currently not supporting the case that an inner-loop
4105 reduction is not used in the outer-loop (but only outside the
4106 outer-loop), unless it is double reduction. */
4107 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4108 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4109 || double_reduc);
4110
4111 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4112 if (!double_reduc
4113 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4114 != vect_double_reduction_def)
4115 continue;
4116
4117 /* Handle double reduction:
4118
4119 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4120 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4121 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4122 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4123
4124 At that point the regular reduction (stmt2 and stmt3) is
4125 already vectorized, as well as the exit phi node, stmt4.
4126 Here we vectorize the phi node of double reduction, stmt1, and
4127 update all relevant statements. */
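              /* Source-level sketch of a double reduction (illustrative,
                 assuming a plain sum over a two-dimensional array):

                   int s = 0;
                   for (j = 0; j < M; j++)      // outer loop: stmt1, stmt4
                     for (i = 0; i < N; i++)    // inner loop: stmt2, stmt3
                       s += a[j][i];
               */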
4128
4129 /* Go through all the uses of s2 to find double reduction phi
4130 node, i.e., stmt1 above. */
4131 orig_name = PHI_RESULT (exit_phi);
4132 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4133 {
4134 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4135 stmt_vec_info new_phi_vinfo;
4136 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4137 basic_block bb = gimple_bb (use_stmt);
4138 gimple use;
4139
4140 /* Check that USE_STMT is really double reduction phi
4141 node. */
4142 if (gimple_code (use_stmt) != GIMPLE_PHI
4143 || gimple_phi_num_args (use_stmt) != 2
4144 || !use_stmt_vinfo
4145 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4146 != vect_double_reduction_def
4147 || bb->loop_father != outer_loop)
4148 continue;
4149
4150 /* Create vector phi node for double reduction:
4151 vs1 = phi <vs0, vs2>
4152 vs1 was created previously in this function by a call to
4153 vect_get_vec_def_for_operand and is stored in
4154 vec_initial_def;
4155 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4156 vs0 is created here. */
4157
4158 /* Create vector phi node. */
4159 vect_phi = create_phi_node (vec_initial_def, bb);
4160 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4161 loop_vec_info_for_loop (outer_loop), NULL);
4162 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4163
4164 /* Create vs0 - initial def of the double reduction phi. */
4165 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4166 loop_preheader_edge (outer_loop));
4167 init_def = get_initial_def_for_reduction (stmt,
4168 preheader_arg, NULL);
4169 vect_phi_init = vect_init_vector (use_stmt, init_def,
4170 vectype, NULL);
4171
4172 /* Update phi node arguments with vs0 and vs2. */
4173 add_phi_arg (vect_phi, vect_phi_init,
4174 loop_preheader_edge (outer_loop),
4175 UNKNOWN_LOCATION);
4176 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4177 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4178 if (vect_print_dump_info (REPORT_DETAILS))
4179 {
4180 fprintf (vect_dump, "created double reduction phi "
4181 "node: ");
4182 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4183 }
4184
4185 vect_phi_res = PHI_RESULT (vect_phi);
4186
4187 /* Replace the use, i.e., set the correct vs1 in the regular
4188 reduction phi node. FORNOW, NCOPIES is always 1, so the
4189 loop is redundant. */
4190 use = reduction_phi;
4191 for (j = 0; j < ncopies; j++)
4192 {
4193 edge pr_edge = loop_preheader_edge (loop);
4194 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4195 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4196 }
4197 }
4198 }
4199 }
4200
4201 VEC_free (gimple, heap, phis);
4202 if (nested_in_vect_loop)
4203 {
4204 if (double_reduc)
4205 loop = outer_loop;
4206 else
4207 continue;
4208 }
4209
4210 phis = VEC_alloc (gimple, heap, 3);
4211 /* Find the loop-closed-use at the loop exit of the original scalar
4212 result. (The reduction result is expected to have two immediate uses,
4213 one at the latch block, and one at the loop exit). For double
4214 reductions we are looking for exit phis of the outer loop. */
4215 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4216 {
4217 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4218 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4219 else
4220 {
4221 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4222 {
4223 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4224
4225 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4226 {
4227 if (!flow_bb_inside_loop_p (loop,
4228 gimple_bb (USE_STMT (phi_use_p))))
4229 VEC_safe_push (gimple, heap, phis,
4230 USE_STMT (phi_use_p));
4231 }
4232 }
4233 }
4234 }
4235
4236 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4237 {
4238 /* Replace the uses: */
4239 orig_name = PHI_RESULT (exit_phi);
4240 scalar_result = VEC_index (tree, scalar_results, k);
4241 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4242 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4243 SET_USE (use_p, scalar_result);
4244 }
4245
4246 VEC_free (gimple, heap, phis);
4247 }
4248
4249 VEC_free (tree, heap, scalar_results);
4250 VEC_free (gimple, heap, new_phis);
4251 }
4252
4253
4254 /* Function vectorizable_reduction.
4255
4256 Check if STMT performs a reduction operation that can be vectorized.
4257 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4258 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4259 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4260
4261 This function also handles reduction idioms (patterns) that have been
4262 recognized in advance during vect_pattern_recog. In this case, STMT may be
4263 of this form:
4264 X = pattern_expr (arg0, arg1, ..., X)
4265    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4266 sequence that had been detected and replaced by the pattern-stmt (STMT).
4267
4268 In some cases of reduction patterns, the type of the reduction variable X is
4269 different than the type of the other arguments of STMT.
4270 In such cases, the vectype that is used when transforming STMT into a vector
4271 stmt is different than the vectype that is used to determine the
4272 vectorization factor, because it consists of a different number of elements
4273 than the actual number of elements that are being operated upon in parallel.
4274
4275 For example, consider an accumulation of shorts into an int accumulator.
4276 On some targets it's possible to vectorize this pattern operating on 8
4277 shorts at a time (hence, the vectype for purposes of determining the
4278 vectorization factor should be V8HI); on the other hand, the vectype that
4279 is used to create the vector form is actually V4SI (the type of the result).
4280
4281 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4282    indicates the actual level of parallelism (V8HI in the example), so
4283    that the right vectorization factor is derived. This vectype
4284 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4285 be used to create the vectorized stmt. The right vectype for the vectorized
4286 stmt is obtained from the type of the result X:
4287 get_vectype_for_scalar_type (TREE_TYPE (X))
4288
4289 This means that, contrary to "regular" reductions (or "regular" stmts in
4290 general), the following equation:
4291 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4292 does *NOT* necessarily hold for reduction patterns. */
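/* Illustration only: the "accumulation of shorts into an int" case described
   above corresponds to source such as

     short x[N];
     int acc = 0;
     for (i = 0; i < N; i++)
       acc += x[i];

   where the vectorization factor is derived from V8HI (eight shorts per
   vector), while the vectorized stmt itself produces V4SI values, the type
   of the accumulator.  */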
4293
4294 bool
4295 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4296 gimple *vec_stmt, slp_tree slp_node)
4297 {
4298 tree vec_dest;
4299 tree scalar_dest;
4300 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4301 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4302 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4303 tree vectype_in = NULL_TREE;
4304 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4305 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4306 enum tree_code code, orig_code, epilog_reduc_code;
4307 enum machine_mode vec_mode;
4308 int op_type;
4309 optab optab, reduc_optab;
4310 tree new_temp = NULL_TREE;
4311 tree def;
4312 gimple def_stmt;
4313 enum vect_def_type dt;
4314 gimple new_phi = NULL;
4315 tree scalar_type;
4316 bool is_simple_use;
4317 gimple orig_stmt;
4318 stmt_vec_info orig_stmt_info;
4319 tree expr = NULL_TREE;
4320 int i;
4321 int ncopies;
4322 int epilog_copies;
4323 stmt_vec_info prev_stmt_info, prev_phi_info;
4324 bool single_defuse_cycle = false;
4325 tree reduc_def = NULL_TREE;
4326 gimple new_stmt = NULL;
4327 int j;
4328 tree ops[3];
4329 bool nested_cycle = false, found_nested_cycle_def = false;
4330 gimple reduc_def_stmt = NULL;
4331 /* The default is that the reduction variable is the last in statement. */
4332 int reduc_index = 2;
4333 bool double_reduc = false, dummy;
4334 basic_block def_bb;
4335 struct loop * def_stmt_loop, *outer_loop = NULL;
4336 tree def_arg;
4337 gimple def_arg_stmt;
4338 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4339 VEC (gimple, heap) *phis = NULL;
4340 int vec_num;
4341 tree def0, def1, tem, op0, op1 = NULL_TREE;
4342
4343 /* In case of reduction chain we switch to the first stmt in the chain, but
4344 we don't update STMT_INFO, since only the last stmt is marked as reduction
4345 and has reduction properties. */
4346 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4347 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4348
4349 if (nested_in_vect_loop_p (loop, stmt))
4350 {
4351 outer_loop = loop;
4352 loop = loop->inner;
4353 nested_cycle = true;
4354 }
4355
4356 /* 1. Is vectorizable reduction? */
4357 /* Not supportable if the reduction variable is used in the loop, unless
4358 it's a reduction chain. */
4359 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4360 && !GROUP_FIRST_ELEMENT (stmt_info))
4361 return false;
4362
4363   /* Reductions that are not used even in an enclosing outer-loop
4364 are expected to be "live" (used out of the loop). */
4365 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4366 && !STMT_VINFO_LIVE_P (stmt_info))
4367 return false;
4368
4369 /* Make sure it was already recognized as a reduction computation. */
4370 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4371 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4372 return false;
4373
4374 /* 2. Has this been recognized as a reduction pattern?
4375
4376 Check if STMT represents a pattern that has been recognized
4377 in earlier analysis stages. For stmts that represent a pattern,
4378 the STMT_VINFO_RELATED_STMT field records the last stmt in
4379 the original sequence that constitutes the pattern. */
4380
4381 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4382 if (orig_stmt)
4383 {
4384 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4385 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4386 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4387 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4388 }
4389
4390 /* 3. Check the operands of the operation. The first operands are defined
4391 inside the loop body. The last operand is the reduction variable,
4392 which is defined by the loop-header-phi. */
4393
4394 gcc_assert (is_gimple_assign (stmt));
4395
4396 /* Flatten RHS. */
4397 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4398 {
4399 case GIMPLE_SINGLE_RHS:
4400 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4401 if (op_type == ternary_op)
4402 {
4403 tree rhs = gimple_assign_rhs1 (stmt);
4404 ops[0] = TREE_OPERAND (rhs, 0);
4405 ops[1] = TREE_OPERAND (rhs, 1);
4406 ops[2] = TREE_OPERAND (rhs, 2);
4407 code = TREE_CODE (rhs);
4408 }
4409 else
4410 return false;
4411 break;
4412
4413 case GIMPLE_BINARY_RHS:
4414 code = gimple_assign_rhs_code (stmt);
4415 op_type = TREE_CODE_LENGTH (code);
4416 gcc_assert (op_type == binary_op);
4417 ops[0] = gimple_assign_rhs1 (stmt);
4418 ops[1] = gimple_assign_rhs2 (stmt);
4419 break;
4420
4421 case GIMPLE_TERNARY_RHS:
4422 code = gimple_assign_rhs_code (stmt);
4423 op_type = TREE_CODE_LENGTH (code);
4424 gcc_assert (op_type == ternary_op);
4425 ops[0] = gimple_assign_rhs1 (stmt);
4426 ops[1] = gimple_assign_rhs2 (stmt);
4427 ops[2] = gimple_assign_rhs3 (stmt);
4428 break;
4429
4430 case GIMPLE_UNARY_RHS:
4431 return false;
4432
4433 default:
4434 gcc_unreachable ();
4435 }
4436
4437 if (code == COND_EXPR && slp_node)
4438 return false;
4439
4440 scalar_dest = gimple_assign_lhs (stmt);
4441 scalar_type = TREE_TYPE (scalar_dest);
4442 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4443 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4444 return false;
4445
4446 /* Do not try to vectorize bit-precision reductions. */
4447 if ((TYPE_PRECISION (scalar_type)
4448 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4449 return false;
4450
4451 /* All uses but the last are expected to be defined in the loop.
4452 The last use is the reduction variable. In case of nested cycle this
4453 assumption is not true: we use reduc_index to record the index of the
4454 reduction variable. */
4455 for (i = 0; i < op_type-1; i++)
4456 {
4457 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4458 if (i == 0 && code == COND_EXPR)
4459 continue;
4460
4461 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4462 &def_stmt, &def, &dt, &tem);
4463 if (!vectype_in)
4464 vectype_in = tem;
4465 gcc_assert (is_simple_use);
4466
4467 if (dt != vect_internal_def
4468 && dt != vect_external_def
4469 && dt != vect_constant_def
4470 && dt != vect_induction_def
4471 && !(dt == vect_nested_cycle && nested_cycle))
4472 return false;
4473
4474 if (dt == vect_nested_cycle)
4475 {
4476 found_nested_cycle_def = true;
4477 reduc_def_stmt = def_stmt;
4478 reduc_index = i;
4479 }
4480 }
4481
4482 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4483 &def, &dt, &tem);
4484 if (!vectype_in)
4485 vectype_in = tem;
4486 gcc_assert (is_simple_use);
4487 gcc_assert (dt == vect_reduction_def
4488 || dt == vect_nested_cycle
4489 || ((dt == vect_internal_def || dt == vect_external_def
4490 || dt == vect_constant_def || dt == vect_induction_def)
4491 && nested_cycle && found_nested_cycle_def));
4492 if (!found_nested_cycle_def)
4493 reduc_def_stmt = def_stmt;
4494
4495 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4496 if (orig_stmt)
4497 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4498 reduc_def_stmt,
4499 !nested_cycle,
4500 &dummy));
4501 else
4502 {
4503 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4504 !nested_cycle, &dummy);
4505 /* We changed STMT to be the first stmt in reduction chain, hence we
4506 check that in this case the first element in the chain is STMT. */
4507 gcc_assert (stmt == tmp
4508 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4509 }
4510
4511 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4512 return false;
4513
4514 if (slp_node || PURE_SLP_STMT (stmt_info))
4515 ncopies = 1;
4516 else
4517 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4518 / TYPE_VECTOR_SUBPARTS (vectype_in));
4519
4520 gcc_assert (ncopies >= 1);
4521
4522 vec_mode = TYPE_MODE (vectype_in);
4523
4524 if (code == COND_EXPR)
4525 {
4526 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4527 {
4528 if (vect_print_dump_info (REPORT_DETAILS))
4529 fprintf (vect_dump, "unsupported condition in reduction");
4530
4531 return false;
4532 }
4533 }
4534 else
4535 {
4536 /* 4. Supportable by target? */
4537
4538 /* 4.1. check support for the operation in the loop */
4539 optab = optab_for_tree_code (code, vectype_in, optab_default);
4540 if (!optab)
4541 {
4542 if (vect_print_dump_info (REPORT_DETAILS))
4543 fprintf (vect_dump, "no optab.");
4544
4545 return false;
4546 }
4547
4548 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4549 {
4550 if (vect_print_dump_info (REPORT_DETAILS))
4551 fprintf (vect_dump, "op not supported by target.");
4552
4553 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4554 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4555 < vect_min_worthwhile_factor (code))
4556 return false;
4557
4558 if (vect_print_dump_info (REPORT_DETAILS))
4559 fprintf (vect_dump, "proceeding using word mode.");
4560 }
4561
4562 /* Worthwhile without SIMD support? */
4563 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4564 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4565 < vect_min_worthwhile_factor (code))
4566 {
4567 if (vect_print_dump_info (REPORT_DETAILS))
4568 fprintf (vect_dump, "not worthwhile without SIMD support.");
4569
4570 return false;
4571 }
4572 }
4573
4574 /* 4.2. Check support for the epilog operation.
4575
4576 If STMT represents a reduction pattern, then the type of the
4577 reduction variable may be different than the type of the rest
4578 of the arguments. For example, consider the case of accumulation
4579 of shorts into an int accumulator; The original code:
4580 S1: int_a = (int) short_a;
4581 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4582
4583 was replaced with:
4584 STMT: int_acc = widen_sum <short_a, int_acc>
4585
4586 This means that:
4587 1. The tree-code that is used to create the vector operation in the
4588 epilog code (that reduces the partial results) is not the
4589 tree-code of STMT, but is rather the tree-code of the original
4590 stmt from the pattern that STMT is replacing. I.e, in the example
4591 above we want to use 'widen_sum' in the loop, but 'plus' in the
4592 epilog.
4593 2. The type (mode) we use to check available target support
4594 for the vector operation to be created in the *epilog*, is
4595 determined by the type of the reduction variable (in the example
4596          above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4597 However the type (mode) we use to check available target support
4598 for the vector operation to be created *inside the loop*, is
4599 determined by the type of the other arguments to STMT (in the
4600 example we'd check this: optab_handler (widen_sum_optab,
4601 vect_short_mode)).
4602
4603 This is contrary to "regular" reductions, in which the types of all
4604 the arguments are the same as the type of the reduction variable.
4605 For "regular" reductions we can therefore use the same vector type
4606 (and also the same tree-code) when generating the epilog code and
4607 when generating the code inside the loop. */
4608
4609 if (orig_stmt)
4610 {
4611 /* This is a reduction pattern: get the vectype from the type of the
4612 reduction variable, and get the tree-code from orig_stmt. */
4613 orig_code = gimple_assign_rhs_code (orig_stmt);
4614 gcc_assert (vectype_out);
4615 vec_mode = TYPE_MODE (vectype_out);
4616 }
4617 else
4618 {
4619       /* Regular reduction: the same vectype and tree-code as used for
4620          the vector code inside the loop can also be used for the epilog code. */
4621 orig_code = code;
4622 }
4623
4624 if (nested_cycle)
4625 {
4626 def_bb = gimple_bb (reduc_def_stmt);
4627 def_stmt_loop = def_bb->loop_father;
4628 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4629 loop_preheader_edge (def_stmt_loop));
4630 if (TREE_CODE (def_arg) == SSA_NAME
4631 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4632 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4633 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4634 && vinfo_for_stmt (def_arg_stmt)
4635 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4636 == vect_double_reduction_def)
4637 double_reduc = true;
4638 }
4639
4640 epilog_reduc_code = ERROR_MARK;
4641 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4642 {
4643 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4644 optab_default);
4645 if (!reduc_optab)
4646 {
4647 if (vect_print_dump_info (REPORT_DETAILS))
4648 fprintf (vect_dump, "no optab for reduction.");
4649
4650 epilog_reduc_code = ERROR_MARK;
4651 }
4652
4653 if (reduc_optab
4654 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4655 {
4656 if (vect_print_dump_info (REPORT_DETAILS))
4657 fprintf (vect_dump, "reduc op not supported by target.");
4658
4659 epilog_reduc_code = ERROR_MARK;
4660 }
4661 }
4662 else
4663 {
4664 if (!nested_cycle || double_reduc)
4665 {
4666 if (vect_print_dump_info (REPORT_DETAILS))
4667 fprintf (vect_dump, "no reduc code for scalar code.");
4668
4669 return false;
4670 }
4671 }
4672
4673 if (double_reduc && ncopies > 1)
4674 {
4675 if (vect_print_dump_info (REPORT_DETAILS))
4676 fprintf (vect_dump, "multiple types in double reduction");
4677
4678 return false;
4679 }
4680
4681   /* In case of widening multiplication by a constant, we update the type
4682 of the constant to be the type of the other operand. We check that the
4683 constant fits the type in the pattern recognition pass. */
4684 if (code == DOT_PROD_EXPR
4685 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4686 {
4687 if (TREE_CODE (ops[0]) == INTEGER_CST)
4688 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4689 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4690 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4691 else
4692 {
4693 if (vect_print_dump_info (REPORT_DETAILS))
4694 fprintf (vect_dump, "invalid types in dot-prod");
4695
4696 return false;
4697 }
4698 }
4699
4700 if (!vec_stmt) /* transformation not required. */
4701 {
4702 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4703 return false;
4704 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4705 return true;
4706 }
4707
4708 /** Transform. **/
4709
4710 if (vect_print_dump_info (REPORT_DETAILS))
4711 fprintf (vect_dump, "transform reduction.");
4712
4713 /* FORNOW: Multiple types are not supported for condition. */
4714 if (code == COND_EXPR)
4715 gcc_assert (ncopies == 1);
4716
4717 /* Create the destination vector */
4718 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4719
4720 /* In case the vectorization factor (VF) is bigger than the number
4721 of elements that we can fit in a vectype (nunits), we have to generate
4722 more than one vector stmt - i.e - we need to "unroll" the
4723 vector stmt by a factor VF/nunits. For more details see documentation
4724 in vectorizable_operation. */
4725
4726 /* If the reduction is used in an outer loop we need to generate
4727 VF intermediate results, like so (e.g. for ncopies=2):
4728 r0 = phi (init, r0)
4729 r1 = phi (init, r1)
4730 r0 = x0 + r0;
4731 r1 = x1 + r1;
4732 (i.e. we generate VF results in 2 registers).
4733 In this case we have a separate def-use cycle for each copy, and therefore
4734 for each copy we get the vector def for the reduction variable from the
4735 respective phi node created for this copy.
4736
4737 Otherwise (the reduction is unused in the loop nest), we can combine
4738 together intermediate results, like so (e.g. for ncopies=2):
4739 r = phi (init, r)
4740 r = x0 + r;
4741 r = x1 + r;
4742 (i.e. we generate VF/2 results in a single register).
4743 In this case for each copy we get the vector def for the reduction variable
4744 from the vectorized reduction operation generated in the previous iteration.
4745 */
4746
4747 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4748 {
4749 single_defuse_cycle = true;
4750 epilog_copies = 1;
4751 }
4752 else
4753 epilog_copies = ncopies;
4754
4755 prev_stmt_info = NULL;
4756 prev_phi_info = NULL;
4757 if (slp_node)
4758 {
4759 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4760 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4761 == TYPE_VECTOR_SUBPARTS (vectype_in));
4762 }
4763 else
4764 {
4765 vec_num = 1;
4766 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4767 if (op_type == ternary_op)
4768 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4769 }
4770
4771 phis = VEC_alloc (gimple, heap, vec_num);
4772 vect_defs = VEC_alloc (tree, heap, vec_num);
4773 if (!slp_node)
4774 VEC_quick_push (tree, vect_defs, NULL_TREE);
4775
4776 for (j = 0; j < ncopies; j++)
4777 {
4778 if (j == 0 || !single_defuse_cycle)
4779 {
4780 for (i = 0; i < vec_num; i++)
4781 {
4782 /* Create the reduction-phi that defines the reduction
4783 operand. */
4784 new_phi = create_phi_node (vec_dest, loop->header);
4785 set_vinfo_for_stmt (new_phi,
4786 new_stmt_vec_info (new_phi, loop_vinfo,
4787 NULL));
4788 if (j == 0 || slp_node)
4789 VEC_quick_push (gimple, phis, new_phi);
4790 }
4791 }
4792
4793 if (code == COND_EXPR)
4794 {
4795 gcc_assert (!slp_node);
4796 vectorizable_condition (stmt, gsi, vec_stmt,
4797 PHI_RESULT (VEC_index (gimple, phis, 0)),
4798 reduc_index, NULL);
4799 /* Multiple types are not supported for condition. */
4800 break;
4801 }
4802
4803 /* Handle uses. */
4804 if (j == 0)
4805 {
4806 op0 = ops[!reduc_index];
4807 if (op_type == ternary_op)
4808 {
4809 if (reduc_index == 0)
4810 op1 = ops[2];
4811 else
4812 op1 = ops[1];
4813 }
4814
4815 if (slp_node)
4816 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4817 slp_node, -1);
4818 else
4819 {
4820 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4821 stmt, NULL);
4822 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4823 if (op_type == ternary_op)
4824 {
4825 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4826 NULL);
4827 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4828 }
4829 }
4830 }
4831 else
4832 {
4833 if (!slp_node)
4834 {
4835 enum vect_def_type dt;
4836 gimple dummy_stmt;
4837 tree dummy;
4838
4839 vect_is_simple_use (ops[!reduc_index], loop_vinfo, NULL,
4840 &dummy_stmt, &dummy, &dt);
4841 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
4842 loop_vec_def0);
4843 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4844 if (op_type == ternary_op)
4845 {
4846 vect_is_simple_use (op1, loop_vinfo, NULL, &dummy_stmt,
4847 &dummy, &dt);
4848 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4849 loop_vec_def1);
4850 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4851 }
4852 }
4853
4854 if (single_defuse_cycle)
4855 reduc_def = gimple_assign_lhs (new_stmt);
4856
4857 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4858 }
4859
4860 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4861 {
4862 if (slp_node)
4863 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4864 else
4865 {
4866 if (!single_defuse_cycle || j == 0)
4867 reduc_def = PHI_RESULT (new_phi);
4868 }
4869
4870 def1 = ((op_type == ternary_op)
4871 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4872 if (op_type == binary_op)
4873 {
4874 if (reduc_index == 0)
4875 expr = build2 (code, vectype_out, reduc_def, def0);
4876 else
4877 expr = build2 (code, vectype_out, def0, reduc_def);
4878 }
4879 else
4880 {
4881 if (reduc_index == 0)
4882 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4883 else
4884 {
4885 if (reduc_index == 1)
4886 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4887 else
4888 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4889 }
4890 }
4891
4892 new_stmt = gimple_build_assign (vec_dest, expr);
4893 new_temp = make_ssa_name (vec_dest, new_stmt);
4894 gimple_assign_set_lhs (new_stmt, new_temp);
4895 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4896
4897 if (slp_node)
4898 {
4899 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4900 VEC_quick_push (tree, vect_defs, new_temp);
4901 }
4902 else
4903 VEC_replace (tree, vect_defs, 0, new_temp);
4904 }
4905
4906 if (slp_node)
4907 continue;
4908
4909 if (j == 0)
4910 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4911 else
4912 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4913
4914 prev_stmt_info = vinfo_for_stmt (new_stmt);
4915 prev_phi_info = vinfo_for_stmt (new_phi);
4916 }
4917
4918 /* Finalize the reduction-phi (set its arguments) and create the
4919 epilog reduction code. */
4920 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4921 {
4922 new_temp = gimple_assign_lhs (*vec_stmt);
4923 VEC_replace (tree, vect_defs, 0, new_temp);
4924 }
4925
4926 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4927 epilog_reduc_code, phis, reduc_index,
4928 double_reduc, slp_node);
4929
4930 VEC_free (gimple, heap, phis);
4931 VEC_free (tree, heap, vec_oprnds0);
4932 if (vec_oprnds1)
4933 VEC_free (tree, heap, vec_oprnds1);
4934
4935 return true;
4936 }
4937
4938 /* Function vect_min_worthwhile_factor.
4939
4940 For a loop where we could vectorize the operation indicated by CODE,
4941 return the minimum vectorization factor that makes it worthwhile
4942 to use generic vectors. */
4943 int
4944 vect_min_worthwhile_factor (enum tree_code code)
4945 {
4946 switch (code)
4947 {
4948 case PLUS_EXPR:
4949 case MINUS_EXPR:
4950 case NEGATE_EXPR:
4951 return 4;
4952
4953 case BIT_AND_EXPR:
4954 case BIT_IOR_EXPR:
4955 case BIT_XOR_EXPR:
4956 case BIT_NOT_EXPR:
4957 return 2;
4958
4959 default:
4960 return INT_MAX;
4961 }
4962 }
4963
4964
4965 /* Function vectorizable_induction
4966
4967 Check if PHI performs an induction computation that can be vectorized.
4968 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4969 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4970 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
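/* Illustrative sketch (not part of this function): for an induction such as

     for (i = 0; i < N; i++)
       {
         a[i] = j;
         j += 2;
       }

   with a 4-element vector type, the vectorized phi is seeded with
   {j0, j0+2, j0+4, j0+6} and each subsequent vector iteration adds the step
   vector {8, 8, 8, 8}.  */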
4971
4972 bool
4973 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4974 gimple *vec_stmt)
4975 {
4976 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4978 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4979 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4980 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4981 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4982 tree vec_def;
4983
4984 gcc_assert (ncopies >= 1);
4985 /* FORNOW. This restriction should be relaxed. */
4986 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4987 {
4988 if (vect_print_dump_info (REPORT_DETAILS))
4989 fprintf (vect_dump, "multiple types in nested loop.");
4990 return false;
4991 }
4992
4993 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4994 return false;
4995
4996 /* FORNOW: SLP not supported. */
4997 if (STMT_SLP_TYPE (stmt_info))
4998 return false;
4999
5000 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5001
5002 if (gimple_code (phi) != GIMPLE_PHI)
5003 return false;
5004
5005 if (!vec_stmt) /* transformation not required. */
5006 {
5007 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5008 if (vect_print_dump_info (REPORT_DETAILS))
5009 fprintf (vect_dump, "=== vectorizable_induction ===");
5010 vect_model_induction_cost (stmt_info, ncopies);
5011 return true;
5012 }
5013
5014 /** Transform. **/
5015
5016 if (vect_print_dump_info (REPORT_DETAILS))
5017 fprintf (vect_dump, "transform induction phi.");
5018
5019 vec_def = get_initial_def_for_induction (phi);
5020 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5021 return true;
5022 }
5023
5024 /* Function vectorizable_live_operation.
5025
5026 STMT computes a value that is used outside the loop. Check if
5027 it can be supported. */
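/* Illustration (a sketch): a stmt is "live" when its result is used after
   the loop, e.g.

     for (i = 0; i < N; i++)
       {
         last = c * 2;        // operands are loop-invariant
         a[i] = last;
       }
     use (last);

   Only the invariant-operand case is handled below: the scalar stmt is then
   simply left in place and its final value is used.  */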
5028
5029 bool
5030 vectorizable_live_operation (gimple stmt,
5031 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5032 gimple *vec_stmt ATTRIBUTE_UNUSED)
5033 {
5034 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5035 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5036 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5037 int i;
5038 int op_type;
5039 tree op;
5040 tree def;
5041 gimple def_stmt;
5042 enum vect_def_type dt;
5043 enum tree_code code;
5044 enum gimple_rhs_class rhs_class;
5045
5046 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5047
5048 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5049 return false;
5050
5051 if (!is_gimple_assign (stmt))
5052 return false;
5053
5054 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5055 return false;
5056
5057 /* FORNOW. CHECKME. */
5058 if (nested_in_vect_loop_p (loop, stmt))
5059 return false;
5060
5061 code = gimple_assign_rhs_code (stmt);
5062 op_type = TREE_CODE_LENGTH (code);
5063 rhs_class = get_gimple_rhs_class (code);
5064 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5065 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5066
5067 /* FORNOW: support only if all uses are invariant. This means
5068 that the scalar operations can remain in place, unvectorized.
5069 The original last scalar value that they compute will be used. */
5070
5071 for (i = 0; i < op_type; i++)
5072 {
5073 if (rhs_class == GIMPLE_SINGLE_RHS)
5074 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5075 else
5076 op = gimple_op (stmt, i + 1);
5077 if (op
5078 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
5079 {
5080 if (vect_print_dump_info (REPORT_DETAILS))
5081 fprintf (vect_dump, "use not simple.");
5082 return false;
5083 }
5084
5085 if (dt != vect_external_def && dt != vect_constant_def)
5086 return false;
5087 }
5088
5089 /* No transformation is required for the cases we currently support. */
5090 return true;
5091 }
5092
5093 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5094
5095 static void
5096 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5097 {
5098 ssa_op_iter op_iter;
5099 imm_use_iterator imm_iter;
5100 def_operand_p def_p;
5101 gimple ustmt;
5102
5103 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5104 {
5105 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5106 {
5107 basic_block bb;
5108
5109 if (!is_gimple_debug (ustmt))
5110 continue;
5111
5112 bb = gimple_bb (ustmt);
5113
5114 if (!flow_bb_inside_loop_p (loop, bb))
5115 {
5116 if (gimple_debug_bind_p (ustmt))
5117 {
5118 if (vect_print_dump_info (REPORT_DETAILS))
5119 fprintf (vect_dump, "killing debug use");
5120
5121 gimple_debug_bind_reset_value (ustmt);
5122 update_stmt (ustmt);
5123 }
5124 else
5125 gcc_unreachable ();
5126 }
5127 }
5128 }
5129 }
5130
5131 /* Function vect_transform_loop.
5132
5133 The analysis phase has determined that the loop is vectorizable.
5134 Vectorize the loop - create vectorized stmts to replace the scalar
5135 stmts in the loop, and update the loop exit condition. */
5136
5137 void
5138 vect_transform_loop (loop_vec_info loop_vinfo)
5139 {
5140 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5141 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5142 int nbbs = loop->num_nodes;
5143 gimple_stmt_iterator si;
5144 int i;
5145 tree ratio = NULL;
5146 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5147 bool strided_store;
5148 bool slp_scheduled = false;
5149 unsigned int nunits;
5150 tree cond_expr = NULL_TREE;
5151 gimple_seq cond_expr_stmt_list = NULL;
5152 bool do_peeling_for_loop_bound;
5153 gimple stmt, pattern_stmt, pattern_def_stmt;
5154 bool transform_pattern_stmt = false, pattern_def = false;
5155
5156 if (vect_print_dump_info (REPORT_DETAILS))
5157 fprintf (vect_dump, "=== vect_transform_loop ===");
5158
5159 /* Peel the loop if there are data refs with unknown alignment.
5160 Peeling can force at most one such data ref to become aligned. */
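/* Illustration only (names and numbers are hypothetical): assuming an
   alignment boundary of 4 int elements and a single access a[i+1] whose
   start is misaligned by one element, peeling 3 scalar iterations makes
   the remaining accesses start on an aligned address:

     for (i = 0; i < 3; i++)          scalar prolog (peeled iterations)
       a[i+1] = b[i+1] + c[i+1];
     for (; i < n; i++)               this loop is then vectorized;
       a[i+1] = b[i+1] + c[i+1];      a[i+1] now starts aligned  */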
5161
5162 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5163 vect_do_peeling_for_alignment (loop_vinfo);
5164
5165 do_peeling_for_loop_bound
5166 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5167 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5168 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5169 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
5170
5171 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5172 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5173 vect_loop_versioning (loop_vinfo,
5174 !do_peeling_for_loop_bound,
5175 &cond_expr, &cond_expr_stmt_list);
5176
5177 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5178 compile time constant), or it is a constant that is not a multiple of the
5179 vectorization factor, then an epilog loop needs to be created.
5180 We therefore duplicate the loop: the original loop will be vectorized,
5181 and will compute the first (n/VF) iterations. The second copy of the loop
5182 will remain scalar and will compute the remaining (n%VF) iterations.
5183 (VF is the vectorization factor). */
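/* As a rough source-level sketch of the result (illustration only; the
   array names and the form of the vector statements are hypothetical):

     for (i = 0; i < (n / VF) * VF; i += VF)
       ...vector stmts covering iterations i .. i+VF-1...
     for (; i < n; i++)
       a[i] = b[i] + c[i];

   The first loop is the vectorized copy; the second is the scalar epilog
   that executes the remaining n % VF iterations.  */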
5184
5185 if (do_peeling_for_loop_bound)
5186 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5187 cond_expr, cond_expr_stmt_list);
5188 else
5189 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5190 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
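/* For instance (numbers purely illustrative): with a known NITERS of 20
   and a vectorization factor of 4, no epilog is needed and RATIO is simply
   20 / 4 == 5; with NITERS of 22, the peeling path above is taken and
   2 iterations are left for the scalar epilog.  */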
5191
5192 /* 1) Make sure the loop header has exactly two entries
5193 2) Make sure we have a preheader basic block. */
5194
5195 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5196
5197 split_edge (loop_preheader_edge (loop));
5198
5199 /* FORNOW: the vectorizer supports only loops whose body consists
5200 of one basic block (header + empty latch). When the vectorizer
5201 supports more involved loop forms, the order in which the BBs are
5202 traversed will need to be reconsidered. */
5203
5204 for (i = 0; i < nbbs; i++)
5205 {
5206 basic_block bb = bbs[i];
5207 stmt_vec_info stmt_info;
5208 gimple phi;
5209
5210 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5211 {
5212 phi = gsi_stmt (si);
5213 if (vect_print_dump_info (REPORT_DETAILS))
5214 {
5215 fprintf (vect_dump, "------>vectorizing phi: ");
5216 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5217 }
5218 stmt_info = vinfo_for_stmt (phi);
5219 if (!stmt_info)
5220 continue;
5221
5222 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5223 vect_loop_kill_debug_uses (loop, phi);
5224
5225 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5226 && !STMT_VINFO_LIVE_P (stmt_info))
5227 continue;
5228
5229 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5230 != (unsigned HOST_WIDE_INT) vectorization_factor)
5231 && vect_print_dump_info (REPORT_DETAILS))
5232 fprintf (vect_dump, "multiple-types.");
5233
5234 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5235 {
5236 if (vect_print_dump_info (REPORT_DETAILS))
5237 fprintf (vect_dump, "transform phi.");
5238 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5239 }
5240 }
5241
5242 pattern_stmt = NULL;
5243 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5244 {
5245 bool is_store;
5246
5247 if (transform_pattern_stmt)
5248 {
5249 stmt = pattern_stmt;
5250 transform_pattern_stmt = false;
5251 }
5252 else
5253 stmt = gsi_stmt (si);
5254
5255 if (vect_print_dump_info (REPORT_DETAILS))
5256 {
5257 fprintf (vect_dump, "------>vectorizing statement: ");
5258 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5259 }
5260
5261 stmt_info = vinfo_for_stmt (stmt);
5262
5263 /* vector stmts created in the outer-loop during vectorization of
5264 stmts in an inner-loop may not have a stmt_info, and do not
5265 need to be vectorized. */
5266 if (!stmt_info)
5267 {
5268 gsi_next (&si);
5269 continue;
5270 }
5271
5272 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5273 vect_loop_kill_debug_uses (loop, stmt);
5274
5275 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5276 && !STMT_VINFO_LIVE_P (stmt_info))
5277 {
5278 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5279 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5280 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5281 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5282 {
5283 stmt = pattern_stmt;
5284 stmt_info = vinfo_for_stmt (stmt);
5285 }
5286 else
5287 {
5288 gsi_next (&si);
5289 continue;
5290 }
5291 }
5292 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5293 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5294 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5295 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5296 transform_pattern_stmt = true;
5297
5298 /* If the pattern statement has a def stmt, vectorize it too. */
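/* For instance (a purely hypothetical illustration): a recognized pattern
   may carry an auxiliary def stmt, such as an intermediate type conversion
   of one of its operands, e.g.

     patt_tmp = (interm_type) a;       the pattern def stmt
     patt_res = patt_tmp << c;         the pattern stmt itself

   in which case the def stmt is picked up and vectorized here before the
   pattern stmt.  */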
5299 if (is_pattern_stmt_p (stmt_info)
5300 && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
5301 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
5302 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
5303 {
5304 if (pattern_def)
5305 pattern_def = false;
5306 else
5307 {
5308 if (vect_print_dump_info (REPORT_DETAILS))
5309 {
5310 fprintf (vect_dump, "==> vectorizing pattern def"
5311 " stmt: ");
5312 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
5313 TDF_SLIM);
5314 }
5315
5316 pattern_def = true;
5317 stmt = pattern_def_stmt;
5318 stmt_info = vinfo_for_stmt (stmt);
5319 }
5320 }
5321
5322 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5323 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5324 STMT_VINFO_VECTYPE (stmt_info));
5325 if (!STMT_SLP_TYPE (stmt_info)
5326 && nunits != (unsigned int) vectorization_factor
5327 && vect_print_dump_info (REPORT_DETAILS))
5328 /* For SLP, VF is set according to the unrolling factor, and not to
5329 the vector size, hence for SLP this message does not apply. */
5330 fprintf (vect_dump, "multiple-types.");
5331
5332 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5333 reached. */
5334 if (STMT_SLP_TYPE (stmt_info))
5335 {
5336 if (!slp_scheduled)
5337 {
5338 slp_scheduled = true;
5339
5340 if (vect_print_dump_info (REPORT_DETAILS))
5341 fprintf (vect_dump, "=== scheduling SLP instances ===");
5342
5343 vect_schedule_slp (loop_vinfo, NULL);
5344 }
5345
5346 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5347 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5348 {
5349 if (!transform_pattern_stmt && !pattern_def)
5350 gsi_next (&si);
5351 continue;
5352 }
5353 }
5354
5355 /* -------- vectorize statement ------------ */
5356 if (vect_print_dump_info (REPORT_DETAILS))
5357 fprintf (vect_dump, "transform statement.");
5358
5359 strided_store = false;
5360 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5361 if (is_store)
5362 {
5363 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5364 {
5365 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5366 interleaving chain was completed - free all the stores in
5367 the chain. */
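/* For instance (illustration only): for an interleaved store group such as
   a[2*i] = x;  a[2*i+1] = y;  all members of the group are emitted together
   once the chain is complete, so every original scalar store in the chain
   can be removed here.  */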
5368 gsi_next (&si);
5369 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5370 continue;
5371 }
5372 else
5373 {
5374 /* Free the attached stmt_vec_info and remove the stmt. */
5375 free_stmt_vec_info (gsi_stmt (si));
5376 gsi_remove (&si, true);
5377 continue;
5378 }
5379 }
5380
5381 if (!transform_pattern_stmt && !pattern_def)
5382 gsi_next (&si);
5383 } /* stmts in BB */
5384 } /* BBs in loop */
5385
5386 slpeel_make_loop_iterate_ntimes (loop, ratio);
5387
5388 /* The memory tags and pointers in vectorized statements need to
5389 have their SSA forms updated. FIXME, why can't this be delayed
5390 until all the loops have been transformed? */
5391 update_ssa (TODO_update_ssa);
5392
5393 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5394 fprintf (vect_dump, "LOOP VECTORIZED.");
5395 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5396 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
5397 }