gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "tree-chrec.h"
43 #include "tree-scalar-evolution.h"
44 #include "tree-vectorizer.h"
45 #include "target.h"
46
47 /* Loop Vectorization Pass.
48
49 This pass tries to vectorize loops.
50
51 For example, the vectorizer transforms the following simple loop:
52
53 short a[N]; short b[N]; short c[N]; int i;
54
55 for (i=0; i<N; i++){
56 a[i] = b[i] + c[i];
57 }
58
59 as if it had been manually vectorized by rewriting the source code into:
60
61 typedef int __attribute__((mode(V8HI))) v8hi;
62 short a[N]; short b[N]; short c[N]; int i;
63 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
64 v8hi va, vb, vc;
65
66 for (i=0; i<N/8; i++){
67 vb = pb[i];
68 vc = pc[i];
69 va = vb + vc;
70 pa[i] = va;
71 }
72
73 The main entry to this pass is vectorize_loops(), in which
74 the vectorizer applies a set of analyses on a given set of loops,
75 followed by the actual vectorization transformation for the loops that
76 had successfully passed the analysis phase.
77 Throughout this pass we make a distinction between two types of
78 data: scalars (which are represented by SSA_NAMES), and memory references
79 ("data-refs"). These two types of data require different handling both
80 during analysis and transformation. The types of data-refs that the
81 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
82 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
83 accesses are required to have a simple (consecutive) access pattern.
84
85 Analysis phase:
86 ===============
87 The driver for the analysis phase is vect_analyze_loop().
88 It applies a set of analyses, some of which rely on the scalar evolution
89 analyzer (scev) developed by Sebastian Pop.
90
91 During the analysis phase the vectorizer records some information
92 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
93 loop, as well as general information about the loop as a whole, which is
94 recorded in a "loop_vec_info" struct attached to each loop.
95
96 Transformation phase:
97 =====================
98 The loop transformation phase scans all the stmts in the loop, and
99 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
100 the loop that needs to be vectorized. It inserts the vector code sequence
101 just before the scalar stmt S, and records a pointer to the vector code
102 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
103 attached to S). This pointer will be used for the vectorization of following
104 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
105 otherwise, we rely on dead code elimination for removing it.
106
107 For example, say stmt S1 was vectorized into stmt VS1:
108
109 VS1: vb = px[i];
110 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
111 S2: a = b;
112
113 To vectorize stmt S2, the vectorizer first finds the stmt that defines
114 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
115 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
116 resulting sequence would be:
117
118 VS1: vb = px[i];
119 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
120 VS2: va = vb;
121 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
122
123 Operands that are not SSA_NAMEs are data-refs that appear in
124 load/store operations (like 'x[i]' in S1), and are handled differently.
125
126 Target modeling:
127 =================
128 Currently the only target specific information that is used is the
129 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
130 Targets that can support different sizes of vectors will, for now, need
131 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
132 flexibility will be added in the future.
133
134 Since we only vectorize operations whose vector form can be
135 expressed using existing tree codes, to verify that an operation is
136 supported, the vectorizer checks the relevant optab at the relevant
137 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
138 the value found is CODE_FOR_nothing, then there's no target support, and
139 we can't vectorize the stmt.
140
141 For additional information on this project see:
142 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
143 */
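/* Illustrative sketch of the target-support query described above (the
   exact optab_handler interface may differ between GCC versions):

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no vector addition in V8HImode, so the stmt cannot be
           vectorized ...                                              */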
144
145 /* Function vect_determine_vectorization_factor
146
147 Determine the vectorization factor (VF). VF is the number of data elements
148 that are operated upon in parallel in a single iteration of the vectorized
149 loop. For example, when vectorizing a loop that operates on 4-byte elements,
150 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
151 elements can fit in a single vector register.
152
153 We currently support vectorization of loops in which all the vector types
154 operated upon have the same size. Therefore this function sets VF according
155 to the smallest scalar type operated upon, and fails if the statements in
156 the loop require vectors of different sizes.
157
158 VF is also the factor by which the loop iterations are strip-mined, e.g.:
159 original loop:
160 for (i=0; i<N; i++){
161 a[i] = b[i] + c[i];
162 }
163
164 vectorized loop:
165 for (i=0; i<N; i+=VF){
166 a[i:VF] = b[i:VF] + c[i:VF];
167 }
168 */
169
170 static bool
171 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 {
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
175 int nbbs = loop->num_nodes;
176 gimple_stmt_iterator si;
177 unsigned int vectorization_factor = 0;
178 tree scalar_type;
179 gimple phi;
180 tree vectype;
181 unsigned int nunits;
182 stmt_vec_info stmt_info;
183 int i;
184 HOST_WIDE_INT dummy;
185
186 if (vect_print_dump_info (REPORT_DETAILS))
187 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
188
189 for (i = 0; i < nbbs; i++)
190 {
191 basic_block bb = bbs[i];
192
193 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
194 {
195 phi = gsi_stmt (si);
196 stmt_info = vinfo_for_stmt (phi);
197 if (vect_print_dump_info (REPORT_DETAILS))
198 {
199 fprintf (vect_dump, "==> examining phi: ");
200 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
201 }
202
203 gcc_assert (stmt_info);
204
205 if (STMT_VINFO_RELEVANT_P (stmt_info))
206 {
207 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
208 scalar_type = TREE_TYPE (PHI_RESULT (phi));
209
210 if (vect_print_dump_info (REPORT_DETAILS))
211 {
212 fprintf (vect_dump, "get vectype for scalar type: ");
213 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
214 }
215
216 vectype = get_vectype_for_scalar_type (scalar_type);
217 if (!vectype)
218 {
219 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
220 {
221 fprintf (vect_dump,
222 "not vectorized: unsupported data-type ");
223 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
224 }
225 return false;
226 }
227 STMT_VINFO_VECTYPE (stmt_info) = vectype;
228
229 if (vect_print_dump_info (REPORT_DETAILS))
230 {
231 fprintf (vect_dump, "vectype: ");
232 print_generic_expr (vect_dump, vectype, TDF_SLIM);
233 }
234
235 nunits = TYPE_VECTOR_SUBPARTS (vectype);
236 if (vect_print_dump_info (REPORT_DETAILS))
237 fprintf (vect_dump, "nunits = %d", nunits);
238
239 if (!vectorization_factor
240 || (nunits > vectorization_factor))
241 vectorization_factor = nunits;
242 }
243 }
244
245 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
246 {
247 tree vf_vectype;
248 gimple stmt = gsi_stmt (si);
249 stmt_info = vinfo_for_stmt (stmt);
250
251 if (vect_print_dump_info (REPORT_DETAILS))
252 {
253 fprintf (vect_dump, "==> examining statement: ");
254 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
255 }
256
257 gcc_assert (stmt_info);
258
259 /* skip stmts which do not need to be vectorized. */
260 if (!STMT_VINFO_RELEVANT_P (stmt_info)
261 && !STMT_VINFO_LIVE_P (stmt_info))
262 {
263 if (vect_print_dump_info (REPORT_DETAILS))
264 fprintf (vect_dump, "skip.");
265 continue;
266 }
267
268 if (gimple_get_lhs (stmt) == NULL_TREE)
269 {
270 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
271 {
272 fprintf (vect_dump, "not vectorized: irregular stmt.");
273 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
274 }
275 return false;
276 }
277
278 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
279 {
280 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
281 {
282 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
283 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
284 }
285 return false;
286 }
287
288 if (STMT_VINFO_VECTYPE (stmt_info))
289 {
290 /* The only case when a vectype had been already set is for stmts
291 that contain a dataref, or for "pattern-stmts" (stmts generated
292 by the vectorizer to represent/replace a certain idiom). */
293 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
294 || is_pattern_stmt_p (stmt_info));
295 vectype = STMT_VINFO_VECTYPE (stmt_info);
296 }
297 else
298 {
299 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
300 && !is_pattern_stmt_p (stmt_info));
301
302 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
303 if (vect_print_dump_info (REPORT_DETAILS))
304 {
305 fprintf (vect_dump, "get vectype for scalar type: ");
306 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
307 }
308 vectype = get_vectype_for_scalar_type (scalar_type);
309 if (!vectype)
310 {
311 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
312 {
313 fprintf (vect_dump,
314 "not vectorized: unsupported data-type ");
315 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
316 }
317 return false;
318 }
319
320 STMT_VINFO_VECTYPE (stmt_info) = vectype;
321 }
322
323 /* The vectorization factor is according to the smallest
324 scalar type (or the largest vector size, but we only
325 support one vector size per loop). */
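          /* Illustrative example (assuming a 16-byte vector size): if the
             loop operates on both "short" and "int" values, the smallest
             scalar type is "short", its vector type V8HI has 8 units, and
             the VF becomes 8; the check below only requires that the two
             vector types (e.g. V8HI and V4SI) have the same size.  */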
326 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
327 &dummy);
328 if (vect_print_dump_info (REPORT_DETAILS))
329 {
330 fprintf (vect_dump, "get vectype for scalar type: ");
331 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
332 }
333 vf_vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vf_vectype)
335 {
336 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
337 {
338 fprintf (vect_dump,
339 "not vectorized: unsupported data-type ");
340 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
341 }
342 return false;
343 }
344
345 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
346 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
347 {
348 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
349 {
350 fprintf (vect_dump,
351 "not vectorized: different sized vector "
352 "types in statement, ");
353 print_generic_expr (vect_dump, vectype, TDF_SLIM);
354 fprintf (vect_dump, " and ");
355 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
356 }
357 return false;
358 }
359
360 if (vect_print_dump_info (REPORT_DETAILS))
361 {
362 fprintf (vect_dump, "vectype: ");
363 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
364 }
365
366 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
367 if (vect_print_dump_info (REPORT_DETAILS))
368 fprintf (vect_dump, "nunits = %d", nunits);
369
370 if (!vectorization_factor
371 || (nunits > vectorization_factor))
372 vectorization_factor = nunits;
373 }
374 }
375
376 /* TODO: Analyze cost. Decide if worth while to vectorize. */
377 if (vect_print_dump_info (REPORT_DETAILS))
378 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
379 if (vectorization_factor <= 1)
380 {
381 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
382 fprintf (vect_dump, "not vectorized: unsupported data-type");
383 return false;
384 }
385 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
386
387 return true;
388 }
389
390
391 /* Function vect_is_simple_iv_evolution.
392
393 FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution with constant step. */
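/* For example (illustrative): an induction variable updated as "i = i + 3"
   has the access function {init, +, 3}, whose evolution part is the
   constant 3 - a "simple" evolution.  An access function whose step itself
   varies across iterations (a polynomial of degree >= 2) is rejected
   below.  */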
395
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
399 {
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403
404 /* When there is no evolution in this loop, the evolution function
405 is not "simple". */
406 if (evolution_part == NULL_TREE)
407 return false;
408
409 /* When the evolution is a polynomial of degree >= 2
410 the evolution function is not "simple". */
411 if (tree_is_chrec (evolution_part))
412 return false;
413
414 step_expr = evolution_part;
415 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416
417 if (vect_print_dump_info (REPORT_DETAILS))
418 {
419 fprintf (vect_dump, "step: ");
420 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
421 fprintf (vect_dump, ", init: ");
422 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
423 }
424
425 *init = init_expr;
426 *step = step_expr;
427
428 if (TREE_CODE (step_expr) != INTEGER_CST)
429 {
430 if (vect_print_dump_info (REPORT_DETAILS))
431 fprintf (vect_dump, "step unknown.");
432 return false;
433 }
434
435 return true;
436 }
437
438 /* Function vect_analyze_scalar_cycles_1.
439
440 Examine the cross iteration def-use cycles of scalar variables
441 in LOOP. LOOP_VINFO represents the loop that is now being
442 considered for vectorization (can be LOOP, or an outer-loop
443 enclosing LOOP). */
444
445 static void
446 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
447 {
448 basic_block bb = loop->header;
449 tree dumy;
450 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
451 gimple_stmt_iterator gsi;
452 bool double_reduc;
453
454 if (vect_print_dump_info (REPORT_DETAILS))
455 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
456
457 /* First - identify all inductions. Reduction detection assumes that all the
458 inductions have been identified, therefore, this order must not be
459 changed. */
460 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
461 {
462 gimple phi = gsi_stmt (gsi);
463 tree access_fn = NULL;
464 tree def = PHI_RESULT (phi);
465 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
466
467 if (vect_print_dump_info (REPORT_DETAILS))
468 {
469 fprintf (vect_dump, "Analyze phi: ");
470 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
471 }
472
473 /* Skip virtual phi's. The data dependences that are associated with
474 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
475 if (!is_gimple_reg (SSA_NAME_VAR (def)))
476 continue;
477
478 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
479
480 /* Analyze the evolution function. */
481 access_fn = analyze_scalar_evolution (loop, def);
482 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
483 {
484 fprintf (vect_dump, "Access function of PHI: ");
485 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
486 }
487
488 if (!access_fn
489 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
490 {
491 VEC_safe_push (gimple, heap, worklist, phi);
492 continue;
493 }
494
495 if (vect_print_dump_info (REPORT_DETAILS))
496 fprintf (vect_dump, "Detected induction.");
497 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
498 }
499
500
501 /* Second - identify all reductions and nested cycles. */
502 while (VEC_length (gimple, worklist) > 0)
503 {
504 gimple phi = VEC_pop (gimple, worklist);
505 tree def = PHI_RESULT (phi);
506 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
507 gimple reduc_stmt;
508 bool nested_cycle;
509
510 if (vect_print_dump_info (REPORT_DETAILS))
511 {
512 fprintf (vect_dump, "Analyze phi: ");
513 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
514 }
515
516 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
517 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
518
519 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
520 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
521 &double_reduc);
522 if (reduc_stmt)
523 {
524 if (double_reduc)
525 {
526 if (vect_print_dump_info (REPORT_DETAILS))
527 fprintf (vect_dump, "Detected double reduction.");
528
529 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
530 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
531 vect_double_reduction_def;
532 }
533 else
534 {
535 if (nested_cycle)
536 {
537 if (vect_print_dump_info (REPORT_DETAILS))
538 fprintf (vect_dump, "Detected vectorizable nested cycle.");
539
540 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
541 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
542 vect_nested_cycle;
543 }
544 else
545 {
546 if (vect_print_dump_info (REPORT_DETAILS))
547 fprintf (vect_dump, "Detected reduction.");
548
549 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
550 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
551 vect_reduction_def;
552 /* Store the reduction cycles for possible vectorization in
553 loop-aware SLP. */
554 VEC_safe_push (gimple, heap,
555 LOOP_VINFO_REDUCTIONS (loop_vinfo),
556 reduc_stmt);
557 }
558 }
559 }
560 else
561 if (vect_print_dump_info (REPORT_DETAILS))
562 fprintf (vect_dump, "Unknown def-use cycle pattern.");
563 }
564
565 VEC_free (gimple, heap, worklist);
566 }
567
568
569 /* Function vect_analyze_scalar_cycles.
570
571 Examine the cross iteration def-use cycles of scalar variables, by
572 analyzing the loop-header PHIs of scalar variables. Classify each
573 cycle as one of the following: invariant, induction, reduction, unknown.
574 We do that for the loop represented by LOOP_VINFO, and also for its
575 inner-loop, if it exists.
576 Examples for scalar cycles:
577
578 Example1: reduction:
579
580 loop1:
581 for (i=0; i<N; i++)
582 sum += a[i];
583
584 Example2: induction:
585
586 loop2:
587 for (i=0; i<N; i++)
588 a[i] = i; */
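/* Example3 (illustrative): double reduction, arising when the outer-loop
   of a nest is vectorized and the inner-loop reduction result is itself
   accumulated across the outer-loop iterations:

   loop3:
   for (j=0; j<M; j++)
     for (i=0; i<N; i++)
       sum += a[j][i];  */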
589
590 static void
591 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
592 {
593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
594
595 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
596
597 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
598 Reductions in such inner-loop therefore have different properties than
599 the reductions in the nest that gets vectorized:
600 1. When vectorized, they are executed in the same order as in the original
601 scalar loop, so we can't change the order of computation when
602 vectorizing them.
603 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
604 current checks are too strict. */
605
606 if (loop->inner)
607 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
608 }
609
610 /* Function vect_get_loop_niters.
611
612 Determine how many times the loop is executed.
613 If an expression that represents the number of iterations
614 can be constructed, place it in NUMBER_OF_ITERATIONS.
615 Return the loop exit condition. */
616
617 static gimple
618 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
619 {
620 tree niters;
621
622 if (vect_print_dump_info (REPORT_DETAILS))
623 fprintf (vect_dump, "=== get_loop_niters ===");
624
625 niters = number_of_exit_cond_executions (loop);
626
627 if (niters != NULL_TREE
628 && niters != chrec_dont_know)
629 {
630 *number_of_iterations = niters;
631
632 if (vect_print_dump_info (REPORT_DETAILS))
633 {
634 fprintf (vect_dump, "==> get_loop_niters:" );
635 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
636 }
637 }
638
639 return get_loop_exit_condition (loop);
640 }
641
642
643 /* Function bb_in_loop_p
644
645 Used as predicate for dfs order traversal of the loop bbs. */
646
647 static bool
648 bb_in_loop_p (const_basic_block bb, const void *data)
649 {
650 const struct loop *const loop = (const struct loop *)data;
651 if (flow_bb_inside_loop_p (loop, bb))
652 return true;
653 return false;
654 }
655
656
657 /* Function new_loop_vec_info.
658
659 Create and initialize a new loop_vec_info struct for LOOP, as well as
660 stmt_vec_info structs for all the stmts in LOOP. */
661
662 static loop_vec_info
663 new_loop_vec_info (struct loop *loop)
664 {
665 loop_vec_info res;
666 basic_block *bbs;
667 gimple_stmt_iterator si;
668 unsigned int i, nbbs;
669
670 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
671 LOOP_VINFO_LOOP (res) = loop;
672
673 bbs = get_loop_body (loop);
674
675 /* Create/Update stmt_info for all stmts in the loop. */
676 for (i = 0; i < loop->num_nodes; i++)
677 {
678 basic_block bb = bbs[i];
679
680 /* BBs in a nested inner-loop will have been already processed (because
681 we will have called vect_analyze_loop_form for any nested inner-loop).
682 Therefore, for stmts in an inner-loop we just want to update the
683 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
684 loop_info of the outer-loop we are currently considering to vectorize
685 (instead of the loop_info of the inner-loop).
686 For stmts in other BBs we need to create a stmt_info from scratch. */
687 if (bb->loop_father != loop)
688 {
689 /* Inner-loop bb. */
690 gcc_assert (loop->inner && bb->loop_father == loop->inner);
691 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
692 {
693 gimple phi = gsi_stmt (si);
694 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
695 loop_vec_info inner_loop_vinfo =
696 STMT_VINFO_LOOP_VINFO (stmt_info);
697 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
698 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
699 }
700 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
701 {
702 gimple stmt = gsi_stmt (si);
703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
704 loop_vec_info inner_loop_vinfo =
705 STMT_VINFO_LOOP_VINFO (stmt_info);
706 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
707 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
708 }
709 }
710 else
711 {
712 /* bb in current nest. */
713 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
714 {
715 gimple phi = gsi_stmt (si);
716 gimple_set_uid (phi, 0);
717 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
718 }
719
720 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
721 {
722 gimple stmt = gsi_stmt (si);
723 gimple_set_uid (stmt, 0);
724 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
725 }
726 }
727 }
728
729 /* CHECKME: We want to visit all BBs before their successors (except for
730 latch blocks, for which this assertion wouldn't hold). In the simple
731 case of the loop forms we allow, a dfs order of the BBs would be the same
732 as a reversed postorder traversal, so we are safe. */
733
734 free (bbs);
735 bbs = XCNEWVEC (basic_block, loop->num_nodes);
736 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
737 bbs, loop->num_nodes, loop);
738 gcc_assert (nbbs == loop->num_nodes);
739
740 LOOP_VINFO_BBS (res) = bbs;
741 LOOP_VINFO_NITERS (res) = NULL;
742 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
743 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
744 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
745 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
746 LOOP_VINFO_VECT_FACTOR (res) = 0;
747 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
748 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
749 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
750 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
751 VEC_alloc (gimple, heap,
752 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
753 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
754 VEC_alloc (ddr_p, heap,
755 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
756 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
757 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
758 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
759 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
760 LOOP_VINFO_PEELING_HTAB (res) = NULL;
761
762 return res;
763 }
764
765
766 /* Function destroy_loop_vec_info.
767
768 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
769 stmts in the loop. */
770
771 void
772 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
773 {
774 struct loop *loop;
775 basic_block *bbs;
776 int nbbs;
777 gimple_stmt_iterator si;
778 int j;
779 VEC (slp_instance, heap) *slp_instances;
780 slp_instance instance;
781
782 if (!loop_vinfo)
783 return;
784
785 loop = LOOP_VINFO_LOOP (loop_vinfo);
786
787 bbs = LOOP_VINFO_BBS (loop_vinfo);
788 nbbs = loop->num_nodes;
789
790 if (!clean_stmts)
791 {
792 free (LOOP_VINFO_BBS (loop_vinfo));
793 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
794 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
795 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
796
797 free (loop_vinfo);
798 loop->aux = NULL;
799 return;
800 }
801
802 for (j = 0; j < nbbs; j++)
803 {
804 basic_block bb = bbs[j];
805 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
806 free_stmt_vec_info (gsi_stmt (si));
807
808 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
809 {
810 gimple stmt = gsi_stmt (si);
811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
812
813 if (stmt_info)
814 {
815 /* Check if this is a "pattern stmt" (introduced by the
816 vectorizer during the pattern recognition pass). */
817 bool remove_stmt_p = false;
818 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
819 if (orig_stmt)
820 {
821 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
822 if (orig_stmt_info
823 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
824 remove_stmt_p = true;
825 }
826
827 /* Free stmt_vec_info. */
828 free_stmt_vec_info (stmt);
829
830 /* Remove dead "pattern stmts". */
831 if (remove_stmt_p)
832 gsi_remove (&si, true);
833 }
834 gsi_next (&si);
835 }
836 }
837
838 free (LOOP_VINFO_BBS (loop_vinfo));
839 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
840 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
841 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
842 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
843 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
844 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
845 vect_free_slp_instance (instance);
846
847 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
848 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
849 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
850
851 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
852 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
853
854 free (loop_vinfo);
855 loop->aux = NULL;
856 }
857
858
859 /* Function vect_analyze_loop_1.
860
861 Apply a set of analyses on LOOP, and create a loop_vec_info struct
862 for it. The different analyses will record information in the
863 loop_vec_info struct. This is a subset of the analyses applied in
864 vect_analyze_loop, to be applied on an inner-loop nested in the loop
865 that is now considered for (outer-loop) vectorization. */
866
867 static loop_vec_info
868 vect_analyze_loop_1 (struct loop *loop)
869 {
870 loop_vec_info loop_vinfo;
871
872 if (vect_print_dump_info (REPORT_DETAILS))
873 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
874
875 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
876
877 loop_vinfo = vect_analyze_loop_form (loop);
878 if (!loop_vinfo)
879 {
880 if (vect_print_dump_info (REPORT_DETAILS))
881 fprintf (vect_dump, "bad inner-loop form.");
882 return NULL;
883 }
884
885 return loop_vinfo;
886 }
887
888
889 /* Function vect_analyze_loop_form.
890
891 Verify that certain CFG restrictions hold, including:
892 - the loop has a pre-header
893 - the loop has a single entry and exit
894 - the loop exit condition is simple enough, and the number of iterations
895 can be analyzed (a countable loop). */
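/* Illustrative examples of the restrictions checked below:

     for (i = 0; i < n; i++)         <-- accepted: single entry and exit,
       a[i] = b[i];                       countable number of iterations

     for (i = 0; i < n; i++)
       {
         if (b[i] < 0)
           break;                     <-- rejected: the break adds a second
         a[i] = b[i];                     loop exit ("multiple exits")
       }
 */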
896
897 loop_vec_info
898 vect_analyze_loop_form (struct loop *loop)
899 {
900 loop_vec_info loop_vinfo;
901 gimple loop_cond;
902 tree number_of_iterations = NULL;
903 loop_vec_info inner_loop_vinfo = NULL;
904
905 if (vect_print_dump_info (REPORT_DETAILS))
906 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
907
908 /* Different restrictions apply when we are considering an inner-most loop,
909 vs. an outer (nested) loop.
910 (FORNOW. May want to relax some of these restrictions in the future). */
911
912 if (!loop->inner)
913 {
914 /* Inner-most loop. We currently require that the number of BBs is
915 exactly 2 (the header and latch). Vectorizable inner-most loops
916 look like this:
917
918 (pre-header)
919 |
920 header <--------+
921 | | |
922 | +--> latch --+
923 |
924 (exit-bb) */
925
926 if (loop->num_nodes != 2)
927 {
928 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
929 fprintf (vect_dump, "not vectorized: control flow in loop.");
930 return NULL;
931 }
932
933 if (empty_block_p (loop->header))
934 {
935 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
936 fprintf (vect_dump, "not vectorized: empty loop.");
937 return NULL;
938 }
939 }
940 else
941 {
942 struct loop *innerloop = loop->inner;
943 edge entryedge;
944
945 /* Nested loop. We currently require that the loop is doubly-nested,
946 contains a single inner loop, and the number of BBs is exactly 5.
947 Vectorizable outer-loops look like this:
948
949 (pre-header)
950 |
951 header <---+
952 | |
953 inner-loop |
954 | |
955 tail ------+
956 |
957 (exit-bb)
958
959 The inner-loop has the properties expected of inner-most loops
960 as described above. */
961
962 if ((loop->inner)->inner || (loop->inner)->next)
963 {
964 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
965 fprintf (vect_dump, "not vectorized: multiple nested loops.");
966 return NULL;
967 }
968
969 /* Analyze the inner-loop. */
970 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
971 if (!inner_loop_vinfo)
972 {
973 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
974 fprintf (vect_dump, "not vectorized: Bad inner loop.");
975 return NULL;
976 }
977
978 if (!expr_invariant_in_loop_p (loop,
979 LOOP_VINFO_NITERS (inner_loop_vinfo)))
980 {
981 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
982 fprintf (vect_dump,
983 "not vectorized: inner-loop count not invariant.");
984 destroy_loop_vec_info (inner_loop_vinfo, true);
985 return NULL;
986 }
987
988 if (loop->num_nodes != 5)
989 {
990 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
991 fprintf (vect_dump, "not vectorized: control flow in loop.");
992 destroy_loop_vec_info (inner_loop_vinfo, true);
993 return NULL;
994 }
995
996 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
997 entryedge = EDGE_PRED (innerloop->header, 0);
998 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
999 entryedge = EDGE_PRED (innerloop->header, 1);
1000
1001 if (entryedge->src != loop->header
1002 || !single_exit (innerloop)
1003 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1004 {
1005 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1006 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1007 destroy_loop_vec_info (inner_loop_vinfo, true);
1008 return NULL;
1009 }
1010
1011 if (vect_print_dump_info (REPORT_DETAILS))
1012 fprintf (vect_dump, "Considering outer-loop vectorization.");
1013 }
1014
1015 if (!single_exit (loop)
1016 || EDGE_COUNT (loop->header->preds) != 2)
1017 {
1018 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1019 {
1020 if (!single_exit (loop))
1021 fprintf (vect_dump, "not vectorized: multiple exits.");
1022 else if (EDGE_COUNT (loop->header->preds) != 2)
1023 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1024 }
1025 if (inner_loop_vinfo)
1026 destroy_loop_vec_info (inner_loop_vinfo, true);
1027 return NULL;
1028 }
1029
1030 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1031 that the loop is represented as a do-while (with a proper if-guard
1032 before the loop if needed), where the loop header contains all the
1033 executable statements, and the latch is empty. */
1034 if (!empty_block_p (loop->latch)
1035 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1036 {
1037 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1038 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1039 if (inner_loop_vinfo)
1040 destroy_loop_vec_info (inner_loop_vinfo, true);
1041 return NULL;
1042 }
1043
1044 /* Make sure there exists a single-predecessor exit bb: */
1045 if (!single_pred_p (single_exit (loop)->dest))
1046 {
1047 edge e = single_exit (loop);
1048 if (!(e->flags & EDGE_ABNORMAL))
1049 {
1050 split_loop_exit_edge (e);
1051 if (vect_print_dump_info (REPORT_DETAILS))
1052 fprintf (vect_dump, "split exit edge.");
1053 }
1054 else
1055 {
1056 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1057 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1058 if (inner_loop_vinfo)
1059 destroy_loop_vec_info (inner_loop_vinfo, true);
1060 return NULL;
1061 }
1062 }
1063
1064 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1065 if (!loop_cond)
1066 {
1067 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1068 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1069 if (inner_loop_vinfo)
1070 destroy_loop_vec_info (inner_loop_vinfo, true);
1071 return NULL;
1072 }
1073
1074 if (!number_of_iterations)
1075 {
1076 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1077 fprintf (vect_dump,
1078 "not vectorized: number of iterations cannot be computed.");
1079 if (inner_loop_vinfo)
1080 destroy_loop_vec_info (inner_loop_vinfo, true);
1081 return NULL;
1082 }
1083
1084 if (chrec_contains_undetermined (number_of_iterations))
1085 {
1086 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1087 fprintf (vect_dump, "Infinite number of iterations.");
1088 if (inner_loop_vinfo)
1089 destroy_loop_vec_info (inner_loop_vinfo, true);
1090 return NULL;
1091 }
1092
1093 if (!NITERS_KNOWN_P (number_of_iterations))
1094 {
1095 if (vect_print_dump_info (REPORT_DETAILS))
1096 {
1097 fprintf (vect_dump, "Symbolic number of iterations is ");
1098 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1099 }
1100 }
1101 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1102 {
1103 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1104 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1105 if (inner_loop_vinfo)
1106 destroy_loop_vec_info (inner_loop_vinfo, false);
1107 return NULL;
1108 }
1109
1110 loop_vinfo = new_loop_vec_info (loop);
1111 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1112 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1113
1114 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1115
1116 /* CHECKME: May want to keep it around in the future. */
1117 if (inner_loop_vinfo)
1118 destroy_loop_vec_info (inner_loop_vinfo, false);
1119
1120 gcc_assert (!loop->aux);
1121 loop->aux = loop_vinfo;
1122 return loop_vinfo;
1123 }
1124
1125
1126 /* Get cost by calling cost target builtin. */
1127
1128 static inline int
1129 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1130 {
1131 tree dummy_type = NULL;
1132 int dummy = 0;
1133
1134 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1135 dummy_type, dummy);
1136 }
1137
1138
1139 /* Function vect_analyze_loop_operations.
1140
1141 Scan the loop stmts and make sure they are all vectorizable. */
1142
1143 static bool
1144 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1145 {
1146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1148 int nbbs = loop->num_nodes;
1149 gimple_stmt_iterator si;
1150 unsigned int vectorization_factor = 0;
1151 int i;
1152 gimple phi;
1153 stmt_vec_info stmt_info;
1154 bool need_to_vectorize = false;
1155 int min_profitable_iters;
1156 int min_scalar_loop_bound;
1157 unsigned int th;
1158 bool only_slp_in_loop = true, ok;
1159
1160 if (vect_print_dump_info (REPORT_DETAILS))
1161 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1162
1163 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1164 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1165
1166 for (i = 0; i < nbbs; i++)
1167 {
1168 basic_block bb = bbs[i];
1169
1170 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1171 {
1172 phi = gsi_stmt (si);
1173 ok = true;
1174
1175 stmt_info = vinfo_for_stmt (phi);
1176 if (vect_print_dump_info (REPORT_DETAILS))
1177 {
1178 fprintf (vect_dump, "examining phi: ");
1179 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1180 }
1181
1182 if (! is_loop_header_bb_p (bb))
1183 {
1184 /* inner-loop loop-closed exit phi in outer-loop vectorization
1185 (i.e. a phi in the tail of the outer-loop).
1186 FORNOW: we currently don't support the case that these phis
1187 are not used in the outer-loop (unless it is a double reduction,
1188 i.e., this phi is vect_double_reduction_def), because this case
1189 requires us to actually do something here. */
1190 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1191 || STMT_VINFO_LIVE_P (stmt_info))
1192 && STMT_VINFO_DEF_TYPE (stmt_info)
1193 != vect_double_reduction_def)
1194 {
1195 if (vect_print_dump_info (REPORT_DETAILS))
1196 fprintf (vect_dump,
1197 "Unsupported loop-closed phi in outer-loop.");
1198 return false;
1199 }
1200 continue;
1201 }
1202
1203 gcc_assert (stmt_info);
1204
1205 if (STMT_VINFO_LIVE_P (stmt_info))
1206 {
1207 /* FORNOW: not yet supported. */
1208 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1209 fprintf (vect_dump, "not vectorized: value used after loop.");
1210 return false;
1211 }
1212
1213 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1214 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1215 {
1216 /* A scalar-dependence cycle that we don't support. */
1217 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1218 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1219 return false;
1220 }
1221
1222 if (STMT_VINFO_RELEVANT_P (stmt_info))
1223 {
1224 need_to_vectorize = true;
1225 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1226 ok = vectorizable_induction (phi, NULL, NULL);
1227 }
1228
1229 if (!ok)
1230 {
1231 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1232 {
1233 fprintf (vect_dump,
1234 "not vectorized: relevant phi not supported: ");
1235 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1236 }
1237 return false;
1238 }
1239 }
1240
1241 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1242 {
1243 gimple stmt = gsi_stmt (si);
1244 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1245
1246 gcc_assert (stmt_info);
1247
1248 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1249 return false;
1250
1251 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1252 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1253 && !PURE_SLP_STMT (stmt_info))
1254 /* STMT needs both SLP and loop-based vectorization. */
1255 only_slp_in_loop = false;
1256 }
1257 } /* bbs */
1258
1259 /* All operations in the loop are either irrelevant (deal with loop
1260 control, or dead), or only used outside the loop and can be moved
1261 out of the loop (e.g. invariants, inductions). The loop can be
1262 optimized away by scalar optimizations. We're better off not
1263 touching this loop. */
1264 if (!need_to_vectorize)
1265 {
1266 if (vect_print_dump_info (REPORT_DETAILS))
1267 fprintf (vect_dump,
1268 "All the computation can be taken out of the loop.");
1269 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1270 fprintf (vect_dump,
1271 "not vectorized: redundant loop. no profit to vectorize.");
1272 return false;
1273 }
1274
1275 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1276 vectorization factor of the loop is the unrolling factor required by the
1277 SLP instances. If that unrolling factor is 1, we say that we perform
1278 pure SLP on the loop - cross-iteration parallelism is not exploited. */
1279 if (only_slp_in_loop)
1280 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1281 else
1282 vectorization_factor = least_common_multiple (vectorization_factor,
1283 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1284
1285 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1286
1287 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1288 && vect_print_dump_info (REPORT_DETAILS))
1289 fprintf (vect_dump,
1290 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1291 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1292
1293 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1294 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1295 {
1296 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1297 fprintf (vect_dump, "not vectorized: iteration count too small.");
1298 if (vect_print_dump_info (REPORT_DETAILS))
1299 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1300 "vectorization factor.");
1301 return false;
1302 }
1303
1304 /* Analyze cost. Decide if worth while to vectorize. */
1305
1306 /* Once VF is set, SLP costs should be updated since the number of created
1307 vector stmts depends on VF. */
1308 vect_update_slp_costs_according_to_vf (loop_vinfo);
1309
1310 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1311 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1312
1313 if (min_profitable_iters < 0)
1314 {
1315 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1316 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1317 if (vect_print_dump_info (REPORT_DETAILS))
1318 fprintf (vect_dump, "not vectorized: vector version will never be "
1319 "profitable.");
1320 return false;
1321 }
1322
1323 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1324 * vectorization_factor) - 1);
1325
1326 /* Use the cost model only if it is more conservative than user specified
1327 threshold. */
1328
1329 th = (unsigned) min_scalar_loop_bound;
1330 if (min_profitable_iters
1331 && (!min_scalar_loop_bound
1332 || min_profitable_iters > min_scalar_loop_bound))
1333 th = (unsigned) min_profitable_iters;
1334
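  /* Illustrative example (hypothetical numbers): with
     PARAM_MIN_VECT_LOOP_BOUND == 2 and a vectorization factor of 4,
     MIN_SCALAR_LOOP_BOUND is 7; if the cost model computed
     MIN_PROFITABLE_ITERS == 10, TH becomes 10 and loops known to execute
     at most 10 iterations are rejected below as not profitable.  */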
1335 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1336 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1337 {
1338 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1339 fprintf (vect_dump, "not vectorized: vectorization not "
1340 "profitable.");
1341 if (vect_print_dump_info (REPORT_DETAILS))
1342 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1343 "user specified loop bound parameter or minimum "
1344 "profitable iterations (whichever is more conservative).");
1345 return false;
1346 }
1347
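  /* An epilogue loop is needed when the iteration count is unknown at
     compile time, is not a multiple of VF, or when peeling for alignment
     was chosen.  For example (illustrative), with VF == 4 and 103
     iterations, 100 iterations are executed as 25 vector iterations and
     the remaining 3 in a scalar epilogue loop.  */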
1348 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1349 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1350 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1351 {
1352 if (vect_print_dump_info (REPORT_DETAILS))
1353 fprintf (vect_dump, "epilog loop required.");
1354 if (!vect_can_advance_ivs_p (loop_vinfo))
1355 {
1356 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1357 fprintf (vect_dump,
1358 "not vectorized: can't create epilog loop 1.");
1359 return false;
1360 }
1361 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1362 {
1363 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1364 fprintf (vect_dump,
1365 "not vectorized: can't create epilog loop 2.");
1366 return false;
1367 }
1368 }
1369
1370 return true;
1371 }
1372
1373
1374 /* Function vect_analyze_loop.
1375
1376 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1377 for it. The different analyses will record information in the
1378 loop_vec_info struct. */
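/* Summary of the analysis steps applied below, in order: loop form check
   (vect_analyze_loop_form), data-reference discovery, scalar cycle
   classification, pattern recognition, marking of relevant stmts, data
   dependence analysis, vectorization factor determination, alignment and
   access-pattern analysis, pruning of runtime alias checks, alignment
   enhancement (versioning/peeling), SLP detection, and finally the
   per-operation checks and cost model in vect_analyze_loop_operations.  */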
1379 loop_vec_info
1380 vect_analyze_loop (struct loop *loop)
1381 {
1382 bool ok, dummy;
1383 loop_vec_info loop_vinfo;
1384 int max_vf = MAX_VECTORIZATION_FACTOR;
1385 int min_vf = 2;
1386
1387 if (vect_print_dump_info (REPORT_DETAILS))
1388 fprintf (vect_dump, "===== analyze_loop_nest =====");
1389
1390 if (loop_outer (loop)
1391 && loop_vec_info_for_loop (loop_outer (loop))
1392 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1393 {
1394 if (vect_print_dump_info (REPORT_DETAILS))
1395 fprintf (vect_dump, "outer-loop already vectorized.");
1396 return NULL;
1397 }
1398
1399 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1400
1401 loop_vinfo = vect_analyze_loop_form (loop);
1402 if (!loop_vinfo)
1403 {
1404 if (vect_print_dump_info (REPORT_DETAILS))
1405 fprintf (vect_dump, "bad loop form.");
1406 return NULL;
1407 }
1408
1409 /* Find all data references in the loop (which correspond to vdefs/vuses)
1410 and analyze their evolution in the loop. Also adjust the minimal
1411 vectorization factor according to the loads and stores.
1412
1413 FORNOW: Handle only simple array references whose
1414 alignment can be forced, and aligned pointer-references. */
1415
1416 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1417 if (!ok)
1418 {
1419 if (vect_print_dump_info (REPORT_DETAILS))
1420 fprintf (vect_dump, "bad data references.");
1421 destroy_loop_vec_info (loop_vinfo, true);
1422 return NULL;
1423 }
1424
1425 /* Classify all cross-iteration scalar data-flow cycles.
1426 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1427
1428 vect_analyze_scalar_cycles (loop_vinfo);
1429
1430 vect_pattern_recog (loop_vinfo);
1431
1432 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1433
1434 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1435 if (!ok)
1436 {
1437 if (vect_print_dump_info (REPORT_DETAILS))
1438 fprintf (vect_dump, "unexpected pattern.");
1439 destroy_loop_vec_info (loop_vinfo, true);
1440 return NULL;
1441 }
1442
1443 /* Analyze data dependences between the data-refs in the loop
1444 and adjust the maximum vectorization factor according to
1445 the dependences.
1446 FORNOW: fail at the first data dependence that we encounter. */
1447
1448 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1449 if (!ok
1450 || max_vf < min_vf)
1451 {
1452 if (vect_print_dump_info (REPORT_DETAILS))
1453 fprintf (vect_dump, "bad data dependence.");
1454 destroy_loop_vec_info (loop_vinfo, true);
1455 return NULL;
1456 }
1457
1458 ok = vect_determine_vectorization_factor (loop_vinfo);
1459 if (!ok)
1460 {
1461 if (vect_print_dump_info (REPORT_DETAILS))
1462 fprintf (vect_dump, "can't determine vectorization factor.");
1463 destroy_loop_vec_info (loop_vinfo, true);
1464 return NULL;
1465 }
1466 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1467 {
1468 if (vect_print_dump_info (REPORT_DETAILS))
1469 fprintf (vect_dump, "bad data dependence.");
1470 destroy_loop_vec_info (loop_vinfo, true);
1471 return NULL;
1472 }
1473
1474 /* Analyze the alignment of the data-refs in the loop.
1475 Fail if a data reference is found that cannot be vectorized. */
1476
1477 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1478 if (!ok)
1479 {
1480 if (vect_print_dump_info (REPORT_DETAILS))
1481 fprintf (vect_dump, "bad data alignment.");
1482 destroy_loop_vec_info (loop_vinfo, true);
1483 return NULL;
1484 }
1485
1486 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1487 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1488
1489 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1490 if (!ok)
1491 {
1492 if (vect_print_dump_info (REPORT_DETAILS))
1493 fprintf (vect_dump, "bad data access.");
1494 destroy_loop_vec_info (loop_vinfo, true);
1495 return NULL;
1496 }
1497
1498 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1499 It is important to call pruning after vect_analyze_data_ref_accesses,
1500 since we use grouping information gathered by interleaving analysis. */
1501 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1502 if (!ok)
1503 {
1504 if (vect_print_dump_info (REPORT_DETAILS))
1505 fprintf (vect_dump, "too long list of versioning for alias "
1506 "run-time tests.");
1507 destroy_loop_vec_info (loop_vinfo, true);
1508 return NULL;
1509 }
1510
1511 /* This pass will decide on using loop versioning and/or loop peeling in
1512 order to enhance the alignment of data references in the loop. */
1513
1514 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1515 if (!ok)
1516 {
1517 if (vect_print_dump_info (REPORT_DETAILS))
1518 fprintf (vect_dump, "bad data alignment.");
1519 destroy_loop_vec_info (loop_vinfo, true);
1520 return NULL;
1521 }
1522
1523 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1524 ok = vect_analyze_slp (loop_vinfo, NULL);
1525 if (ok)
1526 {
1527 /* Decide which possible SLP instances to SLP. */
1528 vect_make_slp_decision (loop_vinfo);
1529
1530 /* Find stmts that need to be both vectorized and SLPed. */
1531 vect_detect_hybrid_slp (loop_vinfo);
1532 }
1533
1534 /* Scan all the operations in the loop and make sure they are
1535 vectorizable. */
1536
1537 ok = vect_analyze_loop_operations (loop_vinfo);
1538 if (!ok)
1539 {
1540 if (vect_print_dump_info (REPORT_DETAILS))
1541 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1542 destroy_loop_vec_info (loop_vinfo, true);
1543 return NULL;
1544 }
1545
1546 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1547
1548 return loop_vinfo;
1549 }
1550
1551
1552 /* Function reduction_code_for_scalar_code
1553
1554 Input:
1555 CODE - tree_code of a reduction operation.
1556
1557 Output:
1558 REDUC_CODE - the corresponding tree-code to be used to reduce the
1559 vector of partial results into a single scalar result (which
1560 will also reside in a vector) or ERROR_MARK if the operation is
1561 a supported reduction operation, but does not have such tree-code.
1562
1563 Return FALSE if CODE currently cannot be vectorized as reduction. */
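/* For example (illustrative): for a "sum += a[i]" reduction the loop
   operation is PLUS_EXPR and the vector of partial sums produced by the
   vectorized loop is collapsed into a single scalar with REDUC_PLUS_EXPR
   after the loop.  Codes that map to ERROR_MARK (e.g. MULT_EXPR) are still
   vectorizable as reductions, but the final reduction of the partial
   results is generated without a dedicated reduction tree code.  */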
1564
1565 static bool
1566 reduction_code_for_scalar_code (enum tree_code code,
1567 enum tree_code *reduc_code)
1568 {
1569 switch (code)
1570 {
1571 case MAX_EXPR:
1572 *reduc_code = REDUC_MAX_EXPR;
1573 return true;
1574
1575 case MIN_EXPR:
1576 *reduc_code = REDUC_MIN_EXPR;
1577 return true;
1578
1579 case PLUS_EXPR:
1580 *reduc_code = REDUC_PLUS_EXPR;
1581 return true;
1582
1583 case MULT_EXPR:
1584 case MINUS_EXPR:
1585 case BIT_IOR_EXPR:
1586 case BIT_XOR_EXPR:
1587 case BIT_AND_EXPR:
1588 *reduc_code = ERROR_MARK;
1589 return true;
1590
1591 default:
1592 return false;
1593 }
1594 }
1595
1596
1597 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1598 STMT is printed with a message MSG. */
1599
1600 static void
1601 report_vect_op (gimple stmt, const char *msg)
1602 {
1603 fprintf (vect_dump, "%s", msg);
1604 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1605 }
1606
1607
1608 /* Function vect_is_simple_reduction_1
1609
1610 (1) Detect a cross-iteration def-use cycle that represents a simple
1611 reduction computation. We look for the following pattern:
1612
1613 loop_header:
1614 a1 = phi < a0, a2 >
1615 a3 = ...
1616 a2 = operation (a3, a1)
1617
1618 such that:
1619 1. operation is commutative and associative and it is safe to
1620 change the order of the computation (if CHECK_REDUCTION is true)
1621 2. no uses for a2 in the loop (a2 is used out of the loop)
1622 3. no uses of a1 in the loop besides the reduction operation.
1623
1624 Condition 1 is tested here.
1625 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1626
1627 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1628 nested cycles, if CHECK_REDUCTION is false.
1629
1630 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1631 reductions:
1632
1633 a1 = phi < a0, a2 >
1634 inner loop (def of a3)
1635 a2 = phi < a3 >
1636
1637 If MODIFY is true it tries also to rework the code in-place to enable
1638 detection of more reduction patterns. For the time being we rewrite
1639 "res -= RHS" into "res += -RHS" when it seems worthwhile.
1640 */
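/* For example (illustrative), with MODIFY true:

     for (i=0; i<N; i++)
       res -= a[i];

   is handled as "res += -a[i]", so the reduction operation becomes the
   commutative and associative PLUS_EXPR (see the MINUS_EXPR handling
   below).  */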
1641
1642 static gimple
1643 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1644 bool check_reduction, bool *double_reduc,
1645 bool modify)
1646 {
1647 struct loop *loop = (gimple_bb (phi))->loop_father;
1648 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1649 edge latch_e = loop_latch_edge (loop);
1650 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1651 gimple def_stmt, def1 = NULL, def2 = NULL;
1652 enum tree_code orig_code, code;
1653 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1654 tree type;
1655 int nloop_uses;
1656 tree name;
1657 imm_use_iterator imm_iter;
1658 use_operand_p use_p;
1659 bool phi_def;
1660
1661 *double_reduc = false;
1662
1663 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1664 otherwise, we assume outer loop vectorization. */
1665 gcc_assert ((check_reduction && loop == vect_loop)
1666 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1667
1668 name = PHI_RESULT (phi);
1669 nloop_uses = 0;
1670 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1671 {
1672 gimple use_stmt = USE_STMT (use_p);
1673 if (is_gimple_debug (use_stmt))
1674 continue;
1675 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1676 && vinfo_for_stmt (use_stmt)
1677 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1678 nloop_uses++;
1679 if (nloop_uses > 1)
1680 {
1681 if (vect_print_dump_info (REPORT_DETAILS))
1682 fprintf (vect_dump, "reduction used in loop.");
1683 return NULL;
1684 }
1685 }
1686
1687 if (TREE_CODE (loop_arg) != SSA_NAME)
1688 {
1689 if (vect_print_dump_info (REPORT_DETAILS))
1690 {
1691 fprintf (vect_dump, "reduction: not ssa_name: ");
1692 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1693 }
1694 return NULL;
1695 }
1696
1697 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1698 if (!def_stmt)
1699 {
1700 if (vect_print_dump_info (REPORT_DETAILS))
1701 fprintf (vect_dump, "reduction: no def_stmt.");
1702 return NULL;
1703 }
1704
1705 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1706 {
1707 if (vect_print_dump_info (REPORT_DETAILS))
1708 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1709 return NULL;
1710 }
1711
1712 if (is_gimple_assign (def_stmt))
1713 {
1714 name = gimple_assign_lhs (def_stmt);
1715 phi_def = false;
1716 }
1717 else
1718 {
1719 name = PHI_RESULT (def_stmt);
1720 phi_def = true;
1721 }
1722
1723 nloop_uses = 0;
1724 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1725 {
1726 gimple use_stmt = USE_STMT (use_p);
1727 if (is_gimple_debug (use_stmt))
1728 continue;
1729 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1730 && vinfo_for_stmt (use_stmt)
1731 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1732 nloop_uses++;
1733 if (nloop_uses > 1)
1734 {
1735 if (vect_print_dump_info (REPORT_DETAILS))
1736 fprintf (vect_dump, "reduction used in loop.");
1737 return NULL;
1738 }
1739 }
1740
1741 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1742 defined in the inner loop. */
1743 if (phi_def)
1744 {
1745 op1 = PHI_ARG_DEF (def_stmt, 0);
1746
1747 if (gimple_phi_num_args (def_stmt) != 1
1748 || TREE_CODE (op1) != SSA_NAME)
1749 {
1750 if (vect_print_dump_info (REPORT_DETAILS))
1751 fprintf (vect_dump, "unsupported phi node definition.");
1752
1753 return NULL;
1754 }
1755
1756 def1 = SSA_NAME_DEF_STMT (op1);
1757 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1758 && loop->inner
1759 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1760 && is_gimple_assign (def1))
1761 {
1762 if (vect_print_dump_info (REPORT_DETAILS))
1763 report_vect_op (def_stmt, "detected double reduction: ");
1764
1765 *double_reduc = true;
1766 return def_stmt;
1767 }
1768
1769 return NULL;
1770 }
1771
1772 code = orig_code = gimple_assign_rhs_code (def_stmt);
1773
1774 /* We can handle "res -= x[i]", which is non-associative, by
1775 simply rewriting it into "res += -x[i]".  Avoid changing the
1776 gimple instruction for the first simple tests and only do this
1777 if we're allowed to change the code at all. */
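  /* A minimal sketch of what this rewrite amounts to (illustrative only):
     the scalar cycle
         res = res - x[i];
     is handled as if it were written
         res = res + (-x[i]);
     which is commutative/associative and can then be treated like an
     ordinary sum reduction.  */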
1778 if (code == MINUS_EXPR && modify)
1779 code = PLUS_EXPR;
1780
1781 if (check_reduction
1782 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1783 {
1784 if (vect_print_dump_info (REPORT_DETAILS))
1785 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1786 return NULL;
1787 }
1788
1789 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1790 {
1791 if (code != COND_EXPR)
1792 {
1793 if (vect_print_dump_info (REPORT_DETAILS))
1794 report_vect_op (def_stmt, "reduction: not binary operation: ");
1795
1796 return NULL;
1797 }
1798
1799 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1800 if (COMPARISON_CLASS_P (op3))
1801 {
1802 op4 = TREE_OPERAND (op3, 1);
1803 op3 = TREE_OPERAND (op3, 0);
1804 }
1805
1806 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1807 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1808
1809 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1810 {
1811 if (vect_print_dump_info (REPORT_DETAILS))
1812 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1813
1814 return NULL;
1815 }
1816 }
1817 else
1818 {
1819 op1 = gimple_assign_rhs1 (def_stmt);
1820 op2 = gimple_assign_rhs2 (def_stmt);
1821
1822 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1823 {
1824 if (vect_print_dump_info (REPORT_DETAILS))
1825 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1826
1827 return NULL;
1828 }
1829 }
1830
1831 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1832 if ((TREE_CODE (op1) == SSA_NAME
1833 && !types_compatible_p (type, TREE_TYPE (op1)))
1834 || (TREE_CODE (op2) == SSA_NAME
1835 && !types_compatible_p (type, TREE_TYPE (op2)))
1836 || (op3 && TREE_CODE (op3) == SSA_NAME
1837 && !types_compatible_p (type, TREE_TYPE (op3)))
1838 || (op4 && TREE_CODE (op4) == SSA_NAME
1839 && !types_compatible_p (type, TREE_TYPE (op4))))
1840 {
1841 if (vect_print_dump_info (REPORT_DETAILS))
1842 {
1843 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1844 print_generic_expr (vect_dump, type, TDF_SLIM);
1845 fprintf (vect_dump, ", operands types: ");
1846 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1847 fprintf (vect_dump, ",");
1848 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1849 if (op3)
1850 {
1851 fprintf (vect_dump, ",");
1852 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1853 }
1854
1855 if (op4)
1856 {
1857 fprintf (vect_dump, ",");
1858 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1859 }
1860 }
1861
1862 return NULL;
1863 }
1864
1865 /* Check that it's ok to change the order of the computation.
1866 Generally, when vectorizing a reduction we change the order of the
1867 computation. This may change the behavior of the program in some
1868 cases, so we need to check that this is ok. One exception is when
1869 vectorizing an outer-loop: the inner-loop is executed sequentially,
1870 and therefore vectorizing reductions in the inner-loop during
1871 outer-loop vectorization is safe. */
1872
1873 /* CHECKME: check for !flag_finite_math_only too? */
1874 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1875 && check_reduction)
1876 {
1877 /* Changing the order of operations changes the semantics. */
1878 if (vect_print_dump_info (REPORT_DETAILS))
1879 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1880 return NULL;
1881 }
1882 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1883 && check_reduction)
1884 {
1885 /* Changing the order of operations changes the semantics. */
1886 if (vect_print_dump_info (REPORT_DETAILS))
1887 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1888 return NULL;
1889 }
1890 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1891 {
1892 /* Changing the order of operations changes the semantics. */
1893 if (vect_print_dump_info (REPORT_DETAILS))
1894 report_vect_op (def_stmt,
1895 "reduction: unsafe fixed-point math optimization: ");
1896 return NULL;
1897 }
1898
1899 /* If we detected "res -= x[i]" earlier, rewrite it into
1900 "res += -x[i]" now.  If this turns out to be useless, reassoc
1901 will clean it up again. */
1902 if (orig_code == MINUS_EXPR)
1903 {
1904 tree rhs = gimple_assign_rhs2 (def_stmt);
1905 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1906 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1907 rhs, NULL);
1908 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1909 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1910 loop_info, NULL));
1911 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1912 gimple_assign_set_rhs2 (def_stmt, negrhs);
1913 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1914 update_stmt (def_stmt);
1915 }
1916
1917 /* Reduction is safe. We're dealing with one of the following:
1918 1) integer arithmetic and no trapv
1919 2) floating point arithmetic, and special flags permit this optimization
1920 3) nested cycle (i.e., outer loop vectorization). */
1921 if (TREE_CODE (op1) == SSA_NAME)
1922 def1 = SSA_NAME_DEF_STMT (op1);
1923
1924 if (TREE_CODE (op2) == SSA_NAME)
1925 def2 = SSA_NAME_DEF_STMT (op2);
1926
1927 if (code != COND_EXPR
1928 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1929 {
1930 if (vect_print_dump_info (REPORT_DETAILS))
1931 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1932 return NULL;
1933 }
1934
1935 /* Check that one def is the reduction def, defined by PHI,
1936 the other def is either defined in the loop ("vect_internal_def"),
1937 or it's an induction (defined by a loop-header phi-node). */
1938
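  /* For illustration only, a typical cycle that passes this test is

         sum_1 = PHI <sum_init(preheader), sum_2(latch)>   <-- PHI
         ...
         sum_2 = sum_1 + a[i];                             <-- DEF_STMT

     one operand of DEF_STMT is defined by PHI (the reduction def), the
     other is computed inside the loop.  */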
1939 if (def2 && def2 == phi
1940 && (code == COND_EXPR
1941 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1942 && (is_gimple_assign (def1)
1943 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1944 == vect_induction_def
1945 || (gimple_code (def1) == GIMPLE_PHI
1946 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1947 == vect_internal_def
1948 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1949 {
1950 if (vect_print_dump_info (REPORT_DETAILS))
1951 report_vect_op (def_stmt, "detected reduction: ");
1952 return def_stmt;
1953 }
1954 else if (def1 && def1 == phi
1955 && (code == COND_EXPR
1956 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1957 && (is_gimple_assign (def2)
1958 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1959 == vect_induction_def
1960 || (gimple_code (def2) == GIMPLE_PHI
1961 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1962 == vect_internal_def
1963 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1964 {
1965 if (check_reduction)
1966 {
1967 /* Swap operands (just for simplicity - so that the rest of the code
1968 can assume that the reduction variable is always the last (second)
1969 argument). */
1970 if (vect_print_dump_info (REPORT_DETAILS))
1971 report_vect_op (def_stmt,
1972 "detected reduction: need to swap operands: ");
1973
1974 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1975 gimple_assign_rhs2_ptr (def_stmt));
1976 }
1977 else
1978 {
1979 if (vect_print_dump_info (REPORT_DETAILS))
1980 report_vect_op (def_stmt, "detected reduction: ");
1981 }
1982
1983 return def_stmt;
1984 }
1985 else
1986 {
1987 if (vect_print_dump_info (REPORT_DETAILS))
1988 report_vect_op (def_stmt, "reduction: unknown pattern: ");
1989
1990 return NULL;
1991 }
1992 }
1993
1994 /* Wrapper around vect_is_simple_reduction_1, which will not modify code
1995 in-place.  Arguments as in vect_is_simple_reduction_1. */
1996
1997 static gimple
1998 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1999 bool check_reduction, bool *double_reduc)
2000 {
2001 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2002 double_reduc, false);
2003 }
2004
2005 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2006 in-place if it enables detection of more reductions.  Arguments as in
2007 vect_is_simple_reduction_1. */
2008
2009 gimple
2010 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2011 bool check_reduction, bool *double_reduc)
2012 {
2013 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2014 double_reduc, true);
2015 }
2016
2017 /* Calculate the cost of one scalar iteration of the loop. */
2018 int
2019 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2020 {
2021 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2022 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2023 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2024 int innerloop_iters, i, stmt_cost;
2025
2026 /* Count statements in scalar loop. Using this as scalar cost for a single
2027 iteration for now.
2028
2029 TODO: Add outer loop support.
2030
2031 TODO: Consider assigning different costs to different scalar
2032 statements. */
2033
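  /* For illustration: a body with one load, one store and one arithmetic
     stmt contributes scalar_load + scalar_store + scalar_stmt per
     iteration; stmts inside an inner loop are additionally weighted by
     the (FORNOW hard-coded) inner-loop trip-count estimate below.  */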
2034 /* FORNOW. */
2035 innerloop_iters = 1;
2036 if (loop->inner)
2037 innerloop_iters = 50; /* FIXME */
2038
2039 for (i = 0; i < nbbs; i++)
2040 {
2041 gimple_stmt_iterator si;
2042 basic_block bb = bbs[i];
2043
2044 if (bb->loop_father == loop->inner)
2045 factor = innerloop_iters;
2046 else
2047 factor = 1;
2048
2049 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2050 {
2051 gimple stmt = gsi_stmt (si);
2052 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2053
2054 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2055 continue;
2056
2057 /* Skip stmts that are not vectorized inside the loop. */
2058 if (stmt_info
2059 && !STMT_VINFO_RELEVANT_P (stmt_info)
2060 && (!STMT_VINFO_LIVE_P (stmt_info)
2061 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2062 continue;
2063
2064 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2065 {
2066 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2067 stmt_cost = vect_get_cost (scalar_load);
2068 else
2069 stmt_cost = vect_get_cost (scalar_store);
2070 }
2071 else
2072 stmt_cost = vect_get_cost (scalar_stmt);
2073
2074 scalar_single_iter_cost += stmt_cost * factor;
2075 }
2076 }
2077 return scalar_single_iter_cost;
2078 }
2079
2080 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2081 int
2082 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2083 int *peel_iters_epilogue,
2084 int scalar_single_iter_cost)
2085 {
2086 int peel_guard_costs = 0;
2087 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2088
2089 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2090 {
2091 *peel_iters_epilogue = vf/2;
2092 if (vect_print_dump_info (REPORT_COST))
2093 fprintf (vect_dump, "cost model: "
2094 "epilogue peel iters set to vf/2 because "
2095 "loop iterations are unknown.");
2096
2097 /* If peeled iterations are known but the number of scalar loop
2098 iterations is unknown, count a taken branch per peeled loop. */
2099 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2100 }
2101 else
2102 {
2103 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2104 peel_iters_prologue = niters < peel_iters_prologue ?
2105 niters : peel_iters_prologue;
2106 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2107 }
2108
2109 return (peel_iters_prologue * scalar_single_iter_cost)
2110 + (*peel_iters_epilogue * scalar_single_iter_cost)
2111 + peel_guard_costs;
2112 }
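
/* Illustrative example of the function above (not used by the code):
   with a known NITERS = 10, VF = 4, PEEL_ITERS_PROLOGUE = 1 and
   SCALAR_SINGLE_ITER_COST = 3, it computes
       *peel_iters_epilogue = (10 - 1) % 4 = 1
   and returns 1*3 + 1*3 + 0 = 6, i.e. the scalar cost of the peeled
   prologue and epilogue iterations, with no guard cost since the
   iteration count is known at compile time.  */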
2113
2114 /* Function vect_estimate_min_profitable_iters
2115
2116 Return the number of iterations required for the vector version of the
2117 loop to be profitable relative to the cost of the scalar version of the
2118 loop.
2119
2120 TODO: Take profile info into account before making vectorization
2121 decisions, if available. */
2122
2123 int
2124 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2125 {
2126 int i;
2127 int min_profitable_iters;
2128 int peel_iters_prologue;
2129 int peel_iters_epilogue;
2130 int vec_inside_cost = 0;
2131 int vec_outside_cost = 0;
2132 int scalar_single_iter_cost = 0;
2133 int scalar_outside_cost = 0;
2134 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2135 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2136 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2137 int nbbs = loop->num_nodes;
2138 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2139 int peel_guard_costs = 0;
2140 int innerloop_iters = 0, factor;
2141 VEC (slp_instance, heap) *slp_instances;
2142 slp_instance instance;
2143
2144 /* Cost model disabled. */
2145 if (!flag_vect_cost_model)
2146 {
2147 if (vect_print_dump_info (REPORT_COST))
2148 fprintf (vect_dump, "cost model disabled.");
2149 return 0;
2150 }
2151
2152 /* Requires loop versioning tests to handle misalignment. */
2153 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2154 {
2155 /* FIXME: Make cost depend on complexity of individual check. */
2156 vec_outside_cost +=
2157 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2158 if (vect_print_dump_info (REPORT_COST))
2159 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2160 "versioning to treat misalignment.\n");
2161 }
2162
2163 /* Requires loop versioning with alias checks. */
2164 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2165 {
2166 /* FIXME: Make cost depend on complexity of individual check. */
2167 vec_outside_cost +=
2168 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2169 if (vect_print_dump_info (REPORT_COST))
2170 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2171 "versioning for aliasing.\n");
2172 }
2173
2174 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2175 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2176 vec_outside_cost += vect_get_cost (cond_branch_taken);
2177
2178 /* Accumulate the vector costs recorded per stmt during analysis,
2179 weighting stmts in the inner loop by an estimated trip count.
2180
2181 TODO: Add outer loop support.
2182
2183 TODO: Consider assigning different costs to different scalar
2184 statements. */
2185
2186 /* FORNOW. */
2187 if (loop->inner)
2188 innerloop_iters = 50; /* FIXME */
2189
2190 for (i = 0; i < nbbs; i++)
2191 {
2192 gimple_stmt_iterator si;
2193 basic_block bb = bbs[i];
2194
2195 if (bb->loop_father == loop->inner)
2196 factor = innerloop_iters;
2197 else
2198 factor = 1;
2199
2200 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2201 {
2202 gimple stmt = gsi_stmt (si);
2203 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2204 /* Skip stmts that are not vectorized inside the loop. */
2205 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2206 && (!STMT_VINFO_LIVE_P (stmt_info)
2207 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2208 continue;
2209 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2210 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2211 some of the "outside" costs are generated inside the outer-loop. */
2212 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2213 }
2214 }
2215
2216 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2217
2218 /* Add additional cost for the peeled instructions in prologue and epilogue
2219 loop.
2220
2221 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2222 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2223
2224 TODO: Build an expression that represents peel_iters for prologue and
2225 epilogue to be used in a run-time test. */
2226
2227 if (npeel < 0)
2228 {
2229 peel_iters_prologue = vf/2;
2230 if (vect_print_dump_info (REPORT_COST))
2231 fprintf (vect_dump, "cost model: "
2232 "prologue peel iters set to vf/2.");
2233
2234 /* If peeling for alignment is unknown, loop bound of main loop becomes
2235 unknown. */
2236 peel_iters_epilogue = vf/2;
2237 if (vect_print_dump_info (REPORT_COST))
2238 fprintf (vect_dump, "cost model: "
2239 "epilogue peel iters set to vf/2 because "
2240 "peeling for alignment is unknown.");
2241
2242 /* If peeled iterations are unknown, count a taken branch and a not taken
2243 branch per peeled loop. Even if scalar loop iterations are known,
2244 vector iterations are not known since peeled prologue iterations are
2245 not known. Hence guards remain the same. */
2246 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2247 + vect_get_cost (cond_branch_not_taken));
2248 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2249 + (peel_iters_epilogue * scalar_single_iter_cost)
2250 + peel_guard_costs;
2251 }
2252 else
2253 {
2254 peel_iters_prologue = npeel;
2255 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2256 peel_iters_prologue, &peel_iters_epilogue,
2257 scalar_single_iter_cost);
2258 }
2259
2260 /* FORNOW: The scalar outside cost is incremented in one of the
2261 following ways:
2262
2263 1. The vectorizer checks for alignment and aliasing and generates
2264 a condition that allows dynamic vectorization. A cost model
2265 check is ANDED with the versioning condition. Hence scalar code
2266 path now has the added cost of the versioning check.
2267
2268 if (cost > th & versioning_check)
2269 jmp to vector code
2270
2271 Hence run-time scalar is incremented by not-taken branch cost.
2272
2273 2. The vectorizer then checks if a prologue is required. If the
2274 cost model check was not done before during versioning, it has to
2275 be done before the prologue check.
2276
2277 if (cost <= th)
2278 prologue = scalar_iters
2279 if (prologue == 0)
2280 jmp to vector code
2281 else
2282 execute prologue
2283 if (prologue == num_iters)
2284 go to exit
2285
2286 Hence the run-time scalar cost is incremented by a taken branch,
2287 plus a not-taken branch, plus a taken branch cost.
2288
2289 3. The vectorizer then checks if an epilogue is required. If the
2290 cost model check was not done before during prologue check, it
2291 has to be done with the epilogue check.
2292
2293 if (prologue == 0)
2294 jmp to vector code
2295 else
2296 execute prologue
2297 if (prologue == num_iters)
2298 go to exit
2299 vector code:
2300 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2301 jmp to epilogue
2302
2303 Hence the run-time scalar cost should be incremented by 2 taken
2304 branches.
2305
2306 TODO: The back end may reorder the BBs differently and reverse
2307 conditions/branch directions. Change the estimates below to
2308 something more reasonable. */
2309
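  /* For instance (illustrative only): when the loop is versioned for
     alignment or aliasing, the scalar path below is charged one not-taken
     branch for the combined versioning/cost-model check; when the check is
     instead emitted with the prologue (peeling amount unknown), it is
     charged two taken branches plus one not-taken branch; when it is
     emitted with the epilogue, two taken branches.  If the iteration count
     is known and no versioning is needed, nothing is added.  */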
2310 /* If the number of iterations is known and we do not do versioning, we can
2311 decide whether to vectorize at compile time. Hence the scalar version
2312 does not carry cost model guard costs.
2313 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2314 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2315 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2316 {
2317 /* Cost model check occurs at versioning. */
2318 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2319 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2320 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2321 else
2322 {
2323 /* Cost model check occurs at prologue generation. */
2324 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2325 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2326 + vect_get_cost (cond_branch_not_taken);
2327 /* Cost model check occurs at epilogue generation. */
2328 else
2329 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2330 }
2331 }
2332
2333 /* Add SLP costs. */
2334 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2335 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2336 {
2337 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2338 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2339 }
2340
2341 /* Calculate number of iterations required to make the vector version
2342 profitable, relative to the loop bodies only. The following condition
2343 must hold true:
2344 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2345 where
2346 SIC = scalar iteration cost, VIC = vector iteration cost,
2347 VOC = vector outside cost, VF = vectorization factor,
2348 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2349 SOC = scalar outside cost for run time cost model check. */
2350
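  /* Worked example (illustrative numbers only): with SIC = 1, VIC = 2,
     VF = 4, VOC = 6, SOC = 0 and PL_ITERS = EP_ITERS = 2, the code below
     computes
         min_profitable_iters = ((6 - 0)*4 - 2*2 - 2*2) / (1*4 - 2) = 8,
     and since 1*4*8 = 32 <= 2*8 + (6 - 0)*4 = 40 it is bumped to 9,
     i.e. at least 9 iterations are needed for the vector loop to win.  */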
2351 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2352 {
2353 if (vec_outside_cost <= 0)
2354 min_profitable_iters = 1;
2355 else
2356 {
2357 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2358 - vec_inside_cost * peel_iters_prologue
2359 - vec_inside_cost * peel_iters_epilogue)
2360 / ((scalar_single_iter_cost * vf)
2361 - vec_inside_cost);
2362
2363 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2364 <= ((vec_inside_cost * min_profitable_iters)
2365 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2366 min_profitable_iters++;
2367 }
2368 }
2369 /* vector version will never be profitable. */
2370 else
2371 {
2372 if (vect_print_dump_info (REPORT_COST))
2373 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2374 "divided by the scalar iteration cost = %d "
2375 "is greater than or equal to the vectorization factor = %d.",
2376 vec_inside_cost, scalar_single_iter_cost, vf);
2377 return -1;
2378 }
2379
2380 if (vect_print_dump_info (REPORT_COST))
2381 {
2382 fprintf (vect_dump, "Cost model analysis: \n");
2383 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2384 vec_inside_cost);
2385 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2386 vec_outside_cost);
2387 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2388 scalar_single_iter_cost);
2389 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2390 fprintf (vect_dump, " prologue iterations: %d\n",
2391 peel_iters_prologue);
2392 fprintf (vect_dump, " epilogue iterations: %d\n",
2393 peel_iters_epilogue);
2394 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2395 min_profitable_iters);
2396 }
2397
2398 min_profitable_iters =
2399 min_profitable_iters < vf ? vf : min_profitable_iters;
2400
2401 /* Because the condition we create is:
2402 if (niters <= min_profitable_iters)
2403 then skip the vectorized loop. */
2404 min_profitable_iters--;
2405
2406 if (vect_print_dump_info (REPORT_COST))
2407 fprintf (vect_dump, " Profitability threshold = %d\n",
2408 min_profitable_iters);
2409
2410 return min_profitable_iters;
2411 }
2412
2413
2414 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2415 functions. Design better to avoid maintenance issues. */
2416
2417 /* Function vect_model_reduction_cost.
2418
2419 Models cost for a reduction operation, including the vector ops
2420 generated within the strip-mine loop, the initial definition before
2421 the loop, and the epilogue code that must be generated. */
2422
2423 static bool
2424 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2425 int ncopies)
2426 {
2427 int outer_cost = 0;
2428 enum tree_code code;
2429 optab optab;
2430 tree vectype;
2431 gimple stmt, orig_stmt;
2432 tree reduction_op;
2433 enum machine_mode mode;
2434 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2435 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2436
2437
2438 /* Cost of reduction op inside loop. */
2439 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2440 += ncopies * vect_get_cost (vector_stmt);
2441
2442 stmt = STMT_VINFO_STMT (stmt_info);
2443
2444 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2445 {
2446 case GIMPLE_SINGLE_RHS:
2447 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2448 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2449 break;
2450 case GIMPLE_UNARY_RHS:
2451 reduction_op = gimple_assign_rhs1 (stmt);
2452 break;
2453 case GIMPLE_BINARY_RHS:
2454 reduction_op = gimple_assign_rhs2 (stmt);
2455 break;
2456 default:
2457 gcc_unreachable ();
2458 }
2459
2460 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2461 if (!vectype)
2462 {
2463 if (vect_print_dump_info (REPORT_COST))
2464 {
2465 fprintf (vect_dump, "unsupported data-type ");
2466 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2467 }
2468 return false;
2469 }
2470
2471 mode = TYPE_MODE (vectype);
2472 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2473
2474 if (!orig_stmt)
2475 orig_stmt = STMT_VINFO_STMT (stmt_info);
2476
2477 code = gimple_assign_rhs_code (orig_stmt);
2478
2479 /* Add in cost for initial definition. */
2480 outer_cost += vect_get_cost (scalar_to_vec);
2481
2482 /* Determine cost of epilogue code.
2483
2484 We have a reduction operator that will reduce the vector in one statement.
2485 Also requires scalar extract. */
2486
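  /* Rough illustration (actual numbers depend on the target cost hooks):
     for a 128-bit vector of 16-bit elements (nelements = 8), assuming the
     needed optabs exist:
       - direct reduc_code: 1 vector_stmt + 1 vec_to_scalar;
       - whole-vector shifts: 2 * log2 (8) = 6 vector_stmt + 1 vec_to_scalar;
       - otherwise: 8 extracts + 7 reduction ops = 15 vector_stmt costs.  */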
2487 if (!nested_in_vect_loop_p (loop, orig_stmt))
2488 {
2489 if (reduc_code != ERROR_MARK)
2490 outer_cost += vect_get_cost (vector_stmt)
2491 + vect_get_cost (vec_to_scalar);
2492 else
2493 {
2494 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2495 tree bitsize =
2496 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2497 int element_bitsize = tree_low_cst (bitsize, 1);
2498 int nelements = vec_size_in_bits / element_bitsize;
2499
2500 optab = optab_for_tree_code (code, vectype, optab_default);
2501
2502 /* We have a whole vector shift available. */
2503 if (VECTOR_MODE_P (mode)
2504 && optab_handler (optab, mode) != CODE_FOR_nothing
2505 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2506 /* Final reduction via vector shifts and the reduction operator. Also
2507 requires scalar extract. */
2508 outer_cost += ((exact_log2 (nelements) * 2)
2509 * vect_get_cost (vector_stmt)
2510 + vect_get_cost (vec_to_scalar));
2511 else
2512 /* Use extracts and reduction op for final reduction. For N elements,
2513 we have N extracts and N-1 reduction ops. */
2514 outer_cost += ((nelements + nelements - 1)
2515 * vect_get_cost (vector_stmt));
2516 }
2517 }
2518
2519 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2520
2521 if (vect_print_dump_info (REPORT_COST))
2522 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2523 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2524 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2525
2526 return true;
2527 }
2528
2529
2530 /* Function vect_model_induction_cost.
2531
2532 Models cost for induction operations. */
2533
2534 static void
2535 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2536 {
2537 /* loop cost for vec_loop. */
2538 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2539 = ncopies * vect_get_cost (vector_stmt);
2540 /* prologue cost for vec_init and vec_step. */
2541 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2542 = 2 * vect_get_cost (scalar_to_vec);
2543
2544 if (vect_print_dump_info (REPORT_COST))
2545 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2546 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2547 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2548 }
2549
2550
2551 /* Function get_initial_def_for_induction
2552
2553 Input:
2554 STMT - a stmt that performs an induction operation in the loop.
2555 IV_PHI - the initial value of the induction variable
2556
2557 Output:
2558 Return a vector variable, initialized with the first VF values of
2559 the induction variable. E.g., for an iv with IV_PHI='X' and
2560 evolution S, for a vector of 4 units, we want to return:
2561 [X, X + S, X + 2*S, X + 3*S]. */
2562
2563 static tree
2564 get_initial_def_for_induction (gimple iv_phi)
2565 {
2566 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2567 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2568 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2569 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2570 tree vectype;
2571 int nunits;
2572 edge pe = loop_preheader_edge (loop);
2573 struct loop *iv_loop;
2574 basic_block new_bb;
2575 tree vec, vec_init, vec_step, t;
2576 tree access_fn;
2577 tree new_var;
2578 tree new_name;
2579 gimple init_stmt, induction_phi, new_stmt;
2580 tree induc_def, vec_def, vec_dest;
2581 tree init_expr, step_expr;
2582 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2583 int i;
2584 bool ok;
2585 int ncopies;
2586 tree expr;
2587 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2588 bool nested_in_vect_loop = false;
2589 gimple_seq stmts = NULL;
2590 imm_use_iterator imm_iter;
2591 use_operand_p use_p;
2592 gimple exit_phi;
2593 edge latch_e;
2594 tree loop_arg;
2595 gimple_stmt_iterator si;
2596 basic_block bb = gimple_bb (iv_phi);
2597 tree stepvectype;
2598
2599 vectype = get_vectype_for_scalar_type (scalar_type);
2600 gcc_assert (vectype);
2601 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2602 ncopies = vf / nunits;
2603
2604 gcc_assert (phi_info);
2605 gcc_assert (ncopies >= 1);
2606
2607 /* Find the first insertion point in the BB. */
2608 si = gsi_after_labels (bb);
2609
2610 if (INTEGRAL_TYPE_P (scalar_type))
2611 step_expr = build_int_cst (scalar_type, 0);
2612 else if (POINTER_TYPE_P (scalar_type))
2613 step_expr = size_zero_node;
2614 else
2615 step_expr = build_real (scalar_type, dconst0);
2616
2617 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2618 if (nested_in_vect_loop_p (loop, iv_phi))
2619 {
2620 nested_in_vect_loop = true;
2621 iv_loop = loop->inner;
2622 }
2623 else
2624 iv_loop = loop;
2625 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2626
2627 latch_e = loop_latch_edge (iv_loop);
2628 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2629
2630 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2631 gcc_assert (access_fn);
2632 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2633 &init_expr, &step_expr);
2634 gcc_assert (ok);
2635 pe = loop_preheader_edge (iv_loop);
2636
2637 /* Create the vector that holds the initial_value of the induction. */
2638 if (nested_in_vect_loop)
2639 {
2640 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2641 been created during vectorization of previous stmts. We obtain it
2642 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2643 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2644 loop_preheader_edge (iv_loop));
2645 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2646 }
2647 else
2648 {
2649 /* iv_loop is the loop to be vectorized. Create:
2650 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2651 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2652 add_referenced_var (new_var);
2653
2654 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2655 if (stmts)
2656 {
2657 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2658 gcc_assert (!new_bb);
2659 }
2660
2661 t = NULL_TREE;
2662 t = tree_cons (NULL_TREE, init_expr, t);
2663 for (i = 1; i < nunits; i++)
2664 {
2665 /* Create: new_name_i = new_name + step_expr */
2666 enum tree_code code = POINTER_TYPE_P (scalar_type)
2667 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2668 init_stmt = gimple_build_assign_with_ops (code, new_var,
2669 new_name, step_expr);
2670 new_name = make_ssa_name (new_var, init_stmt);
2671 gimple_assign_set_lhs (init_stmt, new_name);
2672
2673 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2674 gcc_assert (!new_bb);
2675
2676 if (vect_print_dump_info (REPORT_DETAILS))
2677 {
2678 fprintf (vect_dump, "created new init_stmt: ");
2679 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2680 }
2681 t = tree_cons (NULL_TREE, new_name, t);
2682 }
2683 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2684 vec = build_constructor_from_list (vectype, nreverse (t));
2685 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2686 }
2687
2688
2689 /* Create the vector that holds the step of the induction. */
2690 if (nested_in_vect_loop)
2691 /* iv_loop is nested in the loop to be vectorized. Generate:
2692 vec_step = [S, S, S, S] */
2693 new_name = step_expr;
2694 else
2695 {
2696 /* iv_loop is the loop to be vectorized. Generate:
2697 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2698 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2699 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2700 expr, step_expr);
2701 }
2702
2703 t = NULL_TREE;
2704 for (i = 0; i < nunits; i++)
2705 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2706 gcc_assert (CONSTANT_CLASS_P (new_name));
2707 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2708 gcc_assert (stepvectype);
2709 vec = build_vector (stepvectype, t);
2710 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2711
2712
2713 /* Create the following def-use cycle:
2714 loop prolog:
2715 vec_init = ...
2716 vec_step = ...
2717 loop:
2718 vec_iv = PHI <vec_init, vec_loop>
2719 ...
2720 STMT
2721 ...
2722 vec_loop = vec_iv + vec_step; */
2723
2724 /* Create the induction-phi that defines the induction-operand. */
2725 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2726 add_referenced_var (vec_dest);
2727 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2728 set_vinfo_for_stmt (induction_phi,
2729 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2730 induc_def = PHI_RESULT (induction_phi);
2731
2732 /* Create the iv update inside the loop */
2733 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2734 induc_def, vec_step);
2735 vec_def = make_ssa_name (vec_dest, new_stmt);
2736 gimple_assign_set_lhs (new_stmt, vec_def);
2737 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2738 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2739 NULL));
2740
2741 /* Set the arguments of the phi node: */
2742 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2743 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2744 UNKNOWN_LOCATION);
2745
2746
2747 /* In case that vectorization factor (VF) is bigger than the number
2748 of elements that we can fit in a vectype (nunits), we have to generate
2749 more than one vector stmt - i.e. - we need to "unroll" the
2750 vector stmt by a factor VF/nunits. For more details see documentation
2751 in vectorizable_operation. */
2752
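  /* Small example (illustrative): with VF = 8 and a 4-element vectype,
     ncopies = 2.  For an iv with initial value 0 and step 1 the first
     vector is [0,1,2,3]; the code below adds a step vector of
     [nunits*S,...] = [4,4,4,4] to obtain the second copy [4,5,6,7], while
     the latch update created above adds [VF*S,...] = [8,8,8,8] once per
     vector iteration.  */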
2753 if (ncopies > 1)
2754 {
2755 stmt_vec_info prev_stmt_vinfo;
2756 /* FORNOW. This restriction should be relaxed. */
2757 gcc_assert (!nested_in_vect_loop);
2758
2759 /* Create the vector that holds the step of the induction. */
2760 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2761 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2762 expr, step_expr);
2763 t = NULL_TREE;
2764 for (i = 0; i < nunits; i++)
2765 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2766 gcc_assert (CONSTANT_CLASS_P (new_name));
2767 vec = build_vector (stepvectype, t);
2768 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2769
2770 vec_def = induc_def;
2771 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2772 for (i = 1; i < ncopies; i++)
2773 {
2774 /* vec_i = vec_prev + vec_step */
2775 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2776 vec_def, vec_step);
2777 vec_def = make_ssa_name (vec_dest, new_stmt);
2778 gimple_assign_set_lhs (new_stmt, vec_def);
2779
2780 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2781 set_vinfo_for_stmt (new_stmt,
2782 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2783 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2784 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2785 }
2786 }
2787
2788 if (nested_in_vect_loop)
2789 {
2790 /* Find the loop-closed exit-phi of the induction, and record
2791 the final vector of induction results: */
2792 exit_phi = NULL;
2793 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2794 {
2795 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2796 {
2797 exit_phi = USE_STMT (use_p);
2798 break;
2799 }
2800 }
2801 if (exit_phi)
2802 {
2803 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2804 /* FORNOW. Currently not supporting the case that an inner-loop induction
2805 is not used in the outer-loop (i.e. only outside the outer-loop). */
2806 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2807 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2808
2809 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2810 if (vect_print_dump_info (REPORT_DETAILS))
2811 {
2812 fprintf (vect_dump, "vector of inductions after inner-loop:");
2813 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2814 }
2815 }
2816 }
2817
2818
2819 if (vect_print_dump_info (REPORT_DETAILS))
2820 {
2821 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2822 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2823 fprintf (vect_dump, "\n");
2824 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2825 }
2826
2827 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2828 return induc_def;
2829 }
2830
2831
2832 /* Function get_initial_def_for_reduction
2833
2834 Input:
2835 STMT - a stmt that performs a reduction operation in the loop.
2836 INIT_VAL - the initial value of the reduction variable
2837
2838 Output:
2839 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2840 of the reduction (used for adjusting the epilog - see below).
2841 Return a vector variable, initialized according to the operation that STMT
2842 performs. This vector will be used as the initial value of the
2843 vector of partial results.
2844
2845 Option1 (adjust in epilog): Initialize the vector as follows:
2846 add/bit or/xor: [0,0,...,0,0]
2847 mult/bit and: [1,1,...,1,1]
2848 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2849 and when necessary (e.g. add/mult case) let the caller know
2850 that it needs to adjust the result by init_val.
2851
2852 Option2: Initialize the vector as follows:
2853 add/bit or/xor: [init_val,0,0,...,0]
2854 mult/bit and: [init_val,1,1,...,1]
2855 min/max/cond_expr: [init_val,init_val,...,init_val]
2856 and no adjustments are needed.
2857
2858 For example, for the following code:
2859
2860 s = init_val;
2861 for (i=0;i<n;i++)
2862 s = s + a[i];
2863
2864 STMT is 's = s + a[i]', and the reduction variable is 's'.
2865 For a vector of 4 units, we want to return either [0,0,0,init_val],
2866 or [0,0,0,0] and let the caller know that it needs to adjust
2867 the result at the end by 'init_val'.
2868
2869 FORNOW, we use the 'adjust in epilog' scheme (Option1) when
2870 ADJUSTMENT_DEF is not NULL, because this way the initialization vector
2871 is simpler (same element in all entries), and Option2 otherwise.
2872
2873 A cost model should help decide between these two schemes. */
2874
2875 tree
2876 get_initial_def_for_reduction (gimple stmt, tree init_val,
2877 tree *adjustment_def)
2878 {
2879 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2880 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2881 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2882 tree scalar_type = TREE_TYPE (init_val);
2883 tree vectype = get_vectype_for_scalar_type (scalar_type);
2884 int nunits;
2885 enum tree_code code = gimple_assign_rhs_code (stmt);
2886 tree def_for_init;
2887 tree init_def;
2888 tree t = NULL_TREE;
2889 int i;
2890 bool nested_in_vect_loop = false;
2891 tree init_value;
2892 REAL_VALUE_TYPE real_init_val = dconst0;
2893 int int_init_val = 0;
2894 gimple def_stmt = NULL;
2895
2896 gcc_assert (vectype);
2897 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2898
2899 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2900 || SCALAR_FLOAT_TYPE_P (scalar_type));
2901
2902 if (nested_in_vect_loop_p (loop, stmt))
2903 nested_in_vect_loop = true;
2904 else
2905 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2906
2907 /* In case of double reduction we only create a vector variable to be put
2908 in the reduction phi node. The actual statement creation is done in
2909 vect_create_epilog_for_reduction. */
2910 if (adjustment_def && nested_in_vect_loop
2911 && TREE_CODE (init_val) == SSA_NAME
2912 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2913 && gimple_code (def_stmt) == GIMPLE_PHI
2914 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2915 && vinfo_for_stmt (def_stmt)
2916 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2917 == vect_double_reduction_def)
2918 {
2919 *adjustment_def = NULL;
2920 return vect_create_destination_var (init_val, vectype);
2921 }
2922
2923 if (TREE_CONSTANT (init_val))
2924 {
2925 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2926 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2927 else
2928 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2929 }
2930 else
2931 init_value = init_val;
2932
2933 switch (code)
2934 {
2935 case WIDEN_SUM_EXPR:
2936 case DOT_PROD_EXPR:
2937 case PLUS_EXPR:
2938 case MINUS_EXPR:
2939 case BIT_IOR_EXPR:
2940 case BIT_XOR_EXPR:
2941 case MULT_EXPR:
2942 case BIT_AND_EXPR:
2943 /* ADJUSTMENT_DEF is NULL when called from
2944 vect_create_epilog_for_reduction to vectorize double reduction. */
2945 if (adjustment_def)
2946 {
2947 if (nested_in_vect_loop)
2948 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2949 NULL);
2950 else
2951 *adjustment_def = init_val;
2952 }
2953
2954 if (code == MULT_EXPR)
2955 {
2956 real_init_val = dconst1;
2957 int_init_val = 1;
2958 }
2959
2960 if (code == BIT_AND_EXPR)
2961 int_init_val = -1;
2962
2963 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2964 def_for_init = build_real (scalar_type, real_init_val);
2965 else
2966 def_for_init = build_int_cst (scalar_type, int_init_val);
2967
2968 /* Create a vector of '0' or '1' for all but the first element. */
2969 for (i = nunits - 2; i >= 0; --i)
2970 t = tree_cons (NULL_TREE, def_for_init, t);
2971
2972 /* Option1: the first element is '0' or '1' as well. */
2973 if (adjustment_def)
2974 {
2975 t = tree_cons (NULL_TREE, def_for_init, t);
2976 init_def = build_vector (vectype, t);
2977 break;
2978 }
2979
2980 /* Option2: the first element is INIT_VAL. */
2981 t = tree_cons (NULL_TREE, init_value, t);
2982 if (TREE_CONSTANT (init_val))
2983 init_def = build_vector (vectype, t);
2984 else
2985 init_def = build_constructor_from_list (vectype, t);
2986
2987 break;
2988
2989 case MIN_EXPR:
2990 case MAX_EXPR:
2991 case COND_EXPR:
2992 if (adjustment_def)
2993 {
2994 *adjustment_def = NULL_TREE;
2995 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2996 break;
2997 }
2998
2999 for (i = nunits - 1; i >= 0; --i)
3000 t = tree_cons (NULL_TREE, init_value, t);
3001
3002 if (TREE_CONSTANT (init_val))
3003 init_def = build_vector (vectype, t);
3004 else
3005 init_def = build_constructor_from_list (vectype, t);
3006
3007 break;
3008
3009 default:
3010 gcc_unreachable ();
3011 }
3012
3013 return init_def;
3014 }
3015
3016
3017 /* Function vect_create_epilog_for_reduction
3018
3019 Create code at the loop-epilog to finalize the result of a reduction
3020 computation.
3021
3022 VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
3023 reduction statements.
3024 STMT is the scalar reduction stmt that is being vectorized.
3025 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3026 number of elements that we can fit in a vectype (nunits). In this case
3027 we have to generate more than one vector stmt - i.e. - we need to "unroll"
3028 the vector stmt by a factor VF/nunits. For more details see documentation
3029 in vectorizable_operation.
3030 REDUC_CODE is the tree-code for the epilog reduction.
3031 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3032 computation.
3033 REDUC_INDEX is the index of the operand in the right hand side of the
3034 statement that is defined by REDUCTION_PHI.
3035 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3036 SLP_NODE is an SLP node containing a group of reduction statements. The
3037 first one in this group is STMT.
3038
3039 This function:
3040 1. Creates the reduction def-use cycles: sets the arguments for
3041 REDUCTION_PHIS:
3042 The loop-entry argument is the vectorized initial-value of the reduction.
3043 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3044 sums.
3045 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3046 by applying the operation specified by REDUC_CODE if available, or by
3047 other means (whole-vector shifts or a scalar loop).
3048 The function also creates a new phi node at the loop exit to preserve
3049 loop-closed form, as illustrated below.
3050
3051 The flow at the entry to this function:
3052
3053 loop:
3054 vec_def = phi <null, null> # REDUCTION_PHI
3055 VECT_DEF = vector_stmt # vectorized form of STMT
3056 s_loop = scalar_stmt # (scalar) STMT
3057 loop_exit:
3058 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3059 use <s_out0>
3060 use <s_out0>
3061
3062 The above is transformed by this function into:
3063
3064 loop:
3065 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3066 VECT_DEF = vector_stmt # vectorized form of STMT
3067 s_loop = scalar_stmt # (scalar) STMT
3068 loop_exit:
3069 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3070 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3071 v_out2 = reduce <v_out1>
3072 s_out3 = extract_field <v_out2, 0>
3073 s_out4 = adjust_result <s_out3>
3074 use <s_out4>
3075 use <s_out4>
3076 */
3077
3078 static void
3079 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3080 int ncopies, enum tree_code reduc_code,
3081 VEC (gimple, heap) *reduction_phis,
3082 int reduc_index, bool double_reduc,
3083 slp_tree slp_node)
3084 {
3085 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3086 stmt_vec_info prev_phi_info;
3087 tree vectype;
3088 enum machine_mode mode;
3089 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3090 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3091 basic_block exit_bb;
3092 tree scalar_dest;
3093 tree scalar_type;
3094 gimple new_phi = NULL, phi;
3095 gimple_stmt_iterator exit_gsi;
3096 tree vec_dest;
3097 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3098 gimple epilog_stmt = NULL;
3099 enum tree_code code = gimple_assign_rhs_code (stmt);
3100 gimple exit_phi;
3101 tree bitsize, bitpos;
3102 tree adjustment_def = NULL;
3103 tree vec_initial_def = NULL;
3104 tree reduction_op, expr, def;
3105 tree orig_name, scalar_result;
3106 imm_use_iterator imm_iter, phi_imm_iter;
3107 use_operand_p use_p, phi_use_p;
3108 bool extract_scalar_result = false;
3109 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3110 bool nested_in_vect_loop = false;
3111 VEC (gimple, heap) *new_phis = NULL;
3112 enum vect_def_type dt = vect_unknown_def_type;
3113 int j, i;
3114 VEC (tree, heap) *scalar_results = NULL;
3115 unsigned int group_size = 1, k, ratio;
3116 VEC (tree, heap) *vec_initial_defs = NULL;
3117 VEC (gimple, heap) *phis;
3118
3119 if (slp_node)
3120 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3121
3122 if (nested_in_vect_loop_p (loop, stmt))
3123 {
3124 outer_loop = loop;
3125 loop = loop->inner;
3126 nested_in_vect_loop = true;
3127 gcc_assert (!slp_node);
3128 }
3129
3130 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3131 {
3132 case GIMPLE_SINGLE_RHS:
3133 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3134 == ternary_op);
3135 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3136 break;
3137 case GIMPLE_UNARY_RHS:
3138 reduction_op = gimple_assign_rhs1 (stmt);
3139 break;
3140 case GIMPLE_BINARY_RHS:
3141 reduction_op = reduc_index ?
3142 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3143 break;
3144 default:
3145 gcc_unreachable ();
3146 }
3147
3148 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3149 gcc_assert (vectype);
3150 mode = TYPE_MODE (vectype);
3151
3152 /* 1. Create the reduction def-use cycle:
3153 Set the arguments of REDUCTION_PHIS, i.e., transform
3154
3155 loop:
3156 vec_def = phi <null, null> # REDUCTION_PHI
3157 VECT_DEF = vector_stmt # vectorized form of STMT
3158 ...
3159
3160 into:
3161
3162 loop:
3163 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3164 VECT_DEF = vector_stmt # vectorized form of STMT
3165 ...
3166
3167 (in case of SLP, do it for all the phis). */
3168
3169 /* Get the loop-entry arguments. */
3170 if (slp_node)
3171 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
3172 else
3173 {
3174 vec_initial_defs = VEC_alloc (tree, heap, 1);
3175 /* For the case of reduction, vect_get_vec_def_for_operand returns
3176 the scalar def before the loop, that defines the initial value
3177 of the reduction variable. */
3178 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3179 &adjustment_def);
3180 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3181 }
3182
3183 /* Set phi nodes arguments. */
3184 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3185 {
3186 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3187 tree def = VEC_index (tree, vect_defs, i);
3188 for (j = 0; j < ncopies; j++)
3189 {
3190 /* Set the loop-entry arg of the reduction-phi. */
3191 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3192 UNKNOWN_LOCATION);
3193
3194 /* Set the loop-latch arg for the reduction-phi. */
3195 if (j > 0)
3196 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3197
3198 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3199
3200 if (vect_print_dump_info (REPORT_DETAILS))
3201 {
3202 fprintf (vect_dump, "transform reduction: created def-use"
3203 " cycle: ");
3204 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3205 fprintf (vect_dump, "\n");
3206 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3207 TDF_SLIM);
3208 }
3209
3210 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3211 }
3212 }
3213
3214 VEC_free (tree, heap, vec_initial_defs);
3215
3216 /* 2. Create epilog code.
3217 The reduction epilog code operates across the elements of the vector
3218 of partial results computed by the vectorized loop.
3219 The reduction epilog code consists of:
3220
3221 step 1: compute the scalar result in a vector (v_out2)
3222 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3223 step 3: adjust the scalar result (s_out3) if needed.
3224
3225 Step 1 can be accomplished using one of the following three schemes:
3226 (scheme 1) using reduc_code, if available.
3227 (scheme 2) using whole-vector shifts, if available.
3228 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3229 combined.
3230
3231 The overall epilog code looks like this:
3232
3233 s_out0 = phi <s_loop> # original EXIT_PHI
3234 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3235 v_out2 = reduce <v_out1> # step 1
3236 s_out3 = extract_field <v_out2, 0> # step 2
3237 s_out4 = adjust_result <s_out3> # step 3
3238
3239 (step 3 is optional, and steps 1 and 2 may be combined).
3240 Lastly, the uses of s_out0 are replaced by s_out4. */
3241
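  /* Concrete sketch (illustrative): for a sum reduction vectorized with
     VF = 4, v_out1 holds the partial sums [s0,s1,s2,s3]; step 1 combines
     them into v_out2 (e.g. by log2(4) shift-and-add rounds), step 2
     extracts element 0 as s_out3, and step 3 adds back the initial value
     when the vector was initialized with [0,...,0] (Option1 in
     get_initial_def_for_reduction).  */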
3242
3243 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3244 v_out1 = phi <VECT_DEF>
3245 Store them in NEW_PHIS. */
3246
3247 exit_bb = single_exit (loop)->dest;
3248 prev_phi_info = NULL;
3249 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3250 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3251 {
3252 for (j = 0; j < ncopies; j++)
3253 {
3254 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3255 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3256 if (j == 0)
3257 VEC_quick_push (gimple, new_phis, phi);
3258 else
3259 {
3260 def = vect_get_vec_def_for_stmt_copy (dt, def);
3261 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3262 }
3263
3264 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3265 prev_phi_info = vinfo_for_stmt (phi);
3266 }
3267 }
3268
3269 /* The epilogue is created for the outer-loop, i.e., for the loop being
3270 vectorized. */
3271 if (double_reduc)
3272 {
3273 loop = outer_loop;
3274 exit_bb = single_exit (loop)->dest;
3275 }
3276
3277 exit_gsi = gsi_after_labels (exit_bb);
3278
3279 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3280 (i.e. when reduc_code is not available) and in the final adjustment
3281 code (if needed). Also get the original scalar reduction variable as
3282 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3283 represents a reduction pattern), the tree-code and scalar-def are
3284 taken from the original stmt that the pattern-stmt (STMT) replaces.
3285 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3286 are taken from STMT. */
3287
3288 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3289 if (!orig_stmt)
3290 {
3291 /* Regular reduction */
3292 orig_stmt = stmt;
3293 }
3294 else
3295 {
3296 /* Reduction pattern */
3297 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3298 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3299 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3300 }
3301
3302 code = gimple_assign_rhs_code (orig_stmt);
3303 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3304 partial results are added and not subtracted. */
3305 if (code == MINUS_EXPR)
3306 code = PLUS_EXPR;
3307
3308 scalar_dest = gimple_assign_lhs (orig_stmt);
3309 scalar_type = TREE_TYPE (scalar_dest);
3310 scalar_results = VEC_alloc (tree, heap, group_size);
3311 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3312 bitsize = TYPE_SIZE (scalar_type);
3313
3314 /* In case this is a reduction in an inner-loop while vectorizing an outer
3315 loop - we don't need to extract a single scalar result at the end of the
3316 inner-loop (unless it is double reduction, i.e., the use of reduction is
3317 outside the outer-loop). The final vector of partial results will be used
3318 in the vectorized outer-loop, or reduced to a scalar result at the end of
3319 the outer-loop. */
3320 if (nested_in_vect_loop && !double_reduc)
3321 goto vect_finalize_reduction;
3322
3323 /* 2.3 Create the reduction code, using one of the three schemes described
3324 above. In SLP we simply need to extract all the elements from the
3325 vector (without reducing them), so we use scalar extracts (case 3). */
3326 if (reduc_code != ERROR_MARK && !slp_node)
3327 {
3328 tree tmp;
3329
3330 /*** Case 1: Create:
3331 v_out2 = reduc_expr <v_out1> */
3332
3333 if (vect_print_dump_info (REPORT_DETAILS))
3334 fprintf (vect_dump, "Reduce using direct vector reduction.");
3335
3336 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3337 new_phi = VEC_index (gimple, new_phis, 0);
3338 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3339 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3340 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3341 gimple_assign_set_lhs (epilog_stmt, new_temp);
3342 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3343
3344 extract_scalar_result = true;
3345 }
3346 else
3347 {
3348 enum tree_code shift_code = ERROR_MARK;
3349 bool have_whole_vector_shift = true;
3350 int bit_offset;
3351 int element_bitsize = tree_low_cst (bitsize, 1);
3352 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3353 tree vec_temp;
3354
3355 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3356 shift_code = VEC_RSHIFT_EXPR;
3357 else
3358 have_whole_vector_shift = false;
3359
3360 /* Regardless of whether we have a whole vector shift, if we're
3361 emulating the operation via tree-vect-generic, we don't want
3362 to use it. Only the first round of the reduction is likely
3363 to still be profitable via emulation. */
3364 /* ??? It might be better to emit a reduction tree code here, so that
3365 tree-vect-generic can expand the first round via bit tricks. */
3366 if (!VECTOR_MODE_P (mode))
3367 have_whole_vector_shift = false;
3368 else
3369 {
3370 optab optab = optab_for_tree_code (code, vectype, optab_default);
3371 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3372 have_whole_vector_shift = false;
3373 }
3374
3375 if (have_whole_vector_shift && !slp_node)
3376 {
3377 /*** Case 2: Create:
3378 for (offset = VS/2; offset >= element_size; offset/=2)
3379 {
3380 Create: va' = vec_shift <va, offset>
3381 Create: va = vop <va, va'>
3382 } */
3383
3384 if (vect_print_dump_info (REPORT_DETAILS))
3385 fprintf (vect_dump, "Reduce using vector shifts");
3386
3387 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3388 new_phi = VEC_index (gimple, new_phis, 0);
3389 new_temp = PHI_RESULT (new_phi);
3390 for (bit_offset = vec_size_in_bits/2;
3391 bit_offset >= element_bitsize;
3392 bit_offset /= 2)
3393 {
3394 tree bitpos = size_int (bit_offset);
3395
3396 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3397 vec_dest, new_temp, bitpos);
3398 new_name = make_ssa_name (vec_dest, epilog_stmt);
3399 gimple_assign_set_lhs (epilog_stmt, new_name);
3400 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3401
3402 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3403 new_name, new_temp);
3404 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3405 gimple_assign_set_lhs (epilog_stmt, new_temp);
3406 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3407 }
3408
3409 extract_scalar_result = true;
3410 }
3411 else
3412 {
3413 tree rhs;
3414
3415 /*** Case 3: Create:
3416 s = extract_field <v_out2, 0>
3417 for (offset = element_size;
3418 offset < vector_size;
3419 offset += element_size;)
3420 {
3421 Create: s' = extract_field <v_out2, offset>
3422 Create: s = op <s, s'> // For non SLP cases
3423 } */
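           /* An illustrative instance (assuming a V4SI vector and a non-SLP
              PLUS reduction; not part of the original comment):

                  s = v_out2[0];
                  s = s + v_out2[1];
                  s = s + v_out2[2];
                  s = s + v_out2[3];

              where each element is read via a BIT_FIELD_REF at the matching
              bit offset.  */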
3424
3425 if (vect_print_dump_info (REPORT_DETAILS))
3426 fprintf (vect_dump, "Reduce using scalar code. ");
3427
3428 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3429 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3430 {
3431 vec_temp = PHI_RESULT (new_phi);
3432 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3433 bitsize_zero_node);
3434 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3435 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3436 gimple_assign_set_lhs (epilog_stmt, new_temp);
3437 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3438
3439 /* In SLP we don't need to apply the reduction operation, so we just
3440 collect s' values in SCALAR_RESULTS. */
3441 if (slp_node)
3442 VEC_safe_push (tree, heap, scalar_results, new_temp);
3443
3444 for (bit_offset = element_bitsize;
3445 bit_offset < vec_size_in_bits;
3446 bit_offset += element_bitsize)
3447 {
3448 tree bitpos = bitsize_int (bit_offset);
3449 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3450 bitsize, bitpos);
3451
3452 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3453 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3454 gimple_assign_set_lhs (epilog_stmt, new_name);
3455 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3456
3457 if (slp_node)
3458 {
3459 /* In SLP we don't need to apply the reduction operation, so
3460 we just collect s' values in SCALAR_RESULTS. */
3461 new_temp = new_name;
3462 VEC_safe_push (tree, heap, scalar_results, new_name);
3463 }
3464 else
3465 {
3466 epilog_stmt = gimple_build_assign_with_ops (code,
3467 new_scalar_dest, new_name, new_temp);
3468 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3469 gimple_assign_set_lhs (epilog_stmt, new_temp);
3470 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3471 }
3472 }
3473 }
3474
3475 /* The only case where we need to reduce scalar results in SLP is
3476 unrolling. If the size of SCALAR_RESULTS is greater than
3477 GROUP_SIZE, we reduce them by combining elements modulo
3478 GROUP_SIZE. */
3479 if (slp_node)
3480 {
3481 tree res, first_res, new_res;
3482 gimple new_stmt;
3483
3484 /* Reduce multiple scalar results in case of SLP unrolling. */
3485 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3486 j++)
3487 {
3488 first_res = VEC_index (tree, scalar_results, j % group_size);
3489 new_stmt = gimple_build_assign_with_ops (code,
3490 new_scalar_dest, first_res, res);
3491 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3492 gimple_assign_set_lhs (new_stmt, new_res);
3493 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3494 VEC_replace (tree, scalar_results, j % group_size, new_res);
3495 }
3496 }
3497 else
3498 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3499 VEC_safe_push (tree, heap, scalar_results, new_temp);
3500
3501 extract_scalar_result = false;
3502 }
3503 }
3504
3505 /* 2.4 Extract the final scalar result. Create:
3506 s_out3 = extract_field <v_out2, bitpos> */
3507
3508 if (extract_scalar_result)
3509 {
3510 tree rhs;
3511
3512 if (vect_print_dump_info (REPORT_DETAILS))
3513 fprintf (vect_dump, "extract scalar result");
3514
3515 if (BYTES_BIG_ENDIAN)
3516 bitpos = size_binop (MULT_EXPR,
3517 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3518 TYPE_SIZE (scalar_type));
3519 else
3520 bitpos = bitsize_zero_node;
3521
3522 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3523 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3524 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3525 gimple_assign_set_lhs (epilog_stmt, new_temp);
3526 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3527 VEC_safe_push (tree, heap, scalar_results, new_temp);
3528 }
3529
3530 vect_finalize_reduction:
3531
3532 if (double_reduc)
3533 loop = loop->inner;
3534
3535 /* 2.5 Adjust the final result by the initial value of the reduction
3536 variable. (When such an adjustment is not needed,
3537 'adjustment_def' is zero.) For example, if code is PLUS we create:
3538 new_temp = loop_exit_def + adjustment_def */
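     /* An illustrative sketch (an assumption about a common case, not taken
        from the original comments): for

            s = init;
            for (i = 0; i < N; i++)
              s += a[i];

        the vector loop may start from a neutral initial value; in that case
        ADJUSTMENT_DEF holds 'init' and it is added back to the loop-exit
        result here.  */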
3539
3540 if (adjustment_def)
3541 {
3542 gcc_assert (!slp_node);
3543 if (nested_in_vect_loop)
3544 {
3545 new_phi = VEC_index (gimple, new_phis, 0);
3546 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3547 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3548 new_dest = vect_create_destination_var (scalar_dest, vectype);
3549 }
3550 else
3551 {
3552 new_temp = VEC_index (tree, scalar_results, 0);
3553 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3554 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3555 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3556 }
3557
3558 epilog_stmt = gimple_build_assign (new_dest, expr);
3559 new_temp = make_ssa_name (new_dest, epilog_stmt);
3560 gimple_assign_set_lhs (epilog_stmt, new_temp);
3561 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3562 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3563 if (nested_in_vect_loop)
3564 {
3565 set_vinfo_for_stmt (epilog_stmt,
3566 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3567 NULL));
3568 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3569 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3570
3571 if (!double_reduc)
3572 VEC_quick_push (tree, scalar_results, new_temp);
3573 else
3574 VEC_replace (tree, scalar_results, 0, new_temp);
3575 }
3576 else
3577 VEC_replace (tree, scalar_results, 0, new_temp);
3578
3579 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3580 }
3581
3582 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3583 phis with new adjusted scalar results, i.e., replace use <s_out0>
3584 with use <s_out4>.
3585
3586 Transform:
3587 loop_exit:
3588 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3589 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3590 v_out2 = reduce <v_out1>
3591 s_out3 = extract_field <v_out2, 0>
3592 s_out4 = adjust_result <s_out3>
3593 use <s_out0>
3594 use <s_out0>
3595
3596 into:
3597
3598 loop_exit:
3599 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3600 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3601 v_out2 = reduce <v_out1>
3602 s_out3 = extract_field <v_out2, 0>
3603 s_out4 = adjust_result <s_out3>
3604 use <s_out4>
3605 use <s_out4> */
3606
3607 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3608 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
3609 need to match SCALAR_RESULTS with corresponding statements. The first
3610 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3611 the first vector stmt, etc.
3612 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
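  /* Worked example (illustrative): with GROUP_SIZE == 4 and two vector stmts
     in NEW_PHIS, RATIO is 2, so scalar results 0 and 1 are matched with the
     first vector stmt and scalar results 2 and 3 with the second.  */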
3613 if (group_size > VEC_length (gimple, new_phis))
3614 {
3615 ratio = group_size / VEC_length (gimple, new_phis);
3616 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3617 }
3618 else
3619 ratio = 1;
3620
3621 for (k = 0; k < group_size; k++)
3622 {
3623 if (k % ratio == 0)
3624 {
3625 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3626 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3627 }
3628
3629 if (slp_node)
3630 {
3631 gimple current_stmt = VEC_index (gimple,
3632 SLP_TREE_SCALAR_STMTS (slp_node), k);
3633
3634 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3635 /* SLP statements can't participate in patterns. */
3636 gcc_assert (!orig_stmt);
3637 scalar_dest = gimple_assign_lhs (current_stmt);
3638 }
3639
3640 phis = VEC_alloc (gimple, heap, 3);
3641 /* Find the loop-closed-use at the loop exit of the original scalar
3642 result. (The reduction result is expected to have two immediate uses -
3643 one at the latch block, and one at the loop exit). */
3644 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3645 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3646 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3647
3648 /* We expect to have found an exit_phi because of loop-closed-ssa
3649 form. */
3650 gcc_assert (!VEC_empty (gimple, phis));
3651
3652 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3653 {
3654 if (outer_loop)
3655 {
3656 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3657 gimple vect_phi;
3658
3659 /* FORNOW. Currently not supporting the case that an inner-loop
3660 reduction is not used in the outer-loop (but only outside the
3661 outer-loop), unless it is a double reduction. */
3662 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3663 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3664 || double_reduc);
3665
3666 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3667 if (!double_reduc
3668 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3669 != vect_double_reduction_def)
3670 continue;
3671
3672 /* Handle double reduction:
3673
3674 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3675 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3676 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3677 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3678
3679 At that point the regular reduction (stmt2 and stmt3) is
3680 already vectorized, as well as the exit phi node, stmt4.
3681 Here we vectorize the phi node of double reduction, stmt1, and
3682 update all relevant statements. */
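              /* An illustrative source-level sketch of a double reduction
                 (assuming a simple sum of sums; not part of the original
                 comment):

                     s = 0;                       // s0
                     for (i = 0; i < N; i++)      // outer loop: stmt1, stmt4
                       for (j = 0; j < M; j++)    // inner loop: stmt2, stmt3
                         s += a[i][j];
                     use (s);

                 The value accumulated across both loops is used only after
                 the outer loop.  */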
3683
3684 /* Go through all the uses of s2 to find double reduction phi
3685 node, i.e., stmt1 above. */
3686 orig_name = PHI_RESULT (exit_phi);
3687 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3688 {
3689 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3690 stmt_vec_info new_phi_vinfo;
3691 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3692 basic_block bb = gimple_bb (use_stmt);
3693 gimple use;
3694
3695 /* Check that USE_STMT is really a double reduction phi
3696 node. */
3697 if (gimple_code (use_stmt) != GIMPLE_PHI
3698 || gimple_phi_num_args (use_stmt) != 2
3699 || !use_stmt_vinfo
3700 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3701 != vect_double_reduction_def
3702 || bb->loop_father != outer_loop)
3703 continue;
3704
3705 /* Create vector phi node for double reduction:
3706 vs1 = phi <vs0, vs2>
3707 vs1 was created previously in this function by a call to
3708 vect_get_vec_def_for_operand and is stored in
3709 vec_initial_def;
3710 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3711 vs0 is created here. */
3712
3713 /* Create vector phi node. */
3714 vect_phi = create_phi_node (vec_initial_def, bb);
3715 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3716 loop_vec_info_for_loop (outer_loop), NULL);
3717 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3718
3719 /* Create vs0 - initial def of the double reduction phi. */
3720 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3721 loop_preheader_edge (outer_loop));
3722 init_def = get_initial_def_for_reduction (stmt,
3723 preheader_arg, NULL);
3724 vect_phi_init = vect_init_vector (use_stmt, init_def,
3725 vectype, NULL);
3726
3727 /* Update phi node arguments with vs0 and vs2. */
3728 add_phi_arg (vect_phi, vect_phi_init,
3729 loop_preheader_edge (outer_loop),
3730 UNKNOWN_LOCATION);
3731 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3732 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3733 if (vect_print_dump_info (REPORT_DETAILS))
3734 {
3735 fprintf (vect_dump, "created double reduction phi "
3736 "node: ");
3737 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3738 }
3739
3740 vect_phi_res = PHI_RESULT (vect_phi);
3741
3742 /* Replace the use, i.e., set the correct vs1 in the regular
3743 reduction phi node. FORNOW, NCOPIES is always 1, so the
3744 loop is redundant. */
3745 use = reduction_phi;
3746 for (j = 0; j < ncopies; j++)
3747 {
3748 edge pr_edge = loop_preheader_edge (loop);
3749 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3750 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3751 }
3752 }
3753 }
3754 }
3755
3756 VEC_free (gimple, heap, phis);
3757 if (nested_in_vect_loop)
3758 {
3759 if (double_reduc)
3760 loop = outer_loop;
3761 else
3762 continue;
3763 }
3764
3765 phis = VEC_alloc (gimple, heap, 3);
3766 /* Find the loop-closed-use at the loop exit of the original scalar
3767 result. (The reduction result is expected to have two immediate uses,
3768 one at the latch block, and one at the loop exit). For double
3769 reductions we are looking for exit phis of the outer loop. */
3770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3771 {
3772 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3773 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3774 else
3775 {
3776 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3777 {
3778 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3779
3780 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3781 {
3782 if (!flow_bb_inside_loop_p (loop,
3783 gimple_bb (USE_STMT (phi_use_p))))
3784 VEC_safe_push (gimple, heap, phis,
3785 USE_STMT (phi_use_p));
3786 }
3787 }
3788 }
3789 }
3790
3791 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3792 {
3793 /* Replace the uses: */
3794 orig_name = PHI_RESULT (exit_phi);
3795 scalar_result = VEC_index (tree, scalar_results, k);
3796 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3797 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3798 SET_USE (use_p, scalar_result);
3799 }
3800
3801 VEC_free (gimple, heap, phis);
3802 }
3803
3804 VEC_free (tree, heap, scalar_results);
3805 VEC_free (gimple, heap, new_phis);
3806 }
3807
3808
3809 /* Function vectorizable_reduction.
3810
3811 Check if STMT performs a reduction operation that can be vectorized.
3812 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3813 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3814 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3815
3816 This function also handles reduction idioms (patterns) that have been
3817 recognized in advance during vect_pattern_recog. In this case, STMT may be
3818 of this form:
3819 X = pattern_expr (arg0, arg1, ..., X)
3820 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3821 sequence that had been detected and replaced by the pattern-stmt (STMT).
3822
3823 In some cases of reduction patterns, the type of the reduction variable X is
3824 different than the type of the other arguments of STMT.
3825 In such cases, the vectype that is used when transforming STMT into a vector
3826 stmt is different than the vectype that is used to determine the
3827 vectorization factor, because it consists of a different number of elements
3828 than the actual number of elements that are being operated upon in parallel.
3829
3830 For example, consider an accumulation of shorts into an int accumulator.
3831 On some targets it's possible to vectorize this pattern operating on 8
3832 shorts at a time (hence, the vectype for purposes of determining the
3833 vectorization factor should be V8HI); on the other hand, the vectype that
3834 is used to create the vector form is actually V4SI (the type of the result).
3835
3836 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3837 indicates the actual level of parallelism (V8HI in the example), so
3838 that the right vectorization factor is derived. This vectype
3839 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3840 be used to create the vectorized stmt. The right vectype for the vectorized
3841 stmt is obtained from the type of the result X:
3842 get_vectype_for_scalar_type (TREE_TYPE (X))
3843
3844 This means that, contrary to "regular" reductions (or "regular" stmts in
3845 general), the following equation:
3846 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3847 does *NOT* necessarily hold for reduction patterns. */
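/* An illustrative source-level sketch of such a pattern (not part of the
   original comment): accumulating shorts into an int accumulator,

       short a[N]; int i, sum = 0;
       for (i = 0; i < N; i++)
         sum += a[i];

   may be recognized as a widening summation; the arguments are shorts (so
   V8HI determines the vectorization factor), while the result, and hence
   the vectype of the vectorized stmt, is V4SI, as described above.  */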
3848
3849 bool
3850 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3851 gimple *vec_stmt, slp_tree slp_node)
3852 {
3853 tree vec_dest;
3854 tree scalar_dest;
3855 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3856 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3857 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3858 tree vectype_in = NULL_TREE;
3859 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3860 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3861 enum tree_code code, orig_code, epilog_reduc_code;
3862 enum machine_mode vec_mode;
3863 int op_type;
3864 optab optab, reduc_optab;
3865 tree new_temp = NULL_TREE;
3866 tree def;
3867 gimple def_stmt;
3868 enum vect_def_type dt;
3869 gimple new_phi = NULL;
3870 tree scalar_type;
3871 bool is_simple_use;
3872 gimple orig_stmt;
3873 stmt_vec_info orig_stmt_info;
3874 tree expr = NULL_TREE;
3875 int i;
3876 int ncopies;
3877 int epilog_copies;
3878 stmt_vec_info prev_stmt_info, prev_phi_info;
3879 bool single_defuse_cycle = false;
3880 tree reduc_def = NULL_TREE;
3881 gimple new_stmt = NULL;
3882 int j;
3883 tree ops[3];
3884 bool nested_cycle = false, found_nested_cycle_def = false;
3885 gimple reduc_def_stmt = NULL;
3886 /* The default is that the reduction variable is the last operand in the statement. */
3887 int reduc_index = 2;
3888 bool double_reduc = false, dummy;
3889 basic_block def_bb;
3890 struct loop * def_stmt_loop, *outer_loop = NULL;
3891 tree def_arg;
3892 gimple def_arg_stmt;
3893 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3894 VEC (gimple, heap) *phis = NULL;
3895 int vec_num;
3896 tree def0, def1;
3897
3898 if (nested_in_vect_loop_p (loop, stmt))
3899 {
3900 outer_loop = loop;
3901 loop = loop->inner;
3902 nested_cycle = true;
3903 }
3904
3905 /* 1. Is vectorizable reduction? */
3906 /* Not supportable if the reduction variable is used in the loop. */
3907 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3908 return false;
3909
3910 /* Reductions that are not used even in an enclosing outer-loop
3911 are expected to be "live" (used out of the loop). */
3912 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3913 && !STMT_VINFO_LIVE_P (stmt_info))
3914 return false;
3915
3916 /* Make sure it was already recognized as a reduction computation. */
3917 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3918 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3919 return false;
3920
3921 /* 2. Has this been recognized as a reduction pattern?
3922
3923 Check if STMT represents a pattern that has been recognized
3924 in earlier analysis stages. For stmts that represent a pattern,
3925 the STMT_VINFO_RELATED_STMT field records the last stmt in
3926 the original sequence that constitutes the pattern. */
3927
3928 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3929 if (orig_stmt)
3930 {
3931 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3932 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3933 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3934 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3935 }
3936
3937 /* 3. Check the operands of the operation. The first operands are defined
3938 inside the loop body. The last operand is the reduction variable,
3939 which is defined by the loop-header-phi. */
3940
3941 gcc_assert (is_gimple_assign (stmt));
3942
3943 /* Flatten RHS */
3944 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3945 {
3946 case GIMPLE_SINGLE_RHS:
3947 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3948 if (op_type == ternary_op)
3949 {
3950 tree rhs = gimple_assign_rhs1 (stmt);
3951 ops[0] = TREE_OPERAND (rhs, 0);
3952 ops[1] = TREE_OPERAND (rhs, 1);
3953 ops[2] = TREE_OPERAND (rhs, 2);
3954 code = TREE_CODE (rhs);
3955 }
3956 else
3957 return false;
3958 break;
3959
3960 case GIMPLE_BINARY_RHS:
3961 code = gimple_assign_rhs_code (stmt);
3962 op_type = TREE_CODE_LENGTH (code);
3963 gcc_assert (op_type == binary_op);
3964 ops[0] = gimple_assign_rhs1 (stmt);
3965 ops[1] = gimple_assign_rhs2 (stmt);
3966 break;
3967
3968 case GIMPLE_UNARY_RHS:
3969 return false;
3970
3971 default:
3972 gcc_unreachable ();
3973 }
3974
3975 scalar_dest = gimple_assign_lhs (stmt);
3976 scalar_type = TREE_TYPE (scalar_dest);
3977 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3978 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3979 return false;
3980
3981 /* All uses but the last are expected to be defined in the loop.
3982 The last use is the reduction variable. In case of nested cycle this
3983 assumption is not true: we use reduc_index to record the index of the
3984 reduction variable. */
3985 for (i = 0; i < op_type-1; i++)
3986 {
3987 tree tem;
3988
3989 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
3990 if (i == 0 && code == COND_EXPR)
3991 continue;
3992
3993 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3994 &def_stmt, &def, &dt, &tem);
3995 if (!vectype_in)
3996 vectype_in = tem;
3997 gcc_assert (is_simple_use);
3998 if (dt != vect_internal_def
3999 && dt != vect_external_def
4000 && dt != vect_constant_def
4001 && dt != vect_induction_def
4002 && !(dt == vect_nested_cycle && nested_cycle))
4003 return false;
4004
4005 if (dt == vect_nested_cycle)
4006 {
4007 found_nested_cycle_def = true;
4008 reduc_def_stmt = def_stmt;
4009 reduc_index = i;
4010 }
4011 }
4012
4013 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
4014 &def, &dt);
4015 gcc_assert (is_simple_use);
4016 gcc_assert (dt == vect_reduction_def
4017 || dt == vect_nested_cycle
4018 || ((dt == vect_internal_def || dt == vect_external_def
4019 || dt == vect_constant_def || dt == vect_induction_def)
4020 && nested_cycle && found_nested_cycle_def));
4021 if (!found_nested_cycle_def)
4022 reduc_def_stmt = def_stmt;
4023
4024 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4025 if (orig_stmt)
4026 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4027 reduc_def_stmt,
4028 !nested_cycle,
4029 &dummy));
4030 else
4031 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4032 !nested_cycle, &dummy));
4033
4034 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4035 return false;
4036
4037 if (slp_node)
4038 ncopies = 1;
4039 else
4040 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4041 / TYPE_VECTOR_SUBPARTS (vectype_in));
4042
4043 gcc_assert (ncopies >= 1);
4044
4045 vec_mode = TYPE_MODE (vectype_in);
4046
4047 if (code == COND_EXPR)
4048 {
4049 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4050 {
4051 if (vect_print_dump_info (REPORT_DETAILS))
4052 fprintf (vect_dump, "unsupported condition in reduction");
4053
4054 return false;
4055 }
4056 }
4057 else
4058 {
4059 /* 4. Supportable by target? */
4060
4061 /* 4.1. check support for the operation in the loop */
4062 optab = optab_for_tree_code (code, vectype_in, optab_default);
4063 if (!optab)
4064 {
4065 if (vect_print_dump_info (REPORT_DETAILS))
4066 fprintf (vect_dump, "no optab.");
4067
4068 return false;
4069 }
4070
4071 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4072 {
4073 if (vect_print_dump_info (REPORT_DETAILS))
4074 fprintf (vect_dump, "op not supported by target.");
4075
4076 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4077 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4078 < vect_min_worthwhile_factor (code))
4079 return false;
4080
4081 if (vect_print_dump_info (REPORT_DETAILS))
4082 fprintf (vect_dump, "proceeding using word mode.");
4083 }
4084
4085 /* Worthwhile without SIMD support? */
4086 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4087 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4088 < vect_min_worthwhile_factor (code))
4089 {
4090 if (vect_print_dump_info (REPORT_DETAILS))
4091 fprintf (vect_dump, "not worthwhile without SIMD support.");
4092
4093 return false;
4094 }
4095 }
4096
4097 /* 4.2. Check support for the epilog operation.
4098
4099 If STMT represents a reduction pattern, then the type of the
4100 reduction variable may be different than the type of the rest
4101 of the arguments. For example, consider the case of accumulation
4102 of shorts into an int accumulator; The original code:
4103 S1: int_a = (int) short_a;
4104 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4105
4106 was replaced with:
4107 STMT: int_acc = widen_sum <short_a, int_acc>
4108
4109 This means that:
4110 1. The tree-code that is used to create the vector operation in the
4111 epilog code (that reduces the partial results) is not the
4112 tree-code of STMT, but is rather the tree-code of the original
4113 stmt from the pattern that STMT is replacing. I.e, in the example
4114 above we want to use 'widen_sum' in the loop, but 'plus' in the
4115 epilog.
4116 2. The type (mode) we use to check available target support
4117 for the vector operation to be created in the *epilog*, is
4118 determined by the type of the reduction variable (in the example
4119 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4120 However the type (mode) we use to check available target support
4121 for the vector operation to be created *inside the loop*, is
4122 determined by the type of the other arguments to STMT (in the
4123 example we'd check this: optab_handler (widen_sum_optab,
4124 vect_short_mode)).
4125
4126 This is contrary to "regular" reductions, in which the types of all
4127 the arguments are the same as the type of the reduction variable.
4128 For "regular" reductions we can therefore use the same vector type
4129 (and also the same tree-code) when generating the epilog code and
4130 when generating the code inside the loop. */
4131
4132 if (orig_stmt)
4133 {
4134 /* This is a reduction pattern: get the vectype from the type of the
4135 reduction variable, and get the tree-code from orig_stmt. */
4136 orig_code = gimple_assign_rhs_code (orig_stmt);
4137 gcc_assert (vectype_out);
4138 vec_mode = TYPE_MODE (vectype_out);
4139 }
4140 else
4141 {
4142 /* Regular reduction: the same vectype and tree-code that are used for
4143 the vector code inside the loop can also be used for the epilog code. */
4144 orig_code = code;
4145 }
4146
4147 if (nested_cycle)
4148 {
4149 def_bb = gimple_bb (reduc_def_stmt);
4150 def_stmt_loop = def_bb->loop_father;
4151 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4152 loop_preheader_edge (def_stmt_loop));
4153 if (TREE_CODE (def_arg) == SSA_NAME
4154 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4155 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4156 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4157 && vinfo_for_stmt (def_arg_stmt)
4158 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4159 == vect_double_reduction_def)
4160 double_reduc = true;
4161 }
4162
4163 epilog_reduc_code = ERROR_MARK;
4164 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4165 {
4166 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4167 optab_default);
4168 if (!reduc_optab)
4169 {
4170 if (vect_print_dump_info (REPORT_DETAILS))
4171 fprintf (vect_dump, "no optab for reduction.");
4172
4173 epilog_reduc_code = ERROR_MARK;
4174 }
4175
4176 if (reduc_optab
4177 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4178 {
4179 if (vect_print_dump_info (REPORT_DETAILS))
4180 fprintf (vect_dump, "reduc op not supported by target.");
4181
4182 epilog_reduc_code = ERROR_MARK;
4183 }
4184 }
4185 else
4186 {
4187 if (!nested_cycle || double_reduc)
4188 {
4189 if (vect_print_dump_info (REPORT_DETAILS))
4190 fprintf (vect_dump, "no reduc code for scalar code.");
4191
4192 return false;
4193 }
4194 }
4195
4196 if (double_reduc && ncopies > 1)
4197 {
4198 if (vect_print_dump_info (REPORT_DETAILS))
4199 fprintf (vect_dump, "multiple types in double reduction");
4200
4201 return false;
4202 }
4203
4204 if (!vec_stmt) /* transformation not required. */
4205 {
4206 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4207 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4208 return false;
4209 return true;
4210 }
4211
4212 /** Transform. **/
4213
4214 if (vect_print_dump_info (REPORT_DETAILS))
4215 fprintf (vect_dump, "transform reduction.");
4216
4217 /* FORNOW: Multiple types are not supported for condition. */
4218 if (code == COND_EXPR)
4219 gcc_assert (ncopies == 1);
4220
4221 /* Create the destination vector */
4222 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4223
4224 /* In case the vectorization factor (VF) is bigger than the number
4225 of elements that we can fit in a vectype (nunits), we have to generate
4226 more than one vector stmt - i.e. - we need to "unroll" the
4227 vector stmt by a factor VF/nunits. For more details see documentation
4228 in vectorizable_operation. */
4229
4230 /* If the reduction is used in an outer loop we need to generate
4231 VF intermediate results, like so (e.g. for ncopies=2):
4232 r0 = phi (init, r0)
4233 r1 = phi (init, r1)
4234 r0 = x0 + r0;
4235 r1 = x1 + r1;
4236 (i.e. we generate VF results in 2 registers).
4237 In this case we have a separate def-use cycle for each copy, and therefore
4238 for each copy we get the vector def for the reduction variable from the
4239 respective phi node created for this copy.
4240
4241 Otherwise (the reduction is unused in the loop nest), we can combine
4242 together intermediate results, like so (e.g. for ncopies=2):
4243 r = phi (init, r)
4244 r = x0 + r;
4245 r = x1 + r;
4246 (i.e. we generate VF/2 results in a single register).
4247 In this case for each copy we get the vector def for the reduction variable
4248 from the vectorized reduction operation generated in the previous iteration.
4249 */
4250
4251 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4252 {
4253 single_defuse_cycle = true;
4254 epilog_copies = 1;
4255 }
4256 else
4257 epilog_copies = ncopies;
4258
4259 prev_stmt_info = NULL;
4260 prev_phi_info = NULL;
4261 if (slp_node)
4262 {
4263 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4264 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4265 == TYPE_VECTOR_SUBPARTS (vectype_in));
4266 }
4267 else
4268 {
4269 vec_num = 1;
4270 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4271 if (op_type == ternary_op)
4272 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4273 }
4274
4275 phis = VEC_alloc (gimple, heap, vec_num);
4276 vect_defs = VEC_alloc (tree, heap, vec_num);
4277 if (!slp_node)
4278 VEC_quick_push (tree, vect_defs, NULL_TREE);
4279
4280 for (j = 0; j < ncopies; j++)
4281 {
4282 if (j == 0 || !single_defuse_cycle)
4283 {
4284 for (i = 0; i < vec_num; i++)
4285 {
4286 /* Create the reduction-phi that defines the reduction
4287 operand. */
4288 new_phi = create_phi_node (vec_dest, loop->header);
4289 set_vinfo_for_stmt (new_phi,
4290 new_stmt_vec_info (new_phi, loop_vinfo,
4291 NULL));
4292 if (j == 0 || slp_node)
4293 VEC_quick_push (gimple, phis, new_phi);
4294 }
4295 }
4296
4297 if (code == COND_EXPR)
4298 {
4299 gcc_assert (!slp_node);
4300 vectorizable_condition (stmt, gsi, vec_stmt,
4301 PHI_RESULT (VEC_index (gimple, phis, 0)),
4302 reduc_index);
4303 /* Multiple types are not supported for condition. */
4304 break;
4305 }
4306
4307 /* Handle uses. */
4308 if (j == 0)
4309 {
4310 if (slp_node)
4311 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
4312 else
4313 {
4314 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4315 stmt, NULL);
4316 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4317 if (op_type == ternary_op)
4318 {
4319 if (reduc_index == 0)
4320 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
4321 NULL);
4322 else
4323 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4324 NULL);
4325
4326 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4327 }
4328 }
4329 }
4330 else
4331 {
4332 if (!slp_node)
4333 {
4334 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4335 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4336 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4337 if (op_type == ternary_op)
4338 {
4339 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4340 loop_vec_def1);
4341 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4342 }
4343 }
4344
4345 if (single_defuse_cycle)
4346 reduc_def = gimple_assign_lhs (new_stmt);
4347
4348 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4349 }
4350
4351 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4352 {
4353 if (slp_node)
4354 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4355 else
4356 {
4357 if (!single_defuse_cycle || j == 0)
4358 reduc_def = PHI_RESULT (new_phi);
4359 }
4360
4361 def1 = ((op_type == ternary_op)
4362 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4363 if (op_type == binary_op)
4364 {
4365 if (reduc_index == 0)
4366 expr = build2 (code, vectype_out, reduc_def, def0);
4367 else
4368 expr = build2 (code, vectype_out, def0, reduc_def);
4369 }
4370 else
4371 {
4372 if (reduc_index == 0)
4373 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4374 else
4375 {
4376 if (reduc_index == 1)
4377 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4378 else
4379 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4380 }
4381 }
4382
4383 new_stmt = gimple_build_assign (vec_dest, expr);
4384 new_temp = make_ssa_name (vec_dest, new_stmt);
4385 gimple_assign_set_lhs (new_stmt, new_temp);
4386 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4387 if (slp_node)
4388 {
4389 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4390 VEC_quick_push (tree, vect_defs, new_temp);
4391 }
4392 else
4393 VEC_replace (tree, vect_defs, 0, new_temp);
4394 }
4395
4396 if (slp_node)
4397 continue;
4398
4399 if (j == 0)
4400 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4401 else
4402 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4403
4404 prev_stmt_info = vinfo_for_stmt (new_stmt);
4405 prev_phi_info = vinfo_for_stmt (new_phi);
4406 }
4407
4408 /* Finalize the reduction-phi (set its arguments) and create the
4409 epilog reduction code. */
4410 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4411 {
4412 new_temp = gimple_assign_lhs (*vec_stmt);
4413 VEC_replace (tree, vect_defs, 0, new_temp);
4414 }
4415
4416 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4417 epilog_reduc_code, phis, reduc_index,
4418 double_reduc, slp_node);
4419
4420 VEC_free (gimple, heap, phis);
4421 VEC_free (tree, heap, vec_oprnds0);
4422 if (vec_oprnds1)
4423 VEC_free (tree, heap, vec_oprnds1);
4424
4425 return true;
4426 }
4427
4428 /* Function vect_min_worthwhile_factor.
4429
4430 For a loop where we could vectorize the operation indicated by CODE,
4431 return the minimum vectorization factor that makes it worthwhile
4432 to use generic vectors. */
4433 int
4434 vect_min_worthwhile_factor (enum tree_code code)
4435 {
4436 switch (code)
4437 {
4438 case PLUS_EXPR:
4439 case MINUS_EXPR:
4440 case NEGATE_EXPR:
4441 return 4;
4442
4443 case BIT_AND_EXPR:
4444 case BIT_IOR_EXPR:
4445 case BIT_XOR_EXPR:
4446 case BIT_NOT_EXPR:
4447 return 2;
4448
4449 default:
4450 return INT_MAX;
4451 }
4452 }
4453
4454
4455 /* Function vectorizable_induction
4456
4457 Check if PHI performs an induction computation that can be vectorized.
4458 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4459 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4460 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
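/* An illustrative sketch (not part of the original comment): for an
   induction such as

       for (i = 0; i < N; i++)
         a[i] = i;

   with VF == 4 the induction is vectorized roughly as

       vi = { i, i+1, i+2, i+3 };
       loop:  ... use vi ...;  vi = vi + { 4, 4, 4, 4 };

   see get_initial_def_for_induction for the actual construction.  */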
4461
4462 bool
4463 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4464 gimple *vec_stmt)
4465 {
4466 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4469 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4470 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4471 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4472 tree vec_def;
4473
4474 gcc_assert (ncopies >= 1);
4475 /* FORNOW. This restriction should be relaxed. */
4476 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4477 {
4478 if (vect_print_dump_info (REPORT_DETAILS))
4479 fprintf (vect_dump, "multiple types in nested loop.");
4480 return false;
4481 }
4482
4483 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4484 return false;
4485
4486 /* FORNOW: SLP not supported. */
4487 if (STMT_SLP_TYPE (stmt_info))
4488 return false;
4489
4490 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4491
4492 if (gimple_code (phi) != GIMPLE_PHI)
4493 return false;
4494
4495 if (!vec_stmt) /* transformation not required. */
4496 {
4497 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4498 if (vect_print_dump_info (REPORT_DETAILS))
4499 fprintf (vect_dump, "=== vectorizable_induction ===");
4500 vect_model_induction_cost (stmt_info, ncopies);
4501 return true;
4502 }
4503
4504 /** Transform. **/
4505
4506 if (vect_print_dump_info (REPORT_DETAILS))
4507 fprintf (vect_dump, "transform induction phi.");
4508
4509 vec_def = get_initial_def_for_induction (phi);
4510 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4511 return true;
4512 }
4513
4514 /* Function vectorizable_live_operation.
4515
4516 STMT computes a value that is used outside the loop. Check if
4517 it can be supported. */
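/* An illustrative sketch (not part of the original comment): a "live"
   statement computes a value that is also needed after the loop, e.g.

       for (i = 0; i < N; i++)
         {
           t = x + y;        // x, y loop-invariant
           a[i] = t * i;
         }
       use (t);

   FORNOW this is supported only when, as here, all operands are
   loop-invariant, so the scalar statement can simply stay in place and its
   final value be used directly.  */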
4518
4519 bool
4520 vectorizable_live_operation (gimple stmt,
4521 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4522 gimple *vec_stmt ATTRIBUTE_UNUSED)
4523 {
4524 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4525 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4526 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4527 int i;
4528 int op_type;
4529 tree op;
4530 tree def;
4531 gimple def_stmt;
4532 enum vect_def_type dt;
4533 enum tree_code code;
4534 enum gimple_rhs_class rhs_class;
4535
4536 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4537
4538 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4539 return false;
4540
4541 if (!is_gimple_assign (stmt))
4542 return false;
4543
4544 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4545 return false;
4546
4547 /* FORNOW. CHECKME. */
4548 if (nested_in_vect_loop_p (loop, stmt))
4549 return false;
4550
4551 code = gimple_assign_rhs_code (stmt);
4552 op_type = TREE_CODE_LENGTH (code);
4553 rhs_class = get_gimple_rhs_class (code);
4554 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4555 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4556
4557 /* FORNOW: support only if all uses are invariant. This means
4558 that the scalar operations can remain in place, unvectorized.
4559 The original last scalar value that they compute will be used. */
4560
4561 for (i = 0; i < op_type; i++)
4562 {
4563 if (rhs_class == GIMPLE_SINGLE_RHS)
4564 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4565 else
4566 op = gimple_op (stmt, i + 1);
4567 if (op
4568 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4569 {
4570 if (vect_print_dump_info (REPORT_DETAILS))
4571 fprintf (vect_dump, "use not simple.");
4572 return false;
4573 }
4574
4575 if (dt != vect_external_def && dt != vect_constant_def)
4576 return false;
4577 }
4578
4579 /* No transformation is required for the cases we currently support. */
4580 return true;
4581 }
4582
4583 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4584
4585 static void
4586 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4587 {
4588 ssa_op_iter op_iter;
4589 imm_use_iterator imm_iter;
4590 def_operand_p def_p;
4591 gimple ustmt;
4592
4593 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4594 {
4595 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4596 {
4597 basic_block bb;
4598
4599 if (!is_gimple_debug (ustmt))
4600 continue;
4601
4602 bb = gimple_bb (ustmt);
4603
4604 if (!flow_bb_inside_loop_p (loop, bb))
4605 {
4606 if (gimple_debug_bind_p (ustmt))
4607 {
4608 if (vect_print_dump_info (REPORT_DETAILS))
4609 fprintf (vect_dump, "killing debug use");
4610
4611 gimple_debug_bind_reset_value (ustmt);
4612 update_stmt (ustmt);
4613 }
4614 else
4615 gcc_unreachable ();
4616 }
4617 }
4618 }
4619 }
4620
4621 /* Function vect_transform_loop.
4622
4623 The analysis phase has determined that the loop is vectorizable.
4624 Vectorize the loop - create vectorized stmts to replace the scalar
4625 stmts in the loop, and update the loop exit condition. */
4626
4627 void
4628 vect_transform_loop (loop_vec_info loop_vinfo)
4629 {
4630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4632 int nbbs = loop->num_nodes;
4633 gimple_stmt_iterator si;
4634 int i;
4635 tree ratio = NULL;
4636 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4637 bool strided_store;
4638 bool slp_scheduled = false;
4639 unsigned int nunits;
4640 tree cond_expr = NULL_TREE;
4641 gimple_seq cond_expr_stmt_list = NULL;
4642 bool do_peeling_for_loop_bound;
4643
4644 if (vect_print_dump_info (REPORT_DETAILS))
4645 fprintf (vect_dump, "=== vec_transform_loop ===");
4646
4647 /* Peel the loop if there are data refs with unknown alignment.
4648 Only one store data-ref with unknown alignment is allowed. */
4649
4650 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4651 vect_do_peeling_for_alignment (loop_vinfo);
4652
4653 do_peeling_for_loop_bound
4654 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4655 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4656 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4657
4658 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4659 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4660 vect_loop_versioning (loop_vinfo,
4661 !do_peeling_for_loop_bound,
4662 &cond_expr, &cond_expr_stmt_list);
4663
4664 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4665 compile-time constant), or it is a constant that is not divisible by the
4666 vectorization factor, then an epilog loop needs to be created.
4667 We therefore duplicate the loop: the original loop will be vectorized,
4668 and will compute the first (n/VF) iterations. The second copy of the loop
4669 will remain scalar and will compute the remaining (n%VF) iterations.
4670 (VF is the vectorization factor). */
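  /* An illustrative sketch of the resulting structure (assuming VF == 4;
     not part of the original comment):

         for (i = 0; i < (n & ~3); i += 4)   // vectorized main loop
           ... vector code ...
         for (; i < n; i++)                  // scalar epilog loop
           ... original scalar code ...                                    */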
4671
4672 if (do_peeling_for_loop_bound)
4673 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4674 cond_expr, cond_expr_stmt_list);
4675 else
4676 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4677 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4678
4679 /* 1) Make sure the loop header has exactly two entries
4680 2) Make sure we have a preheader basic block. */
4681
4682 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4683
4684 split_edge (loop_preheader_edge (loop));
4685
4686 /* FORNOW: the vectorizer supports only loops whose body consists
4687 of one basic block (header + empty latch). When the vectorizer
4688 supports more involved loop forms, the order in which the BBs are
4689 traversed will need to be reconsidered. */
4690
4691 for (i = 0; i < nbbs; i++)
4692 {
4693 basic_block bb = bbs[i];
4694 stmt_vec_info stmt_info;
4695 gimple phi;
4696
4697 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4698 {
4699 phi = gsi_stmt (si);
4700 if (vect_print_dump_info (REPORT_DETAILS))
4701 {
4702 fprintf (vect_dump, "------>vectorizing phi: ");
4703 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4704 }
4705 stmt_info = vinfo_for_stmt (phi);
4706 if (!stmt_info)
4707 continue;
4708
4709 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4710 vect_loop_kill_debug_uses (loop, phi);
4711
4712 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4713 && !STMT_VINFO_LIVE_P (stmt_info))
4714 continue;
4715
4716 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4717 != (unsigned HOST_WIDE_INT) vectorization_factor)
4718 && vect_print_dump_info (REPORT_DETAILS))
4719 fprintf (vect_dump, "multiple-types.");
4720
4721 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4722 {
4723 if (vect_print_dump_info (REPORT_DETAILS))
4724 fprintf (vect_dump, "transform phi.");
4725 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4726 }
4727 }
4728
4729 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4730 {
4731 gimple stmt = gsi_stmt (si);
4732 bool is_store;
4733
4734 if (vect_print_dump_info (REPORT_DETAILS))
4735 {
4736 fprintf (vect_dump, "------>vectorizing statement: ");
4737 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4738 }
4739
4740 stmt_info = vinfo_for_stmt (stmt);
4741
4742 /* vector stmts created in the outer-loop during vectorization of
4743 stmts in an inner-loop may not have a stmt_info, and do not
4744 need to be vectorized. */
4745 if (!stmt_info)
4746 {
4747 gsi_next (&si);
4748 continue;
4749 }
4750
4751 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4752 vect_loop_kill_debug_uses (loop, stmt);
4753
4754 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4755 && !STMT_VINFO_LIVE_P (stmt_info))
4756 {
4757 gsi_next (&si);
4758 continue;
4759 }
4760
4761 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4762 nunits =
4763 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4764 if (!STMT_SLP_TYPE (stmt_info)
4765 && nunits != (unsigned int) vectorization_factor
4766 && vect_print_dump_info (REPORT_DETAILS))
4767 /* For SLP, VF is set according to the unrolling factor, not to the
4768 vector size, hence for SLP this message is not valid. */
4769 fprintf (vect_dump, "multiple-types.");
4770
4771 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4772 reached. */
4773 if (STMT_SLP_TYPE (stmt_info))
4774 {
4775 if (!slp_scheduled)
4776 {
4777 slp_scheduled = true;
4778
4779 if (vect_print_dump_info (REPORT_DETAILS))
4780 fprintf (vect_dump, "=== scheduling SLP instances ===");
4781
4782 vect_schedule_slp (loop_vinfo, NULL);
4783 }
4784
4785 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4786 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4787 {
4788 gsi_next (&si);
4789 continue;
4790 }
4791 }
4792
4793 /* -------- vectorize statement ------------ */
4794 if (vect_print_dump_info (REPORT_DETAILS))
4795 fprintf (vect_dump, "transform statement.");
4796
4797 strided_store = false;
4798 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4799 if (is_store)
4800 {
4801 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4802 {
4803 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4804 interleaving chain was completed - free all the stores in
4805 the chain. */
4806 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4807 gsi_remove (&si, true);
4808 continue;
4809 }
4810 else
4811 {
4812 /* Free the attached stmt_vec_info and remove the stmt. */
4813 free_stmt_vec_info (stmt);
4814 gsi_remove (&si, true);
4815 continue;
4816 }
4817 }
4818 gsi_next (&si);
4819 } /* stmts in BB */
4820 } /* BBs in loop */
4821
4822 slpeel_make_loop_iterate_ntimes (loop, ratio);
4823
4824 /* The memory tags and pointers in vectorized statements need to
4825 have their SSA forms updated. FIXME, why can't this be delayed
4826 until all the loops have been transformed? */
4827 update_ssa (TODO_update_ssa);
4828
4829 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4830 fprintf (vect_dump, "LOOP VECTORIZED.");
4831 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4832 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4833 }