re PR tree-optimization/46184 (ICE: SIGSEGV in vectorizable_reduction (tree-vect...
[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "tree-chrec.h"
43 #include "tree-scalar-evolution.h"
44 #include "tree-vectorizer.h"
45 #include "target.h"
46
47 /* Loop Vectorization Pass.
48
49 This pass tries to vectorize loops.
50
51 For example, the vectorizer transforms the following simple loop:
52
53 short a[N]; short b[N]; short c[N]; int i;
54
55 for (i=0; i<N; i++){
56 a[i] = b[i] + c[i];
57 }
58
59 as if it was manually vectorized by rewriting the source code into:
60
61 typedef int __attribute__((mode(V8HI))) v8hi;
62 short a[N]; short b[N]; short c[N]; int i;
63 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
64 v8hi va, vb, vc;
65
66 for (i=0; i<N/8; i++){
67 vb = pb[i];
68 vc = pc[i];
69 va = vb + vc;
70 pa[i] = va;
71 }
72
73 The main entry to this pass is vectorize_loops(), in which
74 the vectorizer applies a set of analyses on a given set of loops,
75 followed by the actual vectorization transformation for the loops that
76 had successfully passed the analysis phase.
77 Throughout this pass we make a distinction between two types of
78 data: scalars (which are represented by SSA_NAMES), and memory references
79 ("data-refs"). These two types of data require different handling both
80 during analysis and transformation. The types of data-refs that the
81     vectorizer currently supports are ARRAY_REFS whose base is an array DECL
82 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
83 accesses are required to have a simple (consecutive) access pattern.
84
85 Analysis phase:
86 ===============
87 The driver for the analysis phase is vect_analyze_loop().
88 It applies a set of analyses, some of which rely on the scalar evolution
89 analyzer (scev) developed by Sebastian Pop.
90
91 During the analysis phase the vectorizer records some information
92 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
93 loop, as well as general information about the loop as a whole, which is
94 recorded in a "loop_vec_info" struct attached to each loop.
95
96 Transformation phase:
97 =====================
98 The loop transformation phase scans all the stmts in the loop, and
99 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
100 the loop that needs to be vectorized. It inserts the vector code sequence
101 just before the scalar stmt S, and records a pointer to the vector code
102 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
103 attached to S). This pointer will be used for the vectorization of following
104 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
105 otherwise, we rely on dead code elimination for removing it.
106
107 For example, say stmt S1 was vectorized into stmt VS1:
108
109 VS1: vb = px[i];
110 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
111 S2: a = b;
112
113 To vectorize stmt S2, the vectorizer first finds the stmt that defines
114 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
115 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
116 resulting sequence would be:
117
118 VS1: vb = px[i];
119 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
120 VS2: va = vb;
121 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
122
123     Operands that are not SSA_NAMEs are data-refs that appear in
124 load/store operations (like 'x[i]' in S1), and are handled differently.
125
126 Target modeling:
127 =================
128     Currently the only target-specific information that is used is the
129 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
130     Targets that can support different vector sizes will, for now, need
131 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
132 flexibility will be added in the future.
133
134     Since we only vectorize operations whose vector form can be
135 expressed using existing tree codes, to verify that an operation is
136 supported, the vectorizer checks the relevant optab at the relevant
137     machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
138 the value found is CODE_FOR_nothing, then there's no target support, and
139 we can't vectorize the stmt.
140
141 For additional information on this project see:
142 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
143 */
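/* Editorial sketch (not part of the original source): the target-support
   check described above amounts, roughly, to code of this shape in the
   vectorizable_* routines:

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!op || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;   (no instruction for this operation/mode: give up)

   i.e. an operation is considered vectorizable only if some optab entry maps
   the vector mode to a real instruction pattern.  */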
144
145 /* Function vect_determine_vectorization_factor
146
147 Determine the vectorization factor (VF). VF is the number of data elements
148 that are operated upon in parallel in a single iteration of the vectorized
149     loop.  For example, when vectorizing a loop that operates on 4-byte elements,
150     on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
151 elements can fit in a single vector register.
152
153 We currently support vectorization of loops in which all types operated upon
154 are of the same size. Therefore this function currently sets VF according to
155 the size of the types operated upon, and fails if there are multiple sizes
156 in the loop.
157
158 VF is also the factor by which the loop iterations are strip-mined, e.g.:
159 original loop:
160 for (i=0; i<N; i++){
161 a[i] = b[i] + c[i];
162 }
163
164 vectorized loop:
165 for (i=0; i<N; i+=VF){
166 a[i:VF] = b[i:VF] + c[i:VF];
167 }
168 */
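/* Editorial example (not part of the original source): on a target with
   16-byte vectors, a loop computing  a_int[i] = b_short[i] + c_short[i]
   involves the vector types V4SI (4 units, from the int operations) and
   V8HI (8 units, from the smallest scalar type, short).  Both are 16 bytes,
   so the size check below passes, and the resulting VF is the maximum
   number of units seen, i.e. 8.  */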
169
170 static bool
171 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 {
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
175 int nbbs = loop->num_nodes;
176 gimple_stmt_iterator si;
177 unsigned int vectorization_factor = 0;
178 tree scalar_type;
179 gimple phi;
180 tree vectype;
181 unsigned int nunits;
182 stmt_vec_info stmt_info;
183 int i;
184 HOST_WIDE_INT dummy;
185
186 if (vect_print_dump_info (REPORT_DETAILS))
187 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
188
189 for (i = 0; i < nbbs; i++)
190 {
191 basic_block bb = bbs[i];
192
193 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
194 {
195 phi = gsi_stmt (si);
196 stmt_info = vinfo_for_stmt (phi);
197 if (vect_print_dump_info (REPORT_DETAILS))
198 {
199 fprintf (vect_dump, "==> examining phi: ");
200 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
201 }
202
203 gcc_assert (stmt_info);
204
205 if (STMT_VINFO_RELEVANT_P (stmt_info))
206 {
207 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
208 scalar_type = TREE_TYPE (PHI_RESULT (phi));
209
210 if (vect_print_dump_info (REPORT_DETAILS))
211 {
212 fprintf (vect_dump, "get vectype for scalar type: ");
213 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
214 }
215
216 vectype = get_vectype_for_scalar_type (scalar_type);
217 if (!vectype)
218 {
219 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
220 {
221 fprintf (vect_dump,
222 "not vectorized: unsupported data-type ");
223 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
224 }
225 return false;
226 }
227 STMT_VINFO_VECTYPE (stmt_info) = vectype;
228
229 if (vect_print_dump_info (REPORT_DETAILS))
230 {
231 fprintf (vect_dump, "vectype: ");
232 print_generic_expr (vect_dump, vectype, TDF_SLIM);
233 }
234
235 nunits = TYPE_VECTOR_SUBPARTS (vectype);
236 if (vect_print_dump_info (REPORT_DETAILS))
237 fprintf (vect_dump, "nunits = %d", nunits);
238
239 if (!vectorization_factor
240 || (nunits > vectorization_factor))
241 vectorization_factor = nunits;
242 }
243 }
244
245 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
246 {
247 tree vf_vectype;
248 gimple stmt = gsi_stmt (si);
249 stmt_info = vinfo_for_stmt (stmt);
250
251 if (vect_print_dump_info (REPORT_DETAILS))
252 {
253 fprintf (vect_dump, "==> examining statement: ");
254 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
255 }
256
257 gcc_assert (stmt_info);
258
259 /* skip stmts which do not need to be vectorized. */
260 if (!STMT_VINFO_RELEVANT_P (stmt_info)
261 && !STMT_VINFO_LIVE_P (stmt_info))
262 {
263 if (vect_print_dump_info (REPORT_DETAILS))
264 fprintf (vect_dump, "skip.");
265 continue;
266 }
267
268 if (gimple_get_lhs (stmt) == NULL_TREE)
269 {
270 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
271 {
272 fprintf (vect_dump, "not vectorized: irregular stmt.");
273 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
274 }
275 return false;
276 }
277
278 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
279 {
280 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
281 {
282 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
283 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
284 }
285 return false;
286 }
287
288 if (STMT_VINFO_VECTYPE (stmt_info))
289 {
290 	    /* The only case when a vectype has already been set is for stmts
291 that contain a dataref, or for "pattern-stmts" (stmts generated
292 by the vectorizer to represent/replace a certain idiom). */
293 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
294 || is_pattern_stmt_p (stmt_info));
295 vectype = STMT_VINFO_VECTYPE (stmt_info);
296 }
297 else
298 {
299 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
300 && !is_pattern_stmt_p (stmt_info));
301
302 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
303 if (vect_print_dump_info (REPORT_DETAILS))
304 {
305 fprintf (vect_dump, "get vectype for scalar type: ");
306 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
307 }
308 vectype = get_vectype_for_scalar_type (scalar_type);
309 if (!vectype)
310 {
311 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
312 {
313 fprintf (vect_dump,
314 "not vectorized: unsupported data-type ");
315 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
316 }
317 return false;
318 }
319
320 STMT_VINFO_VECTYPE (stmt_info) = vectype;
321 }
322
323 	  /* The vectorization factor is determined by the smallest
324 scalar type (or the largest vector size, but we only
325 support one vector size per loop). */
326 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
327 &dummy);
328 if (vect_print_dump_info (REPORT_DETAILS))
329 {
330 fprintf (vect_dump, "get vectype for scalar type: ");
331 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
332 }
333 vf_vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vf_vectype)
335 {
336 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
337 {
338 fprintf (vect_dump,
339 "not vectorized: unsupported data-type ");
340 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
341 }
342 return false;
343 }
344
345 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
346 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
347 {
348 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
349 {
350 fprintf (vect_dump,
351 "not vectorized: different sized vector "
352 "types in statement, ");
353 print_generic_expr (vect_dump, vectype, TDF_SLIM);
354 fprintf (vect_dump, " and ");
355 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
356 }
357 return false;
358 }
359
360 if (vect_print_dump_info (REPORT_DETAILS))
361 {
362 fprintf (vect_dump, "vectype: ");
363 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
364 }
365
366 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
367 if (vect_print_dump_info (REPORT_DETAILS))
368 fprintf (vect_dump, "nunits = %d", nunits);
369
370 if (!vectorization_factor
371 || (nunits > vectorization_factor))
372 vectorization_factor = nunits;
373 }
374 }
375
376   /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
377 if (vect_print_dump_info (REPORT_DETAILS))
378 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
379 if (vectorization_factor <= 1)
380 {
381 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
382 fprintf (vect_dump, "not vectorized: unsupported data-type");
383 return false;
384 }
385 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
386
387 return true;
388 }
389
390
391 /* Function vect_is_simple_iv_evolution.
392
393    FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution with constant step. */
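/* Editorial examples (not part of the original source), in the chrec
   notation of the scalar evolution analyzer:
     {0, +, 1}_1           - simple: constant step 1
     {0, +, n}_1           - init/step are extracted, but the step is not an
                             INTEGER_CST, so this function still returns false
     {0, +, {1, +, 1}_1}_1 - not simple: the step itself evolves, i.e. the
                             evolution is a polynomial of degree >= 2.  */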
395
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
399 {
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403
404 /* When there is no evolution in this loop, the evolution function
405 is not "simple". */
406 if (evolution_part == NULL_TREE)
407 return false;
408
409 /* When the evolution is a polynomial of degree >= 2
410 the evolution function is not "simple". */
411 if (tree_is_chrec (evolution_part))
412 return false;
413
414 step_expr = evolution_part;
415 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416
417 if (vect_print_dump_info (REPORT_DETAILS))
418 {
419 fprintf (vect_dump, "step: ");
420 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
421 fprintf (vect_dump, ", init: ");
422 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
423 }
424
425 *init = init_expr;
426 *step = step_expr;
427
428 if (TREE_CODE (step_expr) != INTEGER_CST)
429 {
430 if (vect_print_dump_info (REPORT_DETAILS))
431 fprintf (vect_dump, "step unknown.");
432 return false;
433 }
434
435 return true;
436 }
437
438 /* Function vect_analyze_scalar_cycles_1.
439
440 Examine the cross iteration def-use cycles of scalar variables
441 in LOOP. LOOP_VINFO represents the loop that is now being
442 considered for vectorization (can be LOOP, or an outer-loop
443 enclosing LOOP). */
444
445 static void
446 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
447 {
448 basic_block bb = loop->header;
449 tree dumy;
450 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
451 gimple_stmt_iterator gsi;
452 bool double_reduc;
453
454 if (vect_print_dump_info (REPORT_DETAILS))
455 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
456
457 /* First - identify all inductions. Reduction detection assumes that all the
458      inductions have been identified; therefore, this order must not be
459 changed. */
460 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
461 {
462 gimple phi = gsi_stmt (gsi);
463 tree access_fn = NULL;
464 tree def = PHI_RESULT (phi);
465 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
466
467 if (vect_print_dump_info (REPORT_DETAILS))
468 {
469 fprintf (vect_dump, "Analyze phi: ");
470 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
471 }
472
473       /* Skip virtual phis.  The data dependences that are associated with
474 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
475 if (!is_gimple_reg (SSA_NAME_VAR (def)))
476 continue;
477
478 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
479
480 /* Analyze the evolution function. */
481 access_fn = analyze_scalar_evolution (loop, def);
482 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
483 {
484 fprintf (vect_dump, "Access function of PHI: ");
485 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
486 }
487
488 if (!access_fn
489 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
490 {
491 VEC_safe_push (gimple, heap, worklist, phi);
492 continue;
493 }
494
495 if (vect_print_dump_info (REPORT_DETAILS))
496 fprintf (vect_dump, "Detected induction.");
497 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
498 }
499
500
501 /* Second - identify all reductions and nested cycles. */
502 while (VEC_length (gimple, worklist) > 0)
503 {
504 gimple phi = VEC_pop (gimple, worklist);
505 tree def = PHI_RESULT (phi);
506 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
507 gimple reduc_stmt;
508 bool nested_cycle;
509
510 if (vect_print_dump_info (REPORT_DETAILS))
511 {
512 fprintf (vect_dump, "Analyze phi: ");
513 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
514 }
515
516 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
517 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
518
519 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
520 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
521 &double_reduc);
522 if (reduc_stmt)
523 {
524 if (double_reduc)
525 {
526 if (vect_print_dump_info (REPORT_DETAILS))
527 fprintf (vect_dump, "Detected double reduction.");
528
529 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
530 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
531 vect_double_reduction_def;
532 }
533 else
534 {
535 if (nested_cycle)
536 {
537 if (vect_print_dump_info (REPORT_DETAILS))
538 fprintf (vect_dump, "Detected vectorizable nested cycle.");
539
540 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
541 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
542 vect_nested_cycle;
543 }
544 else
545 {
546 if (vect_print_dump_info (REPORT_DETAILS))
547 fprintf (vect_dump, "Detected reduction.");
548
549 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
550 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
551 vect_reduction_def;
552 /* Store the reduction cycles for possible vectorization in
553 loop-aware SLP. */
554 VEC_safe_push (gimple, heap,
555 LOOP_VINFO_REDUCTIONS (loop_vinfo),
556 reduc_stmt);
557 }
558 }
559 }
560 else
561 if (vect_print_dump_info (REPORT_DETAILS))
562 fprintf (vect_dump, "Unknown def-use cycle pattern.");
563 }
564
565 VEC_free (gimple, heap, worklist);
566 }
567
568
569 /* Function vect_analyze_scalar_cycles.
570
571 Examine the cross iteration def-use cycles of scalar variables, by
572 analyzing the loop-header PHIs of scalar variables. Classify each
573 cycle as one of the following: invariant, induction, reduction, unknown.
574    We do that for the loop represented by LOOP_VINFO, and also for its
575    inner-loop, if it exists.
576 Examples for scalar cycles:
577
578 Example1: reduction:
579
580 loop1:
581 for (i=0; i<N; i++)
582 sum += a[i];
583
584 Example2: induction:
585
586 loop2:
587 for (i=0; i<N; i++)
588 a[i] = i; */
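/* Editorial addition (illustration only):

   Example3: nested cycle (when the enclosing outer-loop is the one being
   vectorized):

   loop3:
   for (i=0; i<N; i++)
     {
       s = 0;
       for (j=0; j<M; j++)
         s += a[i][j];
       b[i] = s;
     }

   The cycle of 's' in the inner-loop header is classified as a nested cycle
   (vect_nested_cycle) when vect_analyze_scalar_cycles_1 is called on the
   inner loop.  */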
589
590 static void
591 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
592 {
593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
594
595 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
596
597 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
598      Reductions in such an inner-loop therefore have different properties than
599 the reductions in the nest that gets vectorized:
600 1. When vectorized, they are executed in the same order as in the original
601 scalar loop, so we can't change the order of computation when
602 vectorizing them.
603 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
604 current checks are too strict. */
605
606 if (loop->inner)
607 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
608 }
609
610 /* Function vect_get_loop_niters.
611
612     Determine how many times the loop is executed.
613 If an expression that represents the number of iterations
614 can be constructed, place it in NUMBER_OF_ITERATIONS.
615 Return the loop exit condition. */
616
617 static gimple
618 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
619 {
620 tree niters;
621
622 if (vect_print_dump_info (REPORT_DETAILS))
623 fprintf (vect_dump, "=== get_loop_niters ===");
624
625 niters = number_of_exit_cond_executions (loop);
626
627 if (niters != NULL_TREE
628 && niters != chrec_dont_know)
629 {
630 *number_of_iterations = niters;
631
632 if (vect_print_dump_info (REPORT_DETAILS))
633 {
634 fprintf (vect_dump, "==> get_loop_niters:" );
635 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
636 }
637 }
638
639 return get_loop_exit_condition (loop);
640 }
641
642
643 /* Function bb_in_loop_p
644
645 Used as predicate for dfs order traversal of the loop bbs. */
646
647 static bool
648 bb_in_loop_p (const_basic_block bb, const void *data)
649 {
650 const struct loop *const loop = (const struct loop *)data;
651 if (flow_bb_inside_loop_p (loop, bb))
652 return true;
653 return false;
654 }
655
656
657 /* Function new_loop_vec_info.
658
659 Create and initialize a new loop_vec_info struct for LOOP, as well as
660 stmt_vec_info structs for all the stmts in LOOP. */
661
662 static loop_vec_info
663 new_loop_vec_info (struct loop *loop)
664 {
665 loop_vec_info res;
666 basic_block *bbs;
667 gimple_stmt_iterator si;
668 unsigned int i, nbbs;
669
670 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
671 LOOP_VINFO_LOOP (res) = loop;
672
673 bbs = get_loop_body (loop);
674
675 /* Create/Update stmt_info for all stmts in the loop. */
676 for (i = 0; i < loop->num_nodes; i++)
677 {
678 basic_block bb = bbs[i];
679
680 /* BBs in a nested inner-loop will have been already processed (because
681 we will have called vect_analyze_loop_form for any nested inner-loop).
682 Therefore, for stmts in an inner-loop we just want to update the
683 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
684 loop_info of the outer-loop we are currently considering to vectorize
685 (instead of the loop_info of the inner-loop).
686 For stmts in other BBs we need to create a stmt_info from scratch. */
687 if (bb->loop_father != loop)
688 {
689 /* Inner-loop bb. */
690 gcc_assert (loop->inner && bb->loop_father == loop->inner);
691 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
692 {
693 gimple phi = gsi_stmt (si);
694 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
695 loop_vec_info inner_loop_vinfo =
696 STMT_VINFO_LOOP_VINFO (stmt_info);
697 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
698 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
699 }
700 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
701 {
702 gimple stmt = gsi_stmt (si);
703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
704 loop_vec_info inner_loop_vinfo =
705 STMT_VINFO_LOOP_VINFO (stmt_info);
706 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
707 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
708 }
709 }
710 else
711 {
712 /* bb in current nest. */
713 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
714 {
715 gimple phi = gsi_stmt (si);
716 gimple_set_uid (phi, 0);
717 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
718 }
719
720 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
721 {
722 gimple stmt = gsi_stmt (si);
723 gimple_set_uid (stmt, 0);
724 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
725 }
726 }
727 }
728
729 /* CHECKME: We want to visit all BBs before their successors (except for
730 latch blocks, for which this assertion wouldn't hold). In the simple
731      case of the loop forms we allow, a dfs order of the BBs would be the same
732      as a reversed postorder traversal, so we are safe.  */
733
734 free (bbs);
735 bbs = XCNEWVEC (basic_block, loop->num_nodes);
736 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
737 bbs, loop->num_nodes, loop);
738 gcc_assert (nbbs == loop->num_nodes);
739
740 LOOP_VINFO_BBS (res) = bbs;
741 LOOP_VINFO_NITERS (res) = NULL;
742 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
743 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
744 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
745 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
746 LOOP_VINFO_VECT_FACTOR (res) = 0;
747 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
748 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
749 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
750 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
751 VEC_alloc (gimple, heap,
752 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
753 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
754 VEC_alloc (ddr_p, heap,
755 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
756 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
757 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
758 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
759 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
760 LOOP_VINFO_PEELING_HTAB (res) = NULL;
761
762 return res;
763 }
764
765
766 /* Function destroy_loop_vec_info.
767
768 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
769 stmts in the loop. */
770
771 void
772 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
773 {
774 struct loop *loop;
775 basic_block *bbs;
776 int nbbs;
777 gimple_stmt_iterator si;
778 int j;
779 VEC (slp_instance, heap) *slp_instances;
780 slp_instance instance;
781
782 if (!loop_vinfo)
783 return;
784
785 loop = LOOP_VINFO_LOOP (loop_vinfo);
786
787 bbs = LOOP_VINFO_BBS (loop_vinfo);
788 nbbs = loop->num_nodes;
789
790 if (!clean_stmts)
791 {
792 free (LOOP_VINFO_BBS (loop_vinfo));
793 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
794 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
795 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
796
797 free (loop_vinfo);
798 loop->aux = NULL;
799 return;
800 }
801
802 for (j = 0; j < nbbs; j++)
803 {
804 basic_block bb = bbs[j];
805 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
806 free_stmt_vec_info (gsi_stmt (si));
807
808 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
809 {
810 gimple stmt = gsi_stmt (si);
811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
812
813 if (stmt_info)
814 {
815 /* Check if this is a "pattern stmt" (introduced by the
816 vectorizer during the pattern recognition pass). */
817 bool remove_stmt_p = false;
818 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
819 if (orig_stmt)
820 {
821 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
822 if (orig_stmt_info
823 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
824 remove_stmt_p = true;
825 }
826
827 /* Free stmt_vec_info. */
828 free_stmt_vec_info (stmt);
829
830 /* Remove dead "pattern stmts". */
831 if (remove_stmt_p)
832 gsi_remove (&si, true);
833 }
834 gsi_next (&si);
835 }
836 }
837
838 free (LOOP_VINFO_BBS (loop_vinfo));
839 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
840 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
841 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
842 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
843 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
844 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
845 vect_free_slp_instance (instance);
846
847 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
848 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
849 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
850
851 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
852 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
853
854 free (loop_vinfo);
855 loop->aux = NULL;
856 }
857
858
859 /* Function vect_analyze_loop_1.
860
861 Apply a set of analyses on LOOP, and create a loop_vec_info struct
862 for it. The different analyses will record information in the
863 loop_vec_info struct. This is a subset of the analyses applied in
864 vect_analyze_loop, to be applied on an inner-loop nested in the loop
865 that is now considered for (outer-loop) vectorization. */
866
867 static loop_vec_info
868 vect_analyze_loop_1 (struct loop *loop)
869 {
870 loop_vec_info loop_vinfo;
871
872 if (vect_print_dump_info (REPORT_DETAILS))
873 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
874
875   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
876
877 loop_vinfo = vect_analyze_loop_form (loop);
878 if (!loop_vinfo)
879 {
880 if (vect_print_dump_info (REPORT_DETAILS))
881 fprintf (vect_dump, "bad inner-loop form.");
882 return NULL;
883 }
884
885 return loop_vinfo;
886 }
887
888
889 /* Function vect_analyze_loop_form.
890
891 Verify that certain CFG restrictions hold, including:
892 - the loop has a pre-header
893 - the loop has a single entry and exit
894 - the loop exit condition is simple enough, and the number of iterations
895 can be analyzed (a countable loop). */
896
897 loop_vec_info
898 vect_analyze_loop_form (struct loop *loop)
899 {
900 loop_vec_info loop_vinfo;
901 gimple loop_cond;
902 tree number_of_iterations = NULL;
903 loop_vec_info inner_loop_vinfo = NULL;
904
905 if (vect_print_dump_info (REPORT_DETAILS))
906 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
907
908 /* Different restrictions apply when we are considering an inner-most loop,
909 vs. an outer (nested) loop.
910 (FORNOW. May want to relax some of these restrictions in the future). */
911
912 if (!loop->inner)
913 {
914 /* Inner-most loop. We currently require that the number of BBs is
915 exactly 2 (the header and latch). Vectorizable inner-most loops
916 look like this:
917
918 (pre-header)
919 |
920 header <--------+
921 | | |
922 | +--> latch --+
923 |
924 (exit-bb) */
925
926 if (loop->num_nodes != 2)
927 {
928 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
929 fprintf (vect_dump, "not vectorized: control flow in loop.");
930 return NULL;
931 }
932
933 if (empty_block_p (loop->header))
934 {
935 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
936 fprintf (vect_dump, "not vectorized: empty loop.");
937 return NULL;
938 }
939 }
940 else
941 {
942 struct loop *innerloop = loop->inner;
943 edge entryedge;
944
945 /* Nested loop. We currently require that the loop is doubly-nested,
946 contains a single inner loop, and the number of BBs is exactly 5.
947 Vectorizable outer-loops look like this:
948
949 (pre-header)
950 |
951 header <---+
952 | |
953 inner-loop |
954 | |
955 tail ------+
956 |
957 (exit-bb)
958
959 The inner-loop has the properties expected of inner-most loops
960 as described above. */
961
962 if ((loop->inner)->inner || (loop->inner)->next)
963 {
964 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
965 fprintf (vect_dump, "not vectorized: multiple nested loops.");
966 return NULL;
967 }
968
969 /* Analyze the inner-loop. */
970 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
971 if (!inner_loop_vinfo)
972 {
973 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
974 fprintf (vect_dump, "not vectorized: Bad inner loop.");
975 return NULL;
976 }
977
978 if (!expr_invariant_in_loop_p (loop,
979 LOOP_VINFO_NITERS (inner_loop_vinfo)))
980 {
981 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
982 fprintf (vect_dump,
983 "not vectorized: inner-loop count not invariant.");
984 destroy_loop_vec_info (inner_loop_vinfo, true);
985 return NULL;
986 }
987
988 if (loop->num_nodes != 5)
989 {
990 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
991 fprintf (vect_dump, "not vectorized: control flow in loop.");
992 destroy_loop_vec_info (inner_loop_vinfo, true);
993 return NULL;
994 }
995
996 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
997 entryedge = EDGE_PRED (innerloop->header, 0);
998 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
999 entryedge = EDGE_PRED (innerloop->header, 1);
1000
1001 if (entryedge->src != loop->header
1002 || !single_exit (innerloop)
1003 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1004 {
1005 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1006 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1007 destroy_loop_vec_info (inner_loop_vinfo, true);
1008 return NULL;
1009 }
1010
1011 if (vect_print_dump_info (REPORT_DETAILS))
1012 fprintf (vect_dump, "Considering outer-loop vectorization.");
1013 }
1014
1015 if (!single_exit (loop)
1016 || EDGE_COUNT (loop->header->preds) != 2)
1017 {
1018 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1019 {
1020 if (!single_exit (loop))
1021 fprintf (vect_dump, "not vectorized: multiple exits.");
1022 else if (EDGE_COUNT (loop->header->preds) != 2)
1023 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1024 }
1025 if (inner_loop_vinfo)
1026 destroy_loop_vec_info (inner_loop_vinfo, true);
1027 return NULL;
1028 }
1029
1030   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1031 that the loop is represented as a do-while (with a proper if-guard
1032 before the loop if needed), where the loop header contains all the
1033 executable statements, and the latch is empty. */
1034 if (!empty_block_p (loop->latch)
1035 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1036 {
1037 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1038 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1039 if (inner_loop_vinfo)
1040 destroy_loop_vec_info (inner_loop_vinfo, true);
1041 return NULL;
1042 }
1043
1044 /* Make sure there exists a single-predecessor exit bb: */
1045 if (!single_pred_p (single_exit (loop)->dest))
1046 {
1047 edge e = single_exit (loop);
1048 if (!(e->flags & EDGE_ABNORMAL))
1049 {
1050 split_loop_exit_edge (e);
1051 if (vect_print_dump_info (REPORT_DETAILS))
1052 fprintf (vect_dump, "split exit edge.");
1053 }
1054 else
1055 {
1056 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1057 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1058 if (inner_loop_vinfo)
1059 destroy_loop_vec_info (inner_loop_vinfo, true);
1060 return NULL;
1061 }
1062 }
1063
1064 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1065 if (!loop_cond)
1066 {
1067 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1068 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1069 if (inner_loop_vinfo)
1070 destroy_loop_vec_info (inner_loop_vinfo, true);
1071 return NULL;
1072 }
1073
1074 if (!number_of_iterations)
1075 {
1076 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1077 fprintf (vect_dump,
1078 "not vectorized: number of iterations cannot be computed.");
1079 if (inner_loop_vinfo)
1080 destroy_loop_vec_info (inner_loop_vinfo, true);
1081 return NULL;
1082 }
1083
1084 if (chrec_contains_undetermined (number_of_iterations))
1085 {
1086 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1087 fprintf (vect_dump, "Infinite number of iterations.");
1088 if (inner_loop_vinfo)
1089 destroy_loop_vec_info (inner_loop_vinfo, true);
1090 return NULL;
1091 }
1092
1093 if (!NITERS_KNOWN_P (number_of_iterations))
1094 {
1095 if (vect_print_dump_info (REPORT_DETAILS))
1096 {
1097 fprintf (vect_dump, "Symbolic number of iterations is ");
1098 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1099 }
1100 }
1101 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1102 {
1103 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1104 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1105 if (inner_loop_vinfo)
1106 destroy_loop_vec_info (inner_loop_vinfo, false);
1107 return NULL;
1108 }
1109
1110 loop_vinfo = new_loop_vec_info (loop);
1111 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1112 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1113
1114 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1115
1116   /* CHECKME: May want to keep it around in the future.  */
1117 if (inner_loop_vinfo)
1118 destroy_loop_vec_info (inner_loop_vinfo, false);
1119
1120 gcc_assert (!loop->aux);
1121 loop->aux = loop_vinfo;
1122 return loop_vinfo;
1123 }
1124
1125
1126 /* Get cost by calling cost target builtin. */
1127
1128 static inline int
1129 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1130 {
1131 tree dummy_type = NULL;
1132 int dummy = 0;
1133
1134 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1135 dummy_type, dummy);
1136 }
1137
1138
1139 /* Function vect_analyze_loop_operations.
1140
1141 Scan the loop stmts and make sure they are all vectorizable. */
1142
1143 static bool
1144 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1145 {
1146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1148 int nbbs = loop->num_nodes;
1149 gimple_stmt_iterator si;
1150 unsigned int vectorization_factor = 0;
1151 int i;
1152 gimple phi;
1153 stmt_vec_info stmt_info;
1154 bool need_to_vectorize = false;
1155 int min_profitable_iters;
1156 int min_scalar_loop_bound;
1157 unsigned int th;
1158 bool only_slp_in_loop = true, ok;
1159
1160 if (vect_print_dump_info (REPORT_DETAILS))
1161 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1162
1163 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1164 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1165
1166 for (i = 0; i < nbbs; i++)
1167 {
1168 basic_block bb = bbs[i];
1169
1170 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1171 {
1172 phi = gsi_stmt (si);
1173 ok = true;
1174
1175 stmt_info = vinfo_for_stmt (phi);
1176 if (vect_print_dump_info (REPORT_DETAILS))
1177 {
1178 fprintf (vect_dump, "examining phi: ");
1179 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1180 }
1181
1182 if (! is_loop_header_bb_p (bb))
1183 {
1184 	    /* Inner-loop loop-closed exit phi in outer-loop vectorization
1185 	       (i.e. a phi in the tail of the outer-loop).
1186 	       FORNOW: we currently don't support the case that these phis
1187 	       are not used in the outer-loop (unless it is a double reduction,
1188 	       i.e., this phi is vect_double_reduction_def), because that case
1189 	       requires us to actually do something here.  */
1190 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1191 || STMT_VINFO_LIVE_P (stmt_info))
1192 && STMT_VINFO_DEF_TYPE (stmt_info)
1193 != vect_double_reduction_def)
1194 {
1195 if (vect_print_dump_info (REPORT_DETAILS))
1196 fprintf (vect_dump,
1197 "Unsupported loop-closed phi in outer-loop.");
1198 return false;
1199 }
1200 continue;
1201 }
1202
1203 gcc_assert (stmt_info);
1204
1205 if (STMT_VINFO_LIVE_P (stmt_info))
1206 {
1207 /* FORNOW: not yet supported. */
1208 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1209 fprintf (vect_dump, "not vectorized: value used after loop.");
1210 return false;
1211 }
1212
1213 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1214 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1215 {
1216 /* A scalar-dependence cycle that we don't support. */
1217 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1218 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1219 return false;
1220 }
1221
1222 if (STMT_VINFO_RELEVANT_P (stmt_info))
1223 {
1224 need_to_vectorize = true;
1225 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1226 ok = vectorizable_induction (phi, NULL, NULL);
1227 }
1228
1229 if (!ok)
1230 {
1231 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1232 {
1233 fprintf (vect_dump,
1234 "not vectorized: relevant phi not supported: ");
1235 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1236 }
1237 return false;
1238 }
1239 }
1240
1241 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1242 {
1243 gimple stmt = gsi_stmt (si);
1244 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1245
1246 gcc_assert (stmt_info);
1247
1248 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1249 return false;
1250
1251 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1252 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1253 && !PURE_SLP_STMT (stmt_info))
1254 /* STMT needs both SLP and loop-based vectorization. */
1255 only_slp_in_loop = false;
1256 }
1257 } /* bbs */
1258
1259   /* All operations in the loop are either irrelevant (they deal with loop
1260      control, or are dead), or only used outside the loop and can be moved
1261 out of the loop (e.g. invariants, inductions). The loop can be
1262 optimized away by scalar optimizations. We're better off not
1263 touching this loop. */
1264 if (!need_to_vectorize)
1265 {
1266 if (vect_print_dump_info (REPORT_DETAILS))
1267 fprintf (vect_dump,
1268 "All the computation can be taken out of the loop.");
1269 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1270 fprintf (vect_dump,
1271 "not vectorized: redundant loop. no profit to vectorize.");
1272 return false;
1273 }
1274
1275   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1276      vectorization factor of the loop is the unrolling factor required by the
1277      SLP instances.  If that unrolling factor is 1, we say that we perform
1278      pure SLP on the loop - cross-iteration parallelism is not exploited.  */
1279 if (only_slp_in_loop)
1280 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1281 else
1282 vectorization_factor = least_common_multiple (vectorization_factor,
1283 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
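  /* Editorial note (illustration only): e.g. a loop-based VF of 4 combined
     with an SLP unrolling factor of 2 gives least_common_multiple (4, 2) = 4,
     whereas an SLP unrolling factor of 8 would raise the final VF to 8.  */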
1284
1285 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1286
1287 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1288 && vect_print_dump_info (REPORT_DETAILS))
1289 fprintf (vect_dump,
1290 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1291 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1292
1293 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1294 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1295 {
1296 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1297 fprintf (vect_dump, "not vectorized: iteration count too small.");
1298 if (vect_print_dump_info (REPORT_DETAILS))
1299 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1300 "vectorization factor.");
1301 return false;
1302 }
1303
1304   /* Analyze cost.  Decide if worthwhile to vectorize.  */
1305
1306 /* Once VF is set, SLP costs should be updated since the number of created
1307 vector stmts depends on VF. */
1308 vect_update_slp_costs_according_to_vf (loop_vinfo);
1309
1310 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1311 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1312
1313 if (min_profitable_iters < 0)
1314 {
1315 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1316 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1317 if (vect_print_dump_info (REPORT_DETAILS))
1318 fprintf (vect_dump, "not vectorized: vector version will never be "
1319 "profitable.");
1320 return false;
1321 }
1322
1323 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1324 * vectorization_factor) - 1);
1325
1326 /* Use the cost model only if it is more conservative than user specified
1327 threshold. */
1328
1329 th = (unsigned) min_scalar_loop_bound;
1330 if (min_profitable_iters
1331 && (!min_scalar_loop_bound
1332 || min_profitable_iters > min_scalar_loop_bound))
1333 th = (unsigned) min_profitable_iters;
1334
1335 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1336 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1337 {
1338 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1339 fprintf (vect_dump, "not vectorized: vectorization not "
1340 "profitable.");
1341 if (vect_print_dump_info (REPORT_DETAILS))
1342 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1343 "user specified loop bound parameter or minimum "
1344 "profitable iterations (whichever is more conservative).");
1345 return false;
1346 }
1347
1348 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1349 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1350 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1351 {
1352 if (vect_print_dump_info (REPORT_DETAILS))
1353 fprintf (vect_dump, "epilog loop required.");
1354 if (!vect_can_advance_ivs_p (loop_vinfo))
1355 {
1356 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1357 fprintf (vect_dump,
1358 "not vectorized: can't create epilog loop 1.");
1359 return false;
1360 }
1361 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1362 {
1363 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1364 fprintf (vect_dump,
1365 "not vectorized: can't create epilog loop 2.");
1366 return false;
1367 }
1368 }
1369
1370 return true;
1371 }
1372
1373
1374 /* Function vect_analyze_loop_2.
1375
1376 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1377 for it. The different analyses will record information in the
1378 loop_vec_info struct. */
1379 static bool
1380 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1381 {
1382 bool ok, dummy;
1383 int max_vf = MAX_VECTORIZATION_FACTOR;
1384 int min_vf = 2;
1385
1386 /* Find all data references in the loop (which correspond to vdefs/vuses)
1387 and analyze their evolution in the loop. Also adjust the minimal
1388 vectorization factor according to the loads and stores.
1389
1390      FORNOW: Handle only simple array references whose
1391      alignment can be forced, and aligned pointer-references.  */
1392
1393 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1394 if (!ok)
1395 {
1396 if (vect_print_dump_info (REPORT_DETAILS))
1397 fprintf (vect_dump, "bad data references.");
1398 return false;
1399 }
1400
1401 /* Classify all cross-iteration scalar data-flow cycles.
1402 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1403
1404 vect_analyze_scalar_cycles (loop_vinfo);
1405
1406 vect_pattern_recog (loop_vinfo);
1407
1408 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1409
1410 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1411 if (!ok)
1412 {
1413 if (vect_print_dump_info (REPORT_DETAILS))
1414 fprintf (vect_dump, "unexpected pattern.");
1415 return false;
1416 }
1417
1418 /* Analyze data dependences between the data-refs in the loop
1419 and adjust the maximum vectorization factor according to
1420 the dependences.
1421 FORNOW: fail at the first data dependence that we encounter. */
1422
1423 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1424 if (!ok
1425 || max_vf < min_vf)
1426 {
1427 if (vect_print_dump_info (REPORT_DETAILS))
1428 fprintf (vect_dump, "bad data dependence.");
1429 return false;
1430 }
1431
1432 ok = vect_determine_vectorization_factor (loop_vinfo);
1433 if (!ok)
1434 {
1435 if (vect_print_dump_info (REPORT_DETAILS))
1436 fprintf (vect_dump, "can't determine vectorization factor.");
1437 return false;
1438 }
1439 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1440 {
1441 if (vect_print_dump_info (REPORT_DETAILS))
1442 fprintf (vect_dump, "bad data dependence.");
1443 return false;
1444 }
1445
1446 /* Analyze the alignment of the data-refs in the loop.
1447 Fail if a data reference is found that cannot be vectorized. */
1448
1449 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1450 if (!ok)
1451 {
1452 if (vect_print_dump_info (REPORT_DETAILS))
1453 fprintf (vect_dump, "bad data alignment.");
1454 return false;
1455 }
1456
1457 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1458 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1459
1460 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1461 if (!ok)
1462 {
1463 if (vect_print_dump_info (REPORT_DETAILS))
1464 fprintf (vect_dump, "bad data access.");
1465 return false;
1466 }
1467
1468 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1469 It is important to call pruning after vect_analyze_data_ref_accesses,
1470 since we use grouping information gathered by interleaving analysis. */
1471 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1472 if (!ok)
1473 {
1474 if (vect_print_dump_info (REPORT_DETAILS))
1475 fprintf (vect_dump, "too long list of versioning for alias "
1476 "run-time tests.");
1477 return false;
1478 }
1479
1480 /* This pass will decide on using loop versioning and/or loop peeling in
1481 order to enhance the alignment of data references in the loop. */
1482
1483 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1484 if (!ok)
1485 {
1486 if (vect_print_dump_info (REPORT_DETAILS))
1487 fprintf (vect_dump, "bad data alignment.");
1488 return false;
1489 }
1490
1491 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1492 ok = vect_analyze_slp (loop_vinfo, NULL);
1493 if (ok)
1494 {
1495 /* Decide which possible SLP instances to SLP. */
1496 vect_make_slp_decision (loop_vinfo);
1497
1498 /* Find stmts that need to be both vectorized and SLPed. */
1499 vect_detect_hybrid_slp (loop_vinfo);
1500 }
1501
1502 /* Scan all the operations in the loop and make sure they are
1503 vectorizable. */
1504
1505 ok = vect_analyze_loop_operations (loop_vinfo);
1506 if (!ok)
1507 {
1508 if (vect_print_dump_info (REPORT_DETAILS))
1509 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1510 return false;
1511 }
1512
1513 return true;
1514 }
1515
1516 /* Function vect_analyze_loop.
1517
1518 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1519 for it. The different analyses will record information in the
1520 loop_vec_info struct. */
1521 loop_vec_info
1522 vect_analyze_loop (struct loop *loop)
1523 {
1524 loop_vec_info loop_vinfo;
1525 unsigned int vector_sizes;
1526
1527 /* Autodetect first vector size we try. */
1528 current_vector_size = 0;
1529 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1530
1531 if (vect_print_dump_info (REPORT_DETAILS))
1532 fprintf (vect_dump, "===== analyze_loop_nest =====");
1533
1534 if (loop_outer (loop)
1535 && loop_vec_info_for_loop (loop_outer (loop))
1536 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1537 {
1538 if (vect_print_dump_info (REPORT_DETAILS))
1539 fprintf (vect_dump, "outer-loop already vectorized.");
1540 return NULL;
1541 }
1542
1543 while (1)
1544 {
1545 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1546 loop_vinfo = vect_analyze_loop_form (loop);
1547 if (!loop_vinfo)
1548 {
1549 if (vect_print_dump_info (REPORT_DETAILS))
1550 fprintf (vect_dump, "bad loop form.");
1551 return NULL;
1552 }
1553
1554 if (vect_analyze_loop_2 (loop_vinfo))
1555 {
1556 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1557
1558 return loop_vinfo;
1559 }
1560
1561 destroy_loop_vec_info (loop_vinfo, true);
1562
1563 vector_sizes &= ~current_vector_size;
1564 if (vector_sizes == 0
1565 || current_vector_size == 0)
1566 return NULL;
1567
1568 /* Try the next biggest vector size. */
1569 current_vector_size = 1 << floor_log2 (vector_sizes);
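      /* Editorial illustration: if the target advertises 32-byte and 16-byte
	 vectors (vector_sizes == 0x30) and the 32-byte attempt just failed,
	 the 32-byte bit was cleared above, so 1 << floor_log2 (0x10) selects
	 16 bytes for this retry.  */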
1570 if (vect_print_dump_info (REPORT_DETAILS))
1571 fprintf (vect_dump, "***** Re-trying analysis with "
1572 "vector size %d\n", current_vector_size);
1573 }
1574 }
1575
1576
1577 /* Function reduction_code_for_scalar_code
1578
1579 Input:
1580    CODE - tree_code of a reduction operation.
1581
1582 Output:
1583 REDUC_CODE - the corresponding tree-code to be used to reduce the
1584 vector of partial results into a single scalar result (which
1585 will also reside in a vector) or ERROR_MARK if the operation is
1586        a supported reduction operation, but does not have such a tree-code.
1587
1588    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
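/* Editorial example (not part of the original source): for a sum reduction
   (CODE == PLUS_EXPR) the epilogue can use a single REDUC_PLUS_EXPR to fold
   the vector of partial sums into one scalar; for MULT_EXPR no such tree
   code exists, so ERROR_MARK is returned and the caller must reduce the
   partial results by other means.  */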
1589
1590 static bool
1591 reduction_code_for_scalar_code (enum tree_code code,
1592 enum tree_code *reduc_code)
1593 {
1594 switch (code)
1595 {
1596 case MAX_EXPR:
1597 *reduc_code = REDUC_MAX_EXPR;
1598 return true;
1599
1600 case MIN_EXPR:
1601 *reduc_code = REDUC_MIN_EXPR;
1602 return true;
1603
1604 case PLUS_EXPR:
1605 *reduc_code = REDUC_PLUS_EXPR;
1606 return true;
1607
1608 case MULT_EXPR:
1609 case MINUS_EXPR:
1610 case BIT_IOR_EXPR:
1611 case BIT_XOR_EXPR:
1612 case BIT_AND_EXPR:
1613 *reduc_code = ERROR_MARK;
1614 return true;
1615
1616 default:
1617 return false;
1618 }
1619 }
1620
1621
1622 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1623 STMT is printed with a message MSG. */
1624
1625 static void
1626 report_vect_op (gimple stmt, const char *msg)
1627 {
1628 fprintf (vect_dump, "%s", msg);
1629 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1630 }
1631
1632
1633 /* Function vect_is_simple_reduction_1
1634
1635 (1) Detect a cross-iteration def-use cycle that represents a simple
1636 reduction computation. We look for the following pattern:
1637
1638 loop_header:
1639 a1 = phi < a0, a2 >
1640 a3 = ...
1641 a2 = operation (a3, a1)
1642
1643 such that:
1644 1. operation is commutative and associative and it is safe to
1645 change the order of the computation (if CHECK_REDUCTION is true)
1646 2. no uses for a2 in the loop (a2 is used out of the loop)
1647 3. no uses of a1 in the loop besides the reduction operation.
1648
1649 Condition 1 is tested here.
1650 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1651
1652 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1653 nested cycles, if CHECK_REDUCTION is false.
1654
1655 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1656 reductions:
1657
1658 a1 = phi < a0, a2 >
1659 inner loop (def of a3)
1660 a2 = phi < a3 >
1661
1662    If MODIFY is true it also tries to rework the code in-place to enable
1663    detection of more reduction patterns.  For the time being we rewrite
1664    "res -= RHS" into "res += -RHS" when it seems worthwhile.
1665 */
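/* Editorial example (illustration only) of case (3), a double reduction as
   seen when vectorizing the outer loop:

   sum = 0;
   for (i = 0; i < N; i++)
     for (j = 0; j < M; j++)
       sum += a[i][j];

   The outer-loop header phi of 'sum' is fed (through the latch) by the
   inner-loop's loop-closed exit phi, giving the phi-of-phi pattern shown
   above.  */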
1666
1667 static gimple
1668 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1669 bool check_reduction, bool *double_reduc,
1670 bool modify)
1671 {
1672 struct loop *loop = (gimple_bb (phi))->loop_father;
1673 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1674 edge latch_e = loop_latch_edge (loop);
1675 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1676 gimple def_stmt, def1 = NULL, def2 = NULL;
1677 enum tree_code orig_code, code;
1678 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1679 tree type;
1680 int nloop_uses;
1681 tree name;
1682 imm_use_iterator imm_iter;
1683 use_operand_p use_p;
1684 bool phi_def;
1685
1686 *double_reduc = false;
1687
1688 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1689 otherwise, we assume outer loop vectorization. */
1690 gcc_assert ((check_reduction && loop == vect_loop)
1691 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1692
1693 name = PHI_RESULT (phi);
1694 nloop_uses = 0;
1695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1696 {
1697 gimple use_stmt = USE_STMT (use_p);
1698 if (is_gimple_debug (use_stmt))
1699 continue;
1700 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1701 && vinfo_for_stmt (use_stmt)
1702 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1703 nloop_uses++;
1704 if (nloop_uses > 1)
1705 {
1706 if (vect_print_dump_info (REPORT_DETAILS))
1707 fprintf (vect_dump, "reduction used in loop.");
1708 return NULL;
1709 }
1710 }
1711
1712 if (TREE_CODE (loop_arg) != SSA_NAME)
1713 {
1714 if (vect_print_dump_info (REPORT_DETAILS))
1715 {
1716 fprintf (vect_dump, "reduction: not ssa_name: ");
1717 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1718 }
1719 return NULL;
1720 }
1721
1722 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1723 if (!def_stmt)
1724 {
1725 if (vect_print_dump_info (REPORT_DETAILS))
1726 fprintf (vect_dump, "reduction: no def_stmt.");
1727 return NULL;
1728 }
1729
1730 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1731 {
1732 if (vect_print_dump_info (REPORT_DETAILS))
1733 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1734 return NULL;
1735 }
1736
1737 if (is_gimple_assign (def_stmt))
1738 {
1739 name = gimple_assign_lhs (def_stmt);
1740 phi_def = false;
1741 }
1742 else
1743 {
1744 name = PHI_RESULT (def_stmt);
1745 phi_def = true;
1746 }
1747
1748 nloop_uses = 0;
1749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1750 {
1751 gimple use_stmt = USE_STMT (use_p);
1752 if (is_gimple_debug (use_stmt))
1753 continue;
1754 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1755 && vinfo_for_stmt (use_stmt)
1756 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1757 nloop_uses++;
1758 if (nloop_uses > 1)
1759 {
1760 if (vect_print_dump_info (REPORT_DETAILS))
1761 fprintf (vect_dump, "reduction used in loop.");
1762 return NULL;
1763 }
1764 }
1765
1766 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1767 defined in the inner loop. */
1768 if (phi_def)
1769 {
1770 op1 = PHI_ARG_DEF (def_stmt, 0);
1771
1772 if (gimple_phi_num_args (def_stmt) != 1
1773 || TREE_CODE (op1) != SSA_NAME)
1774 {
1775 if (vect_print_dump_info (REPORT_DETAILS))
1776 fprintf (vect_dump, "unsupported phi node definition.");
1777
1778 return NULL;
1779 }
1780
1781 def1 = SSA_NAME_DEF_STMT (op1);
1782 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1783 && loop->inner
1784 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1785 && is_gimple_assign (def1))
1786 {
1787 if (vect_print_dump_info (REPORT_DETAILS))
1788 report_vect_op (def_stmt, "detected double reduction: ");
1789
1790 *double_reduc = true;
1791 return def_stmt;
1792 }
1793
1794 return NULL;
1795 }
1796
1797 code = orig_code = gimple_assign_rhs_code (def_stmt);
1798
1799   /* We can handle "res -= x[i]", which is non-associative, by
1800      simply rewriting it into "res += -x[i]".  Avoid changing the
1801      gimple instruction during the first simple checks and only do this
1802      if we're allowed to change the code at all.  */
1803 if (code == MINUS_EXPR && modify)
1804 code = PLUS_EXPR;
1805
1806 if (check_reduction
1807 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1808 {
1809 if (vect_print_dump_info (REPORT_DETAILS))
1810 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1811 return NULL;
1812 }
1813
1814 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1815 {
1816 if (code != COND_EXPR)
1817 {
1818 if (vect_print_dump_info (REPORT_DETAILS))
1819 report_vect_op (def_stmt, "reduction: not binary operation: ");
1820
1821 return NULL;
1822 }
1823
1824 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1825 if (COMPARISON_CLASS_P (op3))
1826 {
1827 op4 = TREE_OPERAND (op3, 1);
1828 op3 = TREE_OPERAND (op3, 0);
1829 }
1830
1831 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1832 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1833
1834 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1835 {
1836 if (vect_print_dump_info (REPORT_DETAILS))
1837 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1838
1839 return NULL;
1840 }
1841 }
1842 else
1843 {
1844 op1 = gimple_assign_rhs1 (def_stmt);
1845 op2 = gimple_assign_rhs2 (def_stmt);
1846
1847 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1848 {
1849 if (vect_print_dump_info (REPORT_DETAILS))
1850 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1851
1852 return NULL;
1853 }
1854 }
1855
1856 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1857 if ((TREE_CODE (op1) == SSA_NAME
1858 && !types_compatible_p (type,TREE_TYPE (op1)))
1859 || (TREE_CODE (op2) == SSA_NAME
1860 && !types_compatible_p (type, TREE_TYPE (op2)))
1861 || (op3 && TREE_CODE (op3) == SSA_NAME
1862 && !types_compatible_p (type, TREE_TYPE (op3)))
1863 || (op4 && TREE_CODE (op4) == SSA_NAME
1864 && !types_compatible_p (type, TREE_TYPE (op4))))
1865 {
1866 if (vect_print_dump_info (REPORT_DETAILS))
1867 {
1868 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1869 print_generic_expr (vect_dump, type, TDF_SLIM);
1870 fprintf (vect_dump, ", operands types: ");
1871 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1872 fprintf (vect_dump, ",");
1873 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1874 if (op3)
1875 {
1876 fprintf (vect_dump, ",");
1877 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1878 }
1879
1880 if (op4)
1881 {
1882 fprintf (vect_dump, ",");
1883 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1884 }
1885 }
1886
1887 return NULL;
1888 }
1889
1890 /* Check that it's ok to change the order of the computation.
1891 Generally, when vectorizing a reduction we change the order of the
1892 computation. This may change the behavior of the program in some
1893 cases, so we need to check that this is ok. One exception is when
1894 vectorizing an outer-loop: the inner-loop is executed sequentially,
1895 and therefore vectorizing reductions in the inner-loop during
1896 outer-loop vectorization is safe. */
1897
1898 /* CHECKME: check for !flag_finite_math_only too? */
1899 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1900 && check_reduction)
1901 {
1902 /* Changing the order of operations changes the semantics. */
1903 if (vect_print_dump_info (REPORT_DETAILS))
1904 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1905 return NULL;
1906 }
1907 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1908 && check_reduction)
1909 {
1910 /* Changing the order of operations changes the semantics. */
1911 if (vect_print_dump_info (REPORT_DETAILS))
1912 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1913 return NULL;
1914 }
1915 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1916 {
1917 /* Changing the order of operations changes the semantics. */
1918 if (vect_print_dump_info (REPORT_DETAILS))
1919 report_vect_op (def_stmt,
1920 "reduction: unsafe fixed-point math optimization: ");
1921 return NULL;
1922 }
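  /* For instance, with IEEE float addition (1.0e20f + -1.0e20f) + 3.0f
     evaluates to 3.0f while 1.0e20f + (-1.0e20f + 3.0f) evaluates to 0.0f,
     so reordering a float sum is only acceptable under -fassociative-math;
     the checks above reject such reductions otherwise.  */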
1923
1924 /* If we detected "res -= x[i]" earlier, rewrite it into
1925 "res += -x[i]" now. If this turns out to be useless reassoc
1926 will clean it up again. */
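  /* On a hypothetical statement (SSA names invented for illustration), the
     rewrite below turns

         res_2 = res_1 - _5;

     into

         _6 = -_5;
         res_2 = res_1 + _6;

     by inserting the negate and switching the rhs code to PLUS_EXPR.  */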
1927 if (orig_code == MINUS_EXPR)
1928 {
1929 tree rhs = gimple_assign_rhs2 (def_stmt);
1930 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1931 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1932 rhs, NULL);
1933 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1934 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1935 loop_info, NULL));
1936 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1937 gimple_assign_set_rhs2 (def_stmt, negrhs);
1938 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1939 update_stmt (def_stmt);
1940 }
1941
1942 /* Reduction is safe. We're dealing with one of the following:
1943 1) integer arithmetic and no trapv
1944 2) floating point arithmetic, and special flags permit this optimization
1945 3) nested cycle (i.e., outer loop vectorization). */
1946 if (TREE_CODE (op1) == SSA_NAME)
1947 def1 = SSA_NAME_DEF_STMT (op1);
1948
1949 if (TREE_CODE (op2) == SSA_NAME)
1950 def2 = SSA_NAME_DEF_STMT (op2);
1951
1952 if (code != COND_EXPR
1953 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1954 {
1955 if (vect_print_dump_info (REPORT_DETAILS))
1956 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1957 return NULL;
1958 }
1959
1960 /* Check that one def is the reduction def, defined by PHI,
1961 the other def is either defined in the loop ("vect_internal_def"),
1962 or it's an induction (defined by a loop-header phi-node). */
1963
1964 if (def2 && def2 == phi
1965 && (code == COND_EXPR
1966 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1967 && (is_gimple_assign (def1)
1968 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1969 == vect_induction_def
1970 || (gimple_code (def1) == GIMPLE_PHI
1971 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1972 == vect_internal_def
1973 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1974 {
1975 if (vect_print_dump_info (REPORT_DETAILS))
1976 report_vect_op (def_stmt, "detected reduction: ");
1977 return def_stmt;
1978 }
1979 else if (def1 && def1 == phi
1980 && (code == COND_EXPR
1981 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1982 && (is_gimple_assign (def2)
1983 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1984 == vect_induction_def
1985 || (gimple_code (def2) == GIMPLE_PHI
1986 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1987 == vect_internal_def
1988 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1989 {
1990 if (check_reduction)
1991 {
1992 /* Swap operands (just for simplicity - so that the rest of the code
1993 can assume that the reduction variable is always the last (second)
1994 argument). */
1995 if (vect_print_dump_info (REPORT_DETAILS))
1996 report_vect_op (def_stmt,
1997 "detected reduction: need to swap operands: ");
1998
1999 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2000 gimple_assign_rhs2_ptr (def_stmt));
2001 }
2002 else
2003 {
2004 if (vect_print_dump_info (REPORT_DETAILS))
2005 report_vect_op (def_stmt, "detected reduction: ");
2006 }
2007
2008 return def_stmt;
2009 }
2010 else
2011 {
2012 if (vect_print_dump_info (REPORT_DETAILS))
2013 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2014
2015 return NULL;
2016 }
2017 }
2018
2019 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2020 in-place. Arguments as there. */
2021
2022 static gimple
2023 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2024 bool check_reduction, bool *double_reduc)
2025 {
2026 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2027 double_reduc, false);
2028 }
2029
2030 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2031 in-place if it enables detection of more reductions. Arguments
2032 as there. */
2033
2034 gimple
2035 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2036 bool check_reduction, bool *double_reduc)
2037 {
2038 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2039 double_reduc, true);
2040 }
2041
2042 /* Calculate the cost of one scalar iteration of the loop. */
2043 int
2044 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2045 {
2046 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2047 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2048 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2049 int innerloop_iters, i, stmt_cost;
2050
2051 /* Count statements in scalar loop. Using this as scalar cost for a single
2052 iteration for now.
2053
2054 TODO: Add outer loop support.
2055
2056 TODO: Consider assigning different costs to different scalar
2057 statements. */
2058
2059 /* FORNOW. */
2060 innerloop_iters = 1;
2061 if (loop->inner)
2062 innerloop_iters = 50; /* FIXME */
2063
2064 for (i = 0; i < nbbs; i++)
2065 {
2066 gimple_stmt_iterator si;
2067 basic_block bb = bbs[i];
2068
2069 if (bb->loop_father == loop->inner)
2070 factor = innerloop_iters;
2071 else
2072 factor = 1;
2073
2074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2075 {
2076 gimple stmt = gsi_stmt (si);
2077 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2078
2079 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2080 continue;
2081
2082 /* Skip stmts that are not vectorized inside the loop. */
2083 if (stmt_info
2084 && !STMT_VINFO_RELEVANT_P (stmt_info)
2085 && (!STMT_VINFO_LIVE_P (stmt_info)
2086 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2087 continue;
2088
2089 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2090 {
2091 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2092 stmt_cost = vect_get_cost (scalar_load);
2093 else
2094 stmt_cost = vect_get_cost (scalar_store);
2095 }
2096 else
2097 stmt_cost = vect_get_cost (scalar_stmt);
2098
2099 scalar_single_iter_cost += stmt_cost * factor;
2100 }
2101 }
2102 return scalar_single_iter_cost;
2103 }
2104
2105 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2106 int
2107 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2108 int *peel_iters_epilogue,
2109 int scalar_single_iter_cost)
2110 {
2111 int peel_guard_costs = 0;
2112 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2113
2114 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2115 {
2116 *peel_iters_epilogue = vf/2;
2117 if (vect_print_dump_info (REPORT_COST))
2118 fprintf (vect_dump, "cost model: "
2119 "epilogue peel iters set to vf/2 because "
2120                "loop iterations are unknown.");
2121
2122       /* If peeled iterations are known but the number of scalar loop
2123          iterations is unknown, count a taken branch per peeled loop.  */
2124 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2125 }
2126 else
2127 {
2128 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2129 peel_iters_prologue = niters < peel_iters_prologue ?
2130 niters : peel_iters_prologue;
2131 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2132 }
2133
2134 return (peel_iters_prologue * scalar_single_iter_cost)
2135 + (*peel_iters_epilogue * scalar_single_iter_cost)
2136 + peel_guard_costs;
2137 }
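/* As a hypothetical example of the known-iteration-count branch above: with
   NITERS = 37, VF = 4 and PEEL_ITERS_PROLOGUE = 2, *PEEL_ITERS_EPILOGUE
   becomes (37 - 2) % 4 = 3, and the returned cost is
   (2 + 3) * SCALAR_SINGLE_ITER_COST, with no guard cost added in that
   branch.  */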
2138
2139 /* Function vect_estimate_min_profitable_iters
2140
2141 Return the number of iterations required for the vector version of the
2142 loop to be profitable relative to the cost of the scalar version of the
2143 loop.
2144
2145 TODO: Take profile info into account before making vectorization
2146 decisions, if available. */
2147
2148 int
2149 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2150 {
2151 int i;
2152 int min_profitable_iters;
2153 int peel_iters_prologue;
2154 int peel_iters_epilogue;
2155 int vec_inside_cost = 0;
2156 int vec_outside_cost = 0;
2157 int scalar_single_iter_cost = 0;
2158 int scalar_outside_cost = 0;
2159 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2160 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2161 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2162 int nbbs = loop->num_nodes;
2163 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2164 int peel_guard_costs = 0;
2165 int innerloop_iters = 0, factor;
2166 VEC (slp_instance, heap) *slp_instances;
2167 slp_instance instance;
2168
2169 /* Cost model disabled. */
2170 if (!flag_vect_cost_model)
2171 {
2172 if (vect_print_dump_info (REPORT_COST))
2173 fprintf (vect_dump, "cost model disabled.");
2174 return 0;
2175 }
2176
2177 /* Requires loop versioning tests to handle misalignment. */
2178 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2179 {
2180 /* FIXME: Make cost depend on complexity of individual check. */
2181 vec_outside_cost +=
2182 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2183 if (vect_print_dump_info (REPORT_COST))
2184 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2185 "versioning to treat misalignment.\n");
2186 }
2187
2188 /* Requires loop versioning with alias checks. */
2189 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2190 {
2191 /* FIXME: Make cost depend on complexity of individual check. */
2192 vec_outside_cost +=
2193 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2194 if (vect_print_dump_info (REPORT_COST))
2195 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2196 "versioning aliasing.\n");
2197 }
2198
2199 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2200 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2201 vec_outside_cost += vect_get_cost (cond_branch_taken);
2202
2203 /* Count statements in scalar loop. Using this as scalar cost for a single
2204 iteration for now.
2205
2206 TODO: Add outer loop support.
2207
2208 TODO: Consider assigning different costs to different scalar
2209 statements. */
2210
2211 /* FORNOW. */
2212 if (loop->inner)
2213 innerloop_iters = 50; /* FIXME */
2214
2215 for (i = 0; i < nbbs; i++)
2216 {
2217 gimple_stmt_iterator si;
2218 basic_block bb = bbs[i];
2219
2220 if (bb->loop_father == loop->inner)
2221 factor = innerloop_iters;
2222 else
2223 factor = 1;
2224
2225 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2226 {
2227 gimple stmt = gsi_stmt (si);
2228 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2229 /* Skip stmts that are not vectorized inside the loop. */
2230 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2231 && (!STMT_VINFO_LIVE_P (stmt_info)
2232 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2233 continue;
2234 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2235 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2236 some of the "outside" costs are generated inside the outer-loop. */
2237 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2238 }
2239 }
2240
2241 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2242
2243 /* Add additional cost for the peeled instructions in prologue and epilogue
2244 loop.
2245
2246 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2247 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2248
2249 TODO: Build an expression that represents peel_iters for prologue and
2250 epilogue to be used in a run-time test. */
2251
2252 if (npeel < 0)
2253 {
2254 peel_iters_prologue = vf/2;
2255 if (vect_print_dump_info (REPORT_COST))
2256 fprintf (vect_dump, "cost model: "
2257 "prologue peel iters set to vf/2.");
2258
2259       /* If peeling for alignment is unknown, the loop bound of the main loop
2260          becomes unknown.  */
2261 peel_iters_epilogue = vf/2;
2262 if (vect_print_dump_info (REPORT_COST))
2263 fprintf (vect_dump, "cost model: "
2264 "epilogue peel iters set to vf/2 because "
2265                  "peeling for alignment is unknown.");
2266
2267 /* If peeled iterations are unknown, count a taken branch and a not taken
2268 branch per peeled loop. Even if scalar loop iterations are known,
2269 vector iterations are not known since peeled prologue iterations are
2270 not known. Hence guards remain the same. */
2271 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2272 + vect_get_cost (cond_branch_not_taken));
2273 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2274 + (peel_iters_epilogue * scalar_single_iter_cost)
2275 + peel_guard_costs;
2276 }
2277 else
2278 {
2279 peel_iters_prologue = npeel;
2280 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2281 peel_iters_prologue, &peel_iters_epilogue,
2282 scalar_single_iter_cost);
2283 }
2284
2285 /* FORNOW: The scalar outside cost is incremented in one of the
2286 following ways:
2287
2288 1. The vectorizer checks for alignment and aliasing and generates
2289 a condition that allows dynamic vectorization. A cost model
2290 check is ANDED with the versioning condition. Hence scalar code
2291 path now has the added cost of the versioning check.
2292
2293 if (cost > th & versioning_check)
2294 jmp to vector code
2295
2296       Hence the run-time scalar cost is incremented by a not-taken branch cost.
2297
2298 2. The vectorizer then checks if a prologue is required. If the
2299 cost model check was not done before during versioning, it has to
2300 be done before the prologue check.
2301
2302 if (cost <= th)
2303 prologue = scalar_iters
2304 if (prologue == 0)
2305 jmp to vector code
2306 else
2307 execute prologue
2308 if (prologue == num_iters)
2309 go to exit
2310
2311 Hence the run-time scalar cost is incremented by a taken branch,
2312 plus a not-taken branch, plus a taken branch cost.
2313
2314 3. The vectorizer then checks if an epilogue is required. If the
2315 cost model check was not done before during prologue check, it
2316 has to be done with the epilogue check.
2317
2318 if (prologue == 0)
2319 jmp to vector code
2320 else
2321 execute prologue
2322 if (prologue == num_iters)
2323 go to exit
2324 vector code:
2325 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2326 jmp to epilogue
2327
2328 Hence the run-time scalar cost should be incremented by 2 taken
2329 branches.
2330
2331      TODO: The back end may reorder the BBs differently and reverse
2332 conditions/branch directions. Change the estimates below to
2333 something more reasonable. */
2334
2335 /* If the number of iterations is known and we do not do versioning, we can
2336 decide whether to vectorize at compile time. Hence the scalar version
2337      does not carry cost model guard costs.  */
2338 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2339 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2340 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2341 {
2342 /* Cost model check occurs at versioning. */
2343 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2344 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2345 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2346 else
2347 {
2348 /* Cost model check occurs at prologue generation. */
2349 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2350 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2351 + vect_get_cost (cond_branch_not_taken);
2352 /* Cost model check occurs at epilogue generation. */
2353 else
2354 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2355 }
2356 }
2357
2358 /* Add SLP costs. */
2359 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2360 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2361 {
2362 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2363 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2364 }
2365
2366 /* Calculate number of iterations required to make the vector version
2367 profitable, relative to the loop bodies only. The following condition
2368 must hold true:
2369 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2370 where
2371 SIC = scalar iteration cost, VIC = vector iteration cost,
2372 VOC = vector outside cost, VF = vectorization factor,
2373      PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
2374      SOC = scalar outside cost for the run-time cost model check.  */
2375
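  /* A hypothetical worked instance of the condition above (all costs invented
     purely for illustration): with SIC = 4, VIC = 6, VOC = 14, SOC = 0,
     VF = 4 and PL_ITERS = EP_ITERS = 2, the code below computes

         min_profitable_iters = ((14 - 0) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6)
                              = 32 / 10 = 3 (truncating division),

     then bumps it to 4 because 4 * 4 * 3 <= 6 * 3 + (14 - 0) * 4, clamps it
     to at least VF, and finally decrements it to form the runtime guard
     threshold.  */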
2376 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2377 {
2378 if (vec_outside_cost <= 0)
2379 min_profitable_iters = 1;
2380 else
2381 {
2382 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2383 - vec_inside_cost * peel_iters_prologue
2384 - vec_inside_cost * peel_iters_epilogue)
2385 / ((scalar_single_iter_cost * vf)
2386 - vec_inside_cost);
2387
2388 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2389 <= ((vec_inside_cost * min_profitable_iters)
2390 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2391 min_profitable_iters++;
2392 }
2393 }
2394 /* vector version will never be profitable. */
2395 else
2396 {
2397 if (vect_print_dump_info (REPORT_COST))
2398 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2399 "divided by the scalar iteration cost = %d "
2400              "is greater than or equal to the vectorization factor = %d.",
2401 vec_inside_cost, scalar_single_iter_cost, vf);
2402 return -1;
2403 }
2404
2405 if (vect_print_dump_info (REPORT_COST))
2406 {
2407 fprintf (vect_dump, "Cost model analysis: \n");
2408 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2409 vec_inside_cost);
2410 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2411 vec_outside_cost);
2412 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2413 scalar_single_iter_cost);
2414 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2415 fprintf (vect_dump, " prologue iterations: %d\n",
2416 peel_iters_prologue);
2417 fprintf (vect_dump, " epilogue iterations: %d\n",
2418 peel_iters_epilogue);
2419 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2420 min_profitable_iters);
2421 }
2422
2423 min_profitable_iters =
2424 min_profitable_iters < vf ? vf : min_profitable_iters;
2425
2426 /* Because the condition we create is:
2427 if (niters <= min_profitable_iters)
2428 then skip the vectorized loop. */
2429 min_profitable_iters--;
2430
2431 if (vect_print_dump_info (REPORT_COST))
2432 fprintf (vect_dump, " Profitability threshold = %d\n",
2433 min_profitable_iters);
2434
2435 return min_profitable_iters;
2436 }
2437
2438
2439 /* TODO: There is a close dependency between the vect_model_*_cost and the
2440    vectorizable_* functions.  Design this better to avoid maintenance issues.  */
2441
2442 /* Function vect_model_reduction_cost.
2443
2444 Models cost for a reduction operation, including the vector ops
2445 generated within the strip-mine loop, the initial definition before
2446 the loop, and the epilogue code that must be generated. */
2447
2448 static bool
2449 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2450 int ncopies)
2451 {
2452 int outer_cost = 0;
2453 enum tree_code code;
2454 optab optab;
2455 tree vectype;
2456 gimple stmt, orig_stmt;
2457 tree reduction_op;
2458 enum machine_mode mode;
2459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2461
2462
2463 /* Cost of reduction op inside loop. */
2464 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2465 += ncopies * vect_get_cost (vector_stmt);
2466
2467 stmt = STMT_VINFO_STMT (stmt_info);
2468
2469 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2470 {
2471 case GIMPLE_SINGLE_RHS:
2472 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2473 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2474 break;
2475 case GIMPLE_UNARY_RHS:
2476 reduction_op = gimple_assign_rhs1 (stmt);
2477 break;
2478 case GIMPLE_BINARY_RHS:
2479 reduction_op = gimple_assign_rhs2 (stmt);
2480 break;
2481 default:
2482 gcc_unreachable ();
2483 }
2484
2485 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2486 if (!vectype)
2487 {
2488 if (vect_print_dump_info (REPORT_COST))
2489 {
2490 fprintf (vect_dump, "unsupported data-type ");
2491 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2492 }
2493 return false;
2494 }
2495
2496 mode = TYPE_MODE (vectype);
2497 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2498
2499 if (!orig_stmt)
2500 orig_stmt = STMT_VINFO_STMT (stmt_info);
2501
2502 code = gimple_assign_rhs_code (orig_stmt);
2503
2504 /* Add in cost for initial definition. */
2505 outer_cost += vect_get_cost (scalar_to_vec);
2506
2507 /* Determine cost of epilogue code.
2508
2509 We have a reduction operator that will reduce the vector in one statement.
2510 Also requires scalar extract. */
2511
2512 if (!nested_in_vect_loop_p (loop, orig_stmt))
2513 {
2514 if (reduc_code != ERROR_MARK)
2515 outer_cost += vect_get_cost (vector_stmt)
2516 + vect_get_cost (vec_to_scalar);
2517 else
2518 {
2519 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2520 tree bitsize =
2521 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2522 int element_bitsize = tree_low_cst (bitsize, 1);
2523 int nelements = vec_size_in_bits / element_bitsize;
2524
2525 optab = optab_for_tree_code (code, vectype, optab_default);
2526
2527 /* We have a whole vector shift available. */
2528 if (VECTOR_MODE_P (mode)
2529 && optab_handler (optab, mode) != CODE_FOR_nothing
2530 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2531 /* Final reduction via vector shifts and the reduction operator. Also
2532 requires scalar extract. */
2533 outer_cost += ((exact_log2(nelements) * 2)
2534 * vect_get_cost (vector_stmt)
2535 + vect_get_cost (vec_to_scalar));
2536 else
2537 /* Use extracts and reduction op for final reduction. For N elements,
2538 we have N extracts and N-1 reduction ops. */
2539 outer_cost += ((nelements + nelements - 1)
2540 * vect_get_cost (vector_stmt));
2541 }
2542 }
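  /* As a hypothetical example, for a 128-bit vector of four 32-bit elements
     the epilogue cost added above is: one vector op plus one extract when a
     reduc_code is available; log2 (4) * 2 = 4 vector ops plus one extract
     when only a whole-vector shift is available; and 4 + 3 = 7 vector ops
     when falling back to element extracts combined with scalar ops.  */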
2543
2544 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2545
2546 if (vect_print_dump_info (REPORT_COST))
2547 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2548 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2549 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2550
2551 return true;
2552 }
2553
2554
2555 /* Function vect_model_induction_cost.
2556
2557 Models cost for induction operations. */
2558
2559 static void
2560 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2561 {
2562 /* loop cost for vec_loop. */
2563 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2564 = ncopies * vect_get_cost (vector_stmt);
2565 /* prologue cost for vec_init and vec_step. */
2566 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2567 = 2 * vect_get_cost (scalar_to_vec);
2568
2569 if (vect_print_dump_info (REPORT_COST))
2570 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2571 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2572 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2573 }
2574
2575
2576 /* Function get_initial_def_for_induction
2577
2578 Input:
2579    IV_PHI - the loop-header phi stmt that defines the induction variable in
2580             the loop and carries its initial value
2581
2582 Output:
2583 Return a vector variable, initialized with the first VF values of
2584 the induction variable. E.g., for an iv with IV_PHI='X' and
2585 evolution S, for a vector of 4 units, we want to return:
2586 [X, X + S, X + 2*S, X + 3*S]. */
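/* For instance (values chosen purely for illustration): with X = 5, S = 3 and
   VF = nunits = 4, the function builds vec_init = {5, 8, 11, 14} in the
   preheader, vec_step = {12, 12, 12, 12} (VF * S replicated), and the in-loop
   update vec_loop = vec_iv + vec_step, so every lane advances by VF * S per
   vector iteration.  */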
2587
2588 static tree
2589 get_initial_def_for_induction (gimple iv_phi)
2590 {
2591 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2592 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2595 tree vectype;
2596 int nunits;
2597 edge pe = loop_preheader_edge (loop);
2598 struct loop *iv_loop;
2599 basic_block new_bb;
2600 tree vec, vec_init, vec_step, t;
2601 tree access_fn;
2602 tree new_var;
2603 tree new_name;
2604 gimple init_stmt, induction_phi, new_stmt;
2605 tree induc_def, vec_def, vec_dest;
2606 tree init_expr, step_expr;
2607 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2608 int i;
2609 bool ok;
2610 int ncopies;
2611 tree expr;
2612 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2613 bool nested_in_vect_loop = false;
2614 gimple_seq stmts = NULL;
2615 imm_use_iterator imm_iter;
2616 use_operand_p use_p;
2617 gimple exit_phi;
2618 edge latch_e;
2619 tree loop_arg;
2620 gimple_stmt_iterator si;
2621 basic_block bb = gimple_bb (iv_phi);
2622 tree stepvectype;
2623
2624 vectype = get_vectype_for_scalar_type (scalar_type);
2625 gcc_assert (vectype);
2626 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2627 ncopies = vf / nunits;
2628
2629 gcc_assert (phi_info);
2630 gcc_assert (ncopies >= 1);
2631
2632 /* Find the first insertion point in the BB. */
2633 si = gsi_after_labels (bb);
2634
2635 if (INTEGRAL_TYPE_P (scalar_type))
2636 step_expr = build_int_cst (scalar_type, 0);
2637 else if (POINTER_TYPE_P (scalar_type))
2638 step_expr = size_zero_node;
2639 else
2640 step_expr = build_real (scalar_type, dconst0);
2641
2642 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2643 if (nested_in_vect_loop_p (loop, iv_phi))
2644 {
2645 nested_in_vect_loop = true;
2646 iv_loop = loop->inner;
2647 }
2648 else
2649 iv_loop = loop;
2650 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2651
2652 latch_e = loop_latch_edge (iv_loop);
2653 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2654
2655 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2656 gcc_assert (access_fn);
2657 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2658 &init_expr, &step_expr);
2659 gcc_assert (ok);
2660 pe = loop_preheader_edge (iv_loop);
2661
2662 /* Create the vector that holds the initial_value of the induction. */
2663 if (nested_in_vect_loop)
2664 {
2665 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2666 been created during vectorization of previous stmts. We obtain it
2667 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2668 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2669 loop_preheader_edge (iv_loop));
2670 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2671 }
2672 else
2673 {
2674 /* iv_loop is the loop to be vectorized. Create:
2675 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2676 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2677 add_referenced_var (new_var);
2678
2679 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2680 if (stmts)
2681 {
2682 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2683 gcc_assert (!new_bb);
2684 }
2685
2686 t = NULL_TREE;
2687 t = tree_cons (NULL_TREE, init_expr, t);
2688 for (i = 1; i < nunits; i++)
2689 {
2690 /* Create: new_name_i = new_name + step_expr */
2691 enum tree_code code = POINTER_TYPE_P (scalar_type)
2692 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2693 init_stmt = gimple_build_assign_with_ops (code, new_var,
2694 new_name, step_expr);
2695 new_name = make_ssa_name (new_var, init_stmt);
2696 gimple_assign_set_lhs (init_stmt, new_name);
2697
2698 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2699 gcc_assert (!new_bb);
2700
2701 if (vect_print_dump_info (REPORT_DETAILS))
2702 {
2703 fprintf (vect_dump, "created new init_stmt: ");
2704 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2705 }
2706 t = tree_cons (NULL_TREE, new_name, t);
2707 }
2708 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2709 vec = build_constructor_from_list (vectype, nreverse (t));
2710 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2711 }
2712
2713
2714 /* Create the vector that holds the step of the induction. */
2715 if (nested_in_vect_loop)
2716 /* iv_loop is nested in the loop to be vectorized. Generate:
2717 vec_step = [S, S, S, S] */
2718 new_name = step_expr;
2719 else
2720 {
2721 /* iv_loop is the loop to be vectorized. Generate:
2722 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2723 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2724 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2725 expr, step_expr);
2726 }
2727
2728 t = unshare_expr (new_name);
2729 gcc_assert (CONSTANT_CLASS_P (new_name));
2730 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2731 gcc_assert (stepvectype);
2732 vec = build_vector_from_val (stepvectype, t);
2733 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2734
2735
2736 /* Create the following def-use cycle:
2737 loop prolog:
2738 vec_init = ...
2739 vec_step = ...
2740 loop:
2741 vec_iv = PHI <vec_init, vec_loop>
2742 ...
2743 STMT
2744 ...
2745 vec_loop = vec_iv + vec_step; */
2746
2747 /* Create the induction-phi that defines the induction-operand. */
2748 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2749 add_referenced_var (vec_dest);
2750 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2751 set_vinfo_for_stmt (induction_phi,
2752 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2753 induc_def = PHI_RESULT (induction_phi);
2754
2755 /* Create the iv update inside the loop */
2756 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2757 induc_def, vec_step);
2758 vec_def = make_ssa_name (vec_dest, new_stmt);
2759 gimple_assign_set_lhs (new_stmt, vec_def);
2760 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2761 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2762 NULL));
2763
2764 /* Set the arguments of the phi node: */
2765 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2766 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2767 UNKNOWN_LOCATION);
2768
2769
2770   /* In case the vectorization factor (VF) is bigger than the number
2771      of elements that we can fit in a vectype (nunits), we have to generate
2772      more than one vector stmt - i.e., we need to "unroll" the
2773      vector stmt by a factor of VF/nunits.  For more details see the
2774      documentation in vectorizable_operation.  */
2775
2776 if (ncopies > 1)
2777 {
2778 stmt_vec_info prev_stmt_vinfo;
2779 /* FORNOW. This restriction should be relaxed. */
2780 gcc_assert (!nested_in_vect_loop);
2781
2782 /* Create the vector that holds the step of the induction. */
2783 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2784 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2785 expr, step_expr);
2786 t = unshare_expr (new_name);
2787 gcc_assert (CONSTANT_CLASS_P (new_name));
2788 vec = build_vector_from_val (stepvectype, t);
2789 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2790
2791 vec_def = induc_def;
2792 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2793 for (i = 1; i < ncopies; i++)
2794 {
2795 /* vec_i = vec_prev + vec_step */
2796 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2797 vec_def, vec_step);
2798 vec_def = make_ssa_name (vec_dest, new_stmt);
2799 gimple_assign_set_lhs (new_stmt, vec_def);
2800
2801 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2802 set_vinfo_for_stmt (new_stmt,
2803 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2804 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2805 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2806 }
2807 }
2808
2809 if (nested_in_vect_loop)
2810 {
2811 /* Find the loop-closed exit-phi of the induction, and record
2812 the final vector of induction results: */
2813 exit_phi = NULL;
2814 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2815 {
2816 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2817 {
2818 exit_phi = USE_STMT (use_p);
2819 break;
2820 }
2821 }
2822 if (exit_phi)
2823 {
2824 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2825 /* FORNOW. Currently not supporting the case that an inner-loop induction
2826 is not used in the outer-loop (i.e. only outside the outer-loop). */
2827 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2828 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2829
2830 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2831 if (vect_print_dump_info (REPORT_DETAILS))
2832 {
2833 fprintf (vect_dump, "vector of inductions after inner-loop:");
2834 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2835 }
2836 }
2837 }
2838
2839
2840 if (vect_print_dump_info (REPORT_DETAILS))
2841 {
2842 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2843 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2844 fprintf (vect_dump, "\n");
2845 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2846 }
2847
2848 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2849 return induc_def;
2850 }
2851
2852
2853 /* Function get_initial_def_for_reduction
2854
2855 Input:
2856 STMT - a stmt that performs a reduction operation in the loop.
2857 INIT_VAL - the initial value of the reduction variable
2858
2859 Output:
2860 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2861 of the reduction (used for adjusting the epilog - see below).
2862 Return a vector variable, initialized according to the operation that STMT
2863 performs. This vector will be used as the initial value of the
2864 vector of partial results.
2865
2866 Option1 (adjust in epilog): Initialize the vector as follows:
2867 add/bit or/xor: [0,0,...,0,0]
2868 mult/bit and: [1,1,...,1,1]
2869 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2870 and when necessary (e.g. add/mult case) let the caller know
2871 that it needs to adjust the result by init_val.
2872
2873 Option2: Initialize the vector as follows:
2874 add/bit or/xor: [init_val,0,0,...,0]
2875 mult/bit and: [init_val,1,1,...,1]
2876 min/max/cond_expr: [init_val,init_val,...,init_val]
2877 and no adjustments are needed.
2878
2879 For example, for the following code:
2880
2881 s = init_val;
2882 for (i=0;i<n;i++)
2883 s = s + a[i];
2884
2885 STMT is 's = s + a[i]', and the reduction variable is 's'.
2886 For a vector of 4 units, we want to return either [0,0,0,init_val],
2887 or [0,0,0,0] and let the caller know that it needs to adjust
2888 the result at the end by 'init_val'.
2889
2890    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
2891    is not NULL, because this way the initialization vector is simpler (the
2892    same element in all entries), and Option2 otherwise.
2893
2894 A cost model should help decide between these two schemes. */
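/* As a hypothetical illustration of the two options: for the sum above with
   init_val = 7 and a vector of 4 units, Option1 builds [0,0,0,0] and asks the
   caller to add 7 back after the epilog reduction, whereas Option2 builds
   [7,0,0,0] and needs no adjustment; for a product the neutral element is 1,
   giving [1,1,1,1] or [7,1,1,1] respectively.  */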
2895
2896 tree
2897 get_initial_def_for_reduction (gimple stmt, tree init_val,
2898 tree *adjustment_def)
2899 {
2900 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2901 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2902 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2903 tree scalar_type = TREE_TYPE (init_val);
2904 tree vectype = get_vectype_for_scalar_type (scalar_type);
2905 int nunits;
2906 enum tree_code code = gimple_assign_rhs_code (stmt);
2907 tree def_for_init;
2908 tree init_def;
2909 tree t = NULL_TREE;
2910 int i;
2911 bool nested_in_vect_loop = false;
2912 tree init_value;
2913 REAL_VALUE_TYPE real_init_val = dconst0;
2914 int int_init_val = 0;
2915 gimple def_stmt = NULL;
2916
2917 gcc_assert (vectype);
2918 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2919
2920 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2921 || SCALAR_FLOAT_TYPE_P (scalar_type));
2922
2923 if (nested_in_vect_loop_p (loop, stmt))
2924 nested_in_vect_loop = true;
2925 else
2926 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2927
2928 /* In case of double reduction we only create a vector variable to be put
2929 in the reduction phi node. The actual statement creation is done in
2930 vect_create_epilog_for_reduction. */
2931 if (adjustment_def && nested_in_vect_loop
2932 && TREE_CODE (init_val) == SSA_NAME
2933 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2934 && gimple_code (def_stmt) == GIMPLE_PHI
2935 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2936 && vinfo_for_stmt (def_stmt)
2937 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2938 == vect_double_reduction_def)
2939 {
2940 *adjustment_def = NULL;
2941 return vect_create_destination_var (init_val, vectype);
2942 }
2943
2944 if (TREE_CONSTANT (init_val))
2945 {
2946 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2947 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2948 else
2949 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2950 }
2951 else
2952 init_value = init_val;
2953
2954 switch (code)
2955 {
2956 case WIDEN_SUM_EXPR:
2957 case DOT_PROD_EXPR:
2958 case PLUS_EXPR:
2959 case MINUS_EXPR:
2960 case BIT_IOR_EXPR:
2961 case BIT_XOR_EXPR:
2962 case MULT_EXPR:
2963 case BIT_AND_EXPR:
2964         /* ADJUSTMENT_DEF is NULL when called from
2965            vect_create_epilog_for_reduction to vectorize a double reduction.  */
2966 if (adjustment_def)
2967 {
2968 if (nested_in_vect_loop)
2969 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2970 NULL);
2971 else
2972 *adjustment_def = init_val;
2973 }
2974
2975 if (code == MULT_EXPR)
2976 {
2977 real_init_val = dconst1;
2978 int_init_val = 1;
2979 }
2980
2981 if (code == BIT_AND_EXPR)
2982 int_init_val = -1;
2983
2984 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2985 def_for_init = build_real (scalar_type, real_init_val);
2986 else
2987 def_for_init = build_int_cst (scalar_type, int_init_val);
2988
2989         /* Fill all elements of the vector except the first with '0' or '1'.  */
2990 for (i = nunits - 2; i >= 0; --i)
2991 t = tree_cons (NULL_TREE, def_for_init, t);
2992
2993 /* Option1: the first element is '0' or '1' as well. */
2994 if (adjustment_def)
2995 {
2996 t = tree_cons (NULL_TREE, def_for_init, t);
2997 init_def = build_vector (vectype, t);
2998 break;
2999 }
3000
3001 /* Option2: the first element is INIT_VAL. */
3002 t = tree_cons (NULL_TREE, init_value, t);
3003 if (TREE_CONSTANT (init_val))
3004 init_def = build_vector (vectype, t);
3005 else
3006 init_def = build_constructor_from_list (vectype, t);
3007
3008 break;
3009
3010 case MIN_EXPR:
3011 case MAX_EXPR:
3012 case COND_EXPR:
3013 if (adjustment_def)
3014 {
3015 *adjustment_def = NULL_TREE;
3016 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3017 break;
3018 }
3019
3020 init_def = build_vector_from_val (vectype, init_value);
3021 break;
3022
3023 default:
3024 gcc_unreachable ();
3025 }
3026
3027 return init_def;
3028 }
3029
3030
3031 /* Function vect_create_epilog_for_reduction
3032
3033 Create code at the loop-epilog to finalize the result of a reduction
3034 computation.
3035
3036    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of the
3037    vector reduction statements.
3038 STMT is the scalar reduction stmt that is being vectorized.
3039 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3040 number of elements that we can fit in a vectype (nunits). In this case
3041 we have to generate more than one vector stmt - i.e - we need to "unroll"
3042 the vector stmt by a factor VF/nunits. For more details see documentation
3043 in vectorizable_operation.
3044 REDUC_CODE is the tree-code for the epilog reduction.
3045 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3046 computation.
3047 REDUC_INDEX is the index of the operand in the right hand side of the
3048 statement that is defined by REDUCTION_PHI.
3049 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3050 SLP_NODE is an SLP node containing a group of reduction statements. The
3051 first one in this group is STMT.
3052
3053 This function:
3054 1. Creates the reduction def-use cycles: sets the arguments for
3055 REDUCTION_PHIS:
3056 The loop-entry argument is the vectorized initial-value of the reduction.
3057 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3058 sums.
3059 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3060 by applying the operation specified by REDUC_CODE if available, or by
3061 other means (whole-vector shifts or a scalar loop).
3062 The function also creates a new phi node at the loop exit to preserve
3063 loop-closed form, as illustrated below.
3064
3065 The flow at the entry to this function:
3066
3067 loop:
3068 vec_def = phi <null, null> # REDUCTION_PHI
3069 VECT_DEF = vector_stmt # vectorized form of STMT
3070 s_loop = scalar_stmt # (scalar) STMT
3071 loop_exit:
3072 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3073 use <s_out0>
3074 use <s_out0>
3075
3076 The above is transformed by this function into:
3077
3078 loop:
3079 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3080 VECT_DEF = vector_stmt # vectorized form of STMT
3081 s_loop = scalar_stmt # (scalar) STMT
3082 loop_exit:
3083 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3084 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3085 v_out2 = reduce <v_out1>
3086 s_out3 = extract_field <v_out2, 0>
3087 s_out4 = adjust_result <s_out3>
3088 use <s_out4>
3089 use <s_out4>
3090 */
3091
3092 static void
3093 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3094 int ncopies, enum tree_code reduc_code,
3095 VEC (gimple, heap) *reduction_phis,
3096 int reduc_index, bool double_reduc,
3097 slp_tree slp_node)
3098 {
3099 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3100 stmt_vec_info prev_phi_info;
3101 tree vectype;
3102 enum machine_mode mode;
3103 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3104 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3105 basic_block exit_bb;
3106 tree scalar_dest;
3107 tree scalar_type;
3108 gimple new_phi = NULL, phi;
3109 gimple_stmt_iterator exit_gsi;
3110 tree vec_dest;
3111 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3112 gimple epilog_stmt = NULL;
3113 enum tree_code code = gimple_assign_rhs_code (stmt);
3114 gimple exit_phi;
3115 tree bitsize, bitpos;
3116 tree adjustment_def = NULL;
3117 tree vec_initial_def = NULL;
3118 tree reduction_op, expr, def;
3119 tree orig_name, scalar_result;
3120 imm_use_iterator imm_iter, phi_imm_iter;
3121 use_operand_p use_p, phi_use_p;
3122 bool extract_scalar_result = false;
3123 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3124 bool nested_in_vect_loop = false;
3125 VEC (gimple, heap) *new_phis = NULL;
3126 enum vect_def_type dt = vect_unknown_def_type;
3127 int j, i;
3128 VEC (tree, heap) *scalar_results = NULL;
3129 unsigned int group_size = 1, k, ratio;
3130 VEC (tree, heap) *vec_initial_defs = NULL;
3131 VEC (gimple, heap) *phis;
3132
3133 if (slp_node)
3134 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3135
3136 if (nested_in_vect_loop_p (loop, stmt))
3137 {
3138 outer_loop = loop;
3139 loop = loop->inner;
3140 nested_in_vect_loop = true;
3141 gcc_assert (!slp_node);
3142 }
3143
3144 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3145 {
3146 case GIMPLE_SINGLE_RHS:
3147 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3148 == ternary_op);
3149 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3150 break;
3151 case GIMPLE_UNARY_RHS:
3152 reduction_op = gimple_assign_rhs1 (stmt);
3153 break;
3154 case GIMPLE_BINARY_RHS:
3155 reduction_op = reduc_index ?
3156 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3157 break;
3158 default:
3159 gcc_unreachable ();
3160 }
3161
3162 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3163 gcc_assert (vectype);
3164 mode = TYPE_MODE (vectype);
3165
3166 /* 1. Create the reduction def-use cycle:
3167 Set the arguments of REDUCTION_PHIS, i.e., transform
3168
3169 loop:
3170 vec_def = phi <null, null> # REDUCTION_PHI
3171 VECT_DEF = vector_stmt # vectorized form of STMT
3172 ...
3173
3174 into:
3175
3176 loop:
3177 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3178 VECT_DEF = vector_stmt # vectorized form of STMT
3179 ...
3180
3181 (in case of SLP, do it for all the phis). */
3182
3183 /* Get the loop-entry arguments. */
3184 if (slp_node)
3185 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3186 NULL, reduc_index);
3187 else
3188 {
3189 vec_initial_defs = VEC_alloc (tree, heap, 1);
3190 /* For the case of reduction, vect_get_vec_def_for_operand returns
3191 the scalar def before the loop, that defines the initial value
3192 of the reduction variable. */
3193 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3194 &adjustment_def);
3195 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3196 }
3197
3198 /* Set phi nodes arguments. */
3199 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3200 {
3201 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3202 tree def = VEC_index (tree, vect_defs, i);
3203 for (j = 0; j < ncopies; j++)
3204 {
3205 /* Set the loop-entry arg of the reduction-phi. */
3206 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3207 UNKNOWN_LOCATION);
3208
3209 /* Set the loop-latch arg for the reduction-phi. */
3210 if (j > 0)
3211 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3212
3213 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3214
3215 if (vect_print_dump_info (REPORT_DETAILS))
3216 {
3217 fprintf (vect_dump, "transform reduction: created def-use"
3218 " cycle: ");
3219 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3220 fprintf (vect_dump, "\n");
3221 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3222 TDF_SLIM);
3223 }
3224
3225 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3226 }
3227 }
3228
3229 VEC_free (tree, heap, vec_initial_defs);
3230
3231 /* 2. Create epilog code.
3232 The reduction epilog code operates across the elements of the vector
3233 of partial results computed by the vectorized loop.
3234 The reduction epilog code consists of:
3235
3236 step 1: compute the scalar result in a vector (v_out2)
3237 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3238 step 3: adjust the scalar result (s_out3) if needed.
3239
3240      Step 1 can be accomplished using one of the following three schemes:
3241 (scheme 1) using reduc_code, if available.
3242 (scheme 2) using whole-vector shifts, if available.
3243 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3244 combined.
3245
3246 The overall epilog code looks like this:
3247
3248 s_out0 = phi <s_loop> # original EXIT_PHI
3249 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3250 v_out2 = reduce <v_out1> # step 1
3251 s_out3 = extract_field <v_out2, 0> # step 2
3252 s_out4 = adjust_result <s_out3> # step 3
3253
3254 (step 3 is optional, and steps 1 and 2 may be combined).
3255 Lastly, the uses of s_out0 are replaced by s_out4. */
3256
3257
3258 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3259 v_out1 = phi <VECT_DEF>
3260 Store them in NEW_PHIS. */
3261
3262 exit_bb = single_exit (loop)->dest;
3263 prev_phi_info = NULL;
3264 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3265 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3266 {
3267 for (j = 0; j < ncopies; j++)
3268 {
3269 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3270 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3271 if (j == 0)
3272 VEC_quick_push (gimple, new_phis, phi);
3273 else
3274 {
3275 def = vect_get_vec_def_for_stmt_copy (dt, def);
3276 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3277 }
3278
3279 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3280 prev_phi_info = vinfo_for_stmt (phi);
3281 }
3282 }
3283
3284 /* The epilogue is created for the outer-loop, i.e., for the loop being
3285 vectorized. */
3286 if (double_reduc)
3287 {
3288 loop = outer_loop;
3289 exit_bb = single_exit (loop)->dest;
3290 }
3291
3292 exit_gsi = gsi_after_labels (exit_bb);
3293
3294 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3295 (i.e. when reduc_code is not available) and in the final adjustment
3296 code (if needed). Also get the original scalar reduction variable as
3297 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3298 represents a reduction pattern), the tree-code and scalar-def are
3299 taken from the original stmt that the pattern-stmt (STMT) replaces.
3300 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3301 are taken from STMT. */
3302
3303 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3304 if (!orig_stmt)
3305 {
3306 /* Regular reduction */
3307 orig_stmt = stmt;
3308 }
3309 else
3310 {
3311 /* Reduction pattern */
3312 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3313 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3314 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3315 }
3316
3317 code = gimple_assign_rhs_code (orig_stmt);
3318 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3319 partial results are added and not subtracted. */
3320 if (code == MINUS_EXPR)
3321 code = PLUS_EXPR;
3322
3323 scalar_dest = gimple_assign_lhs (orig_stmt);
3324 scalar_type = TREE_TYPE (scalar_dest);
3325 scalar_results = VEC_alloc (tree, heap, group_size);
3326 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3327 bitsize = TYPE_SIZE (scalar_type);
3328
3329   /* In case this is a reduction in an inner-loop while vectorizing an outer
3330      loop, we don't need to extract a single scalar result at the end of the
3331      inner-loop (unless it is a double reduction, i.e., the use of the reduction
3332      is outside the outer-loop).  The final vector of partial results will be used
3333 in the vectorized outer-loop, or reduced to a scalar result at the end of
3334 the outer-loop. */
3335 if (nested_in_vect_loop && !double_reduc)
3336 goto vect_finalize_reduction;
3337
3338 /* 2.3 Create the reduction code, using one of the three schemes described
3339 above. In SLP we simply need to extract all the elements from the
3340 vector (without reducing them), so we use scalar shifts. */
3341 if (reduc_code != ERROR_MARK && !slp_node)
3342 {
3343 tree tmp;
3344
3345 /*** Case 1: Create:
3346 v_out2 = reduc_expr <v_out1> */
3347
3348 if (vect_print_dump_info (REPORT_DETAILS))
3349 fprintf (vect_dump, "Reduce using direct vector reduction.");
3350
3351 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3352 new_phi = VEC_index (gimple, new_phis, 0);
3353 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3354 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3355 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3356 gimple_assign_set_lhs (epilog_stmt, new_temp);
3357 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3358
3359 extract_scalar_result = true;
3360 }
3361 else
3362 {
3363 enum tree_code shift_code = ERROR_MARK;
3364 bool have_whole_vector_shift = true;
3365 int bit_offset;
3366 int element_bitsize = tree_low_cst (bitsize, 1);
3367 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3368 tree vec_temp;
3369
3370 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3371 shift_code = VEC_RSHIFT_EXPR;
3372 else
3373 have_whole_vector_shift = false;
3374
3375 /* Regardless of whether we have a whole vector shift, if we're
3376 emulating the operation via tree-vect-generic, we don't want
3377 to use it. Only the first round of the reduction is likely
3378 to still be profitable via emulation. */
3379 /* ??? It might be better to emit a reduction tree code here, so that
3380 tree-vect-generic can expand the first round via bit tricks. */
3381 if (!VECTOR_MODE_P (mode))
3382 have_whole_vector_shift = false;
3383 else
3384 {
3385 optab optab = optab_for_tree_code (code, vectype, optab_default);
3386 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3387 have_whole_vector_shift = false;
3388 }
3389
3390 if (have_whole_vector_shift && !slp_node)
3391 {
3392 /*** Case 2: Create:
3393 for (offset = VS/2; offset >= element_size; offset/=2)
3394 {
3395 Create: va' = vec_shift <va, offset>
3396 Create: va = vop <va, va'>
3397 } */
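          /* For a hypothetical 4-element vector {a, b, c, d} being summed,
             the first shift/add round leaves two live partial sums (a+c and
             b+d) and the second round combines them into a single lane
             holding a+b+c+d, which the scalar-extraction code below then
             pulls out as the result.  */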
3398
3399 if (vect_print_dump_info (REPORT_DETAILS))
3400 fprintf (vect_dump, "Reduce using vector shifts");
3401
3402 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3403 new_phi = VEC_index (gimple, new_phis, 0);
3404 new_temp = PHI_RESULT (new_phi);
3405 for (bit_offset = vec_size_in_bits/2;
3406 bit_offset >= element_bitsize;
3407 bit_offset /= 2)
3408 {
3409 tree bitpos = size_int (bit_offset);
3410
3411 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3412 vec_dest, new_temp, bitpos);
3413 new_name = make_ssa_name (vec_dest, epilog_stmt);
3414 gimple_assign_set_lhs (epilog_stmt, new_name);
3415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3416
3417 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3418 new_name, new_temp);
3419 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3420 gimple_assign_set_lhs (epilog_stmt, new_temp);
3421 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3422 }
3423
3424 extract_scalar_result = true;
3425 }
3426 else
3427 {
3428 tree rhs;
3429
3430 /*** Case 3: Create:
3431 s = extract_field <v_out2, 0>
3432 for (offset = element_size;
3433 offset < vector_size;
3434 offset += element_size;)
3435 {
3436 Create: s' = extract_field <v_out2, offset>
3437 Create: s = op <s, s'> // For non SLP cases
3438 } */
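/* For instance (a sketch, assuming a hypothetical vector of four 32-bit
   elements), the code below extracts the elements at bit offsets 0, 32, 64
   and 96 and, unless this is an SLP reduction, combines them with CODE.  */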
3439
3440 if (vect_print_dump_info (REPORT_DETAILS))
3441 fprintf (vect_dump, "Reduce using scalar code. ");
3442
3443 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3444 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3445 {
3446 vec_temp = PHI_RESULT (new_phi);
3447 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3448 bitsize_zero_node);
3449 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3450 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3451 gimple_assign_set_lhs (epilog_stmt, new_temp);
3452 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3453
3454 /* In SLP we don't need to apply the reduction operation, so we just
3455 collect the s' values in SCALAR_RESULTS. */
3456 if (slp_node)
3457 VEC_safe_push (tree, heap, scalar_results, new_temp);
3458
3459 for (bit_offset = element_bitsize;
3460 bit_offset < vec_size_in_bits;
3461 bit_offset += element_bitsize)
3462 {
3463 tree bitpos = bitsize_int (bit_offset);
3464 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3465 bitsize, bitpos);
3466
3467 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3468 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3469 gimple_assign_set_lhs (epilog_stmt, new_name);
3470 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3471
3472 if (slp_node)
3473 {
3474 /* In SLP we don't need to apply the reduction operation, so
3475 we just collect the s' values in SCALAR_RESULTS. */
3476 new_temp = new_name;
3477 VEC_safe_push (tree, heap, scalar_results, new_name);
3478 }
3479 else
3480 {
3481 epilog_stmt = gimple_build_assign_with_ops (code,
3482 new_scalar_dest, new_name, new_temp);
3483 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3484 gimple_assign_set_lhs (epilog_stmt, new_temp);
3485 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3486 }
3487 }
3488 }
3489
3490 /* The only case in which we need to reduce scalar results in SLP is
3491 unrolling. If the size of SCALAR_RESULTS is greater than
3492 GROUP_SIZE, we reduce them by combining elements modulo
3493 GROUP_SIZE. */
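/* For example (an illustrative sketch): with GROUP_SIZE == 2 and four
   collected scalar results s0..s3 (i.e. the SLP node was unrolled twice),
   the loop below folds s2 into s0 and s3 into s1, leaving GROUP_SIZE
   reduced values in SCALAR_RESULTS.  */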
3494 if (slp_node)
3495 {
3496 tree res, first_res, new_res;
3497 gimple new_stmt;
3498
3499 /* Reduce multiple scalar results in case of SLP unrolling. */
3500 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3501 j++)
3502 {
3503 first_res = VEC_index (tree, scalar_results, j % group_size);
3504 new_stmt = gimple_build_assign_with_ops (code,
3505 new_scalar_dest, first_res, res);
3506 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3507 gimple_assign_set_lhs (new_stmt, new_res);
3508 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3509 VEC_replace (tree, scalar_results, j % group_size, new_res);
3510 }
3511 }
3512 else
3513 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3514 VEC_safe_push (tree, heap, scalar_results, new_temp);
3515
3516 extract_scalar_result = false;
3517 }
3518 }
3519
3520 /* 2.4 Extract the final scalar result. Create:
3521 s_out3 = extract_field <v_out2, bitpos> */
3522
3523 if (extract_scalar_result)
3524 {
3525 tree rhs;
3526
3527 if (vect_print_dump_info (REPORT_DETAILS))
3528 fprintf (vect_dump, "extract scalar result");
3529
3530 if (BYTES_BIG_ENDIAN)
3531 bitpos = size_binop (MULT_EXPR,
3532 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3533 TYPE_SIZE (scalar_type));
3534 else
3535 bitpos = bitsize_zero_node;
3536
3537 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3538 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3539 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3540 gimple_assign_set_lhs (epilog_stmt, new_temp);
3541 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3542 VEC_safe_push (tree, heap, scalar_results, new_temp);
3543 }
3544
3545 vect_finalize_reduction:
3546
3547 if (double_reduc)
3548 loop = loop->inner;
3549
3550 /* 2.5 Adjust the final result by the initial value of the reduction
3551 variable. (When no such adjustment is needed, 'adjustment_def' is
3552 zero.) For example, if code is PLUS we create:
3553 new_temp = loop_exit_def + adjustment_def */
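/* As an illustrative sketch: for a sum reduction that starts from a nonzero
   initial value (e.g. "sum = 5; for (...) sum += a[i];"), the vector loop may
   be entered with a neutral initial vector, in which case ADJUSTMENT_DEF
   holds the original initial value (5) and it is added back to the reduced
   result here.  */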
3554
3555 if (adjustment_def)
3556 {
3557 gcc_assert (!slp_node);
3558 if (nested_in_vect_loop)
3559 {
3560 new_phi = VEC_index (gimple, new_phis, 0);
3561 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3562 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3563 new_dest = vect_create_destination_var (scalar_dest, vectype);
3564 }
3565 else
3566 {
3567 new_temp = VEC_index (tree, scalar_results, 0);
3568 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3569 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3570 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3571 }
3572
3573 epilog_stmt = gimple_build_assign (new_dest, expr);
3574 new_temp = make_ssa_name (new_dest, epilog_stmt);
3575 gimple_assign_set_lhs (epilog_stmt, new_temp);
3576 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3577 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3578 if (nested_in_vect_loop)
3579 {
3580 set_vinfo_for_stmt (epilog_stmt,
3581 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3582 NULL));
3583 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3584 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3585
3586 if (!double_reduc)
3587 VEC_quick_push (tree, scalar_results, new_temp);
3588 else
3589 VEC_replace (tree, scalar_results, 0, new_temp);
3590 }
3591 else
3592 VEC_replace (tree, scalar_results, 0, new_temp);
3593
3594 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3595 }
3596
3597 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3598 phis with new adjusted scalar results, i.e., replace use <s_out0>
3599 with use <s_out4>.
3600
3601 Transform:
3602 loop_exit:
3603 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3604 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3605 v_out2 = reduce <v_out1>
3606 s_out3 = extract_field <v_out2, 0>
3607 s_out4 = adjust_result <s_out3>
3608 use <s_out0>
3609 use <s_out0>
3610
3611 into:
3612
3613 loop_exit:
3614 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3615 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3616 v_out2 = reduce <v_out1>
3617 s_out3 = extract_field <v_out2, 0>
3618 s_out4 = adjust_result <s_out3>
3619 use <s_out4>
3620 use <s_out4> */
3621
3622 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3623 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
3624 need to match SCALAR_RESULTS with the corresponding statements. The first
3625 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3626 the first vector stmt, etc.
3627 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
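/* For example (a sketch): with GROUP_SIZE == 4 and two vector stmts in
   NEW_PHIS, RATIO == 2, so scalar results 0-1 correspond to the first
   vector stmt and scalar results 2-3 to the second.  */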
3628 if (group_size > VEC_length (gimple, new_phis))
3629 {
3630 ratio = group_size / VEC_length (gimple, new_phis);
3631 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3632 }
3633 else
3634 ratio = 1;
3635
3636 for (k = 0; k < group_size; k++)
3637 {
3638 if (k % ratio == 0)
3639 {
3640 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3641 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3642 }
3643
3644 if (slp_node)
3645 {
3646 gimple current_stmt = VEC_index (gimple,
3647 SLP_TREE_SCALAR_STMTS (slp_node), k);
3648
3649 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3650 /* SLP statements can't participate in patterns. */
3651 gcc_assert (!orig_stmt);
3652 scalar_dest = gimple_assign_lhs (current_stmt);
3653 }
3654
3655 phis = VEC_alloc (gimple, heap, 3);
3656 /* Find the loop-closed-use at the loop exit of the original scalar
3657 result. (The reduction result is expected to have two immediate uses -
3658 one at the latch block, and one at the loop exit). */
3659 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3660 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3661 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3662
3663 /* We expect to have found an exit_phi because of loop-closed-ssa
3664 form. */
3665 gcc_assert (!VEC_empty (gimple, phis));
3666
3667 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3668 {
3669 if (outer_loop)
3670 {
3671 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3672 gimple vect_phi;
3673
3674 /* FORNOW. Currently not supporting the case in which an inner-loop
3675 reduction is not used in the outer-loop (but only outside the
3676 outer-loop), unless it is a double reduction. */
3677 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3678 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3679 || double_reduc);
3680
3681 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3682 if (!double_reduc
3683 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3684 != vect_double_reduction_def)
3685 continue;
3686
3687 /* Handle double reduction:
3688
3689 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3690 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3691 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3692 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3693
3694 At that point the regular reduction (stmt2 and stmt3) is
3695 already vectorized, as well as the exit phi node, stmt4.
3696 Here we vectorize the phi node of double reduction, stmt1, and
3697 update all relevant statements. */
3698
3699 /* Go through all the uses of s2 to find double reduction phi
3700 node, i.e., stmt1 above. */
3701 orig_name = PHI_RESULT (exit_phi);
3702 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3703 {
3704 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3705 stmt_vec_info new_phi_vinfo;
3706 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3707 basic_block bb = gimple_bb (use_stmt);
3708 gimple use;
3709
3710 /* Check that USE_STMT is really a double reduction phi
3711 node. */
3712 if (gimple_code (use_stmt) != GIMPLE_PHI
3713 || gimple_phi_num_args (use_stmt) != 2
3714 || !use_stmt_vinfo
3715 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3716 != vect_double_reduction_def
3717 || bb->loop_father != outer_loop)
3718 continue;
3719
3720 /* Create vector phi node for double reduction:
3721 vs1 = phi <vs0, vs2>
3722 vs1 was created previously in this function by a call to
3723 vect_get_vec_def_for_operand and is stored in
3724 vec_initial_def;
3725 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3726 vs0 is created here. */
3727
3728 /* Create vector phi node. */
3729 vect_phi = create_phi_node (vec_initial_def, bb);
3730 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3731 loop_vec_info_for_loop (outer_loop), NULL);
3732 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3733
3734 /* Create vs0 - initial def of the double reduction phi. */
3735 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3736 loop_preheader_edge (outer_loop));
3737 init_def = get_initial_def_for_reduction (stmt,
3738 preheader_arg, NULL);
3739 vect_phi_init = vect_init_vector (use_stmt, init_def,
3740 vectype, NULL);
3741
3742 /* Update phi node arguments with vs0 and vs2. */
3743 add_phi_arg (vect_phi, vect_phi_init,
3744 loop_preheader_edge (outer_loop),
3745 UNKNOWN_LOCATION);
3746 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3747 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3748 if (vect_print_dump_info (REPORT_DETAILS))
3749 {
3750 fprintf (vect_dump, "created double reduction phi "
3751 "node: ");
3752 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3753 }
3754
3755 vect_phi_res = PHI_RESULT (vect_phi);
3756
3757 /* Replace the use, i.e., set the correct vs1 in the regular
3758 reduction phi node. FORNOW, NCOPIES is always 1, so the
3759 loop is redundant. */
3760 use = reduction_phi;
3761 for (j = 0; j < ncopies; j++)
3762 {
3763 edge pr_edge = loop_preheader_edge (loop);
3764 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3765 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3766 }
3767 }
3768 }
3769 }
3770
3771 VEC_free (gimple, heap, phis);
3772 if (nested_in_vect_loop)
3773 {
3774 if (double_reduc)
3775 loop = outer_loop;
3776 else
3777 continue;
3778 }
3779
3780 phis = VEC_alloc (gimple, heap, 3);
3781 /* Find the loop-closed-use at the loop exit of the original scalar
3782 result. (The reduction result is expected to have two immediate uses,
3783 one at the latch block, and one at the loop exit). For double
3784 reductions we are looking for exit phis of the outer loop. */
3785 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3786 {
3787 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3788 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3789 else
3790 {
3791 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3792 {
3793 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3794
3795 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3796 {
3797 if (!flow_bb_inside_loop_p (loop,
3798 gimple_bb (USE_STMT (phi_use_p))))
3799 VEC_safe_push (gimple, heap, phis,
3800 USE_STMT (phi_use_p));
3801 }
3802 }
3803 }
3804 }
3805
3806 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3807 {
3808 /* Replace the uses: */
3809 orig_name = PHI_RESULT (exit_phi);
3810 scalar_result = VEC_index (tree, scalar_results, k);
3811 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3812 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3813 SET_USE (use_p, scalar_result);
3814 }
3815
3816 VEC_free (gimple, heap, phis);
3817 }
3818
3819 VEC_free (tree, heap, scalar_results);
3820 VEC_free (gimple, heap, new_phis);
3821 }
3822
3823
3824 /* Function vectorizable_reduction.
3825
3826 Check if STMT performs a reduction operation that can be vectorized.
3827 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3828 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3829 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3830
3831 This function also handles reduction idioms (patterns) that have been
3832 recognized in advance during vect_pattern_recog. In this case, STMT may be
3833 of this form:
3834 X = pattern_expr (arg0, arg1, ..., X)
3835 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3836 sequence that had been detected and replaced by the pattern-stmt (STMT).
3837
3838 In some cases of reduction patterns, the type of the reduction variable X is
3839 different than the type of the other arguments of STMT.
3840 In such cases, the vectype that is used when transforming STMT into a vector
3841 stmt is different than the vectype that is used to determine the
3842 vectorization factor, because it consists of a different number of elements
3843 than the actual number of elements that are being operated upon in parallel.
3844
3845 For example, consider an accumulation of shorts into an int accumulator.
3846 On some targets it's possible to vectorize this pattern operating on 8
3847 shorts at a time (hence, the vectype for purposes of determining the
3848 vectorization factor should be V8HI); on the other hand, the vectype that
3849 is used to create the vector form is actually V4SI (the type of the result).
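
   For instance, a scalar source loop of the following form (an illustrative
   sketch; the names are made up):

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   may be recognized as such a widening-sum pattern: the loop body operates
   on V8HI operands, while the accumulator and the epilog reduction use V4SI.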
3850
3851 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3852 indicates what is the actual level of parallelism (V8HI in the example), so
3853 that the right vectorization factor would be derived. This vectype
3854 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3855 be used to create the vectorized stmt. The right vectype for the vectorized
3856 stmt is obtained from the type of the result X:
3857 get_vectype_for_scalar_type (TREE_TYPE (X))
3858
3859 This means that, contrary to "regular" reductions (or "regular" stmts in
3860 general), the following equation:
3861 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3862 does *NOT* necessarily hold for reduction patterns. */
3863
3864 bool
3865 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3866 gimple *vec_stmt, slp_tree slp_node)
3867 {
3868 tree vec_dest;
3869 tree scalar_dest;
3870 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3871 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3872 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3873 tree vectype_in = NULL_TREE;
3874 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3875 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3876 enum tree_code code, orig_code, epilog_reduc_code;
3877 enum machine_mode vec_mode;
3878 int op_type;
3879 optab optab, reduc_optab;
3880 tree new_temp = NULL_TREE;
3881 tree def;
3882 gimple def_stmt;
3883 enum vect_def_type dt;
3884 gimple new_phi = NULL;
3885 tree scalar_type;
3886 bool is_simple_use;
3887 gimple orig_stmt;
3888 stmt_vec_info orig_stmt_info;
3889 tree expr = NULL_TREE;
3890 int i;
3891 int ncopies;
3892 int epilog_copies;
3893 stmt_vec_info prev_stmt_info, prev_phi_info;
3894 bool single_defuse_cycle = false;
3895 tree reduc_def = NULL_TREE;
3896 gimple new_stmt = NULL;
3897 int j;
3898 tree ops[3];
3899 bool nested_cycle = false, found_nested_cycle_def = false;
3900 gimple reduc_def_stmt = NULL;
3901 /* The default is that the reduction variable is the last operand in the statement. */
3902 int reduc_index = 2;
3903 bool double_reduc = false, dummy;
3904 basic_block def_bb;
3905 struct loop * def_stmt_loop, *outer_loop = NULL;
3906 tree def_arg;
3907 gimple def_arg_stmt;
3908 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3909 VEC (gimple, heap) *phis = NULL;
3910 int vec_num;
3911 tree def0, def1, tem;
3912
3913 if (nested_in_vect_loop_p (loop, stmt))
3914 {
3915 outer_loop = loop;
3916 loop = loop->inner;
3917 nested_cycle = true;
3918 }
3919
3920 /* 1. Is vectorizable reduction? */
3921 /* Not supportable if the reduction variable is used in the loop. */
3922 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3923 return false;
3924
3925 /* Reductions that are not used even in an enclosing outer-loop
3926 are expected to be "live" (used out of the loop). */
3927 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3928 && !STMT_VINFO_LIVE_P (stmt_info))
3929 return false;
3930
3931 /* Make sure it was already recognized as a reduction computation. */
3932 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3933 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3934 return false;
3935
3936 /* 2. Has this been recognized as a reduction pattern?
3937
3938 Check if STMT represents a pattern that has been recognized
3939 in earlier analysis stages. For stmts that represent a pattern,
3940 the STMT_VINFO_RELATED_STMT field records the last stmt in
3941 the original sequence that constitutes the pattern. */
3942
3943 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3944 if (orig_stmt)
3945 {
3946 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3947 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3948 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3949 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3950 }
3951
3952 /* 3. Check the operands of the operation. The first operands are defined
3953 inside the loop body. The last operand is the reduction variable,
3954 which is defined by the loop-header-phi. */
3955
3956 gcc_assert (is_gimple_assign (stmt));
3957
3958 /* Flatten RHS. */
3959 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3960 {
3961 case GIMPLE_SINGLE_RHS:
3962 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3963 if (op_type == ternary_op)
3964 {
3965 tree rhs = gimple_assign_rhs1 (stmt);
3966 ops[0] = TREE_OPERAND (rhs, 0);
3967 ops[1] = TREE_OPERAND (rhs, 1);
3968 ops[2] = TREE_OPERAND (rhs, 2);
3969 code = TREE_CODE (rhs);
3970 }
3971 else
3972 return false;
3973 break;
3974
3975 case GIMPLE_BINARY_RHS:
3976 code = gimple_assign_rhs_code (stmt);
3977 op_type = TREE_CODE_LENGTH (code);
3978 gcc_assert (op_type == binary_op);
3979 ops[0] = gimple_assign_rhs1 (stmt);
3980 ops[1] = gimple_assign_rhs2 (stmt);
3981 break;
3982
3983 case GIMPLE_UNARY_RHS:
3984 return false;
3985
3986 default:
3987 gcc_unreachable ();
3988 }
3989
3990 scalar_dest = gimple_assign_lhs (stmt);
3991 scalar_type = TREE_TYPE (scalar_dest);
3992 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3993 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3994 return false;
3995
3996 /* All uses but the last are expected to be defined in the loop.
3997 The last use is the reduction variable. In case of a nested cycle this
3998 assumption does not hold: we use reduc_index to record the index of the
3999 reduction variable. */
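/* For example (an illustrative sketch): for "s = x + s", ops[0] = x is
   expected to be defined inside the loop, while ops[1] = s is the reduction
   variable, defined by the loop-header phi and checked after the loop
   below.  */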
4000 for (i = 0; i < op_type-1; i++)
4001 {
4002 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4003 if (i == 0 && code == COND_EXPR)
4004 continue;
4005
4006 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4007 &def_stmt, &def, &dt, &tem);
4008 if (!vectype_in)
4009 vectype_in = tem;
4010 gcc_assert (is_simple_use);
4011 if (dt != vect_internal_def
4012 && dt != vect_external_def
4013 && dt != vect_constant_def
4014 && dt != vect_induction_def
4015 && !(dt == vect_nested_cycle && nested_cycle))
4016 return false;
4017
4018 if (dt == vect_nested_cycle)
4019 {
4020 found_nested_cycle_def = true;
4021 reduc_def_stmt = def_stmt;
4022 reduc_index = i;
4023 }
4024 }
4025
4026 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4027 &def, &dt, &tem);
4028 if (!vectype_in)
4029 vectype_in = tem;
4030 gcc_assert (is_simple_use);
4031 gcc_assert (dt == vect_reduction_def
4032 || dt == vect_nested_cycle
4033 || ((dt == vect_internal_def || dt == vect_external_def
4034 || dt == vect_constant_def || dt == vect_induction_def)
4035 && nested_cycle && found_nested_cycle_def));
4036 if (!found_nested_cycle_def)
4037 reduc_def_stmt = def_stmt;
4038
4039 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4040 if (orig_stmt)
4041 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4042 reduc_def_stmt,
4043 !nested_cycle,
4044 &dummy));
4045 else
4046 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4047 !nested_cycle, &dummy));
4048
4049 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4050 return false;
4051
4052 if (slp_node)
4053 ncopies = 1;
4054 else
4055 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4056 / TYPE_VECTOR_SUBPARTS (vectype_in));
4057
4058 gcc_assert (ncopies >= 1);
4059
4060 vec_mode = TYPE_MODE (vectype_in);
4061
4062 if (code == COND_EXPR)
4063 {
4064 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4065 {
4066 if (vect_print_dump_info (REPORT_DETAILS))
4067 fprintf (vect_dump, "unsupported condition in reduction");
4068
4069 return false;
4070 }
4071 }
4072 else
4073 {
4074 /* 4. Supportable by target? */
4075
4076 /* 4.1. check support for the operation in the loop */
4077 optab = optab_for_tree_code (code, vectype_in, optab_default);
4078 if (!optab)
4079 {
4080 if (vect_print_dump_info (REPORT_DETAILS))
4081 fprintf (vect_dump, "no optab.");
4082
4083 return false;
4084 }
4085
4086 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4087 {
4088 if (vect_print_dump_info (REPORT_DETAILS))
4089 fprintf (vect_dump, "op not supported by target.");
4090
4091 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4092 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4093 < vect_min_worthwhile_factor (code))
4094 return false;
4095
4096 if (vect_print_dump_info (REPORT_DETAILS))
4097 fprintf (vect_dump, "proceeding using word mode.");
4098 }
4099
4100 /* Worthwhile without SIMD support? */
4101 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4102 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4103 < vect_min_worthwhile_factor (code))
4104 {
4105 if (vect_print_dump_info (REPORT_DETAILS))
4106 fprintf (vect_dump, "not worthwhile without SIMD support.");
4107
4108 return false;
4109 }
4110 }
4111
4112 /* 4.2. Check support for the epilog operation.
4113
4114 If STMT represents a reduction pattern, then the type of the
4115 reduction variable may be different than the type of the rest
4116 of the arguments. For example, consider the case of accumulation
4117 of shorts into an int accumulator. The original code:
4118 S1: int_a = (int) short_a;
4119 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4120
4121 was replaced with:
4122 STMT: int_acc = widen_sum <short_a, int_acc>
4123
4124 This means that:
4125 1. The tree-code that is used to create the vector operation in the
4126 epilog code (that reduces the partial results) is not the
4127 tree-code of STMT, but is rather the tree-code of the original
4128 stmt from the pattern that STMT is replacing. I.e, in the example
4129 above we want to use 'widen_sum' in the loop, but 'plus' in the
4130 epilog.
4131 2. The type (mode) we use to check available target support
4132 for the vector operation to be created in the *epilog*, is
4133 determined by the type of the reduction variable (in the example
4134 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4135 However the type (mode) we use to check available target support
4136 for the vector operation to be created *inside the loop*, is
4137 determined by the type of the other arguments to STMT (in the
4138 example we'd check this: optab_handler (widen_sum_optab,
4139 vect_short_mode)).
4140
4141 This is contrary to "regular" reductions, in which the types of all
4142 the arguments are the same as the type of the reduction variable.
4143 For "regular" reductions we can therefore use the same vector type
4144 (and also the same tree-code) when generating the epilog code and
4145 when generating the code inside the loop. */
4146
4147 if (orig_stmt)
4148 {
4149 /* This is a reduction pattern: get the vectype from the type of the
4150 reduction variable, and get the tree-code from orig_stmt. */
4151 orig_code = gimple_assign_rhs_code (orig_stmt);
4152 gcc_assert (vectype_out);
4153 vec_mode = TYPE_MODE (vectype_out);
4154 }
4155 else
4156 {
4157 /* Regular reduction: the same vectype and tree-code as used for
4158 the vector code inside the loop can be used for the epilog code. */
4159 orig_code = code;
4160 }
4161
4162 if (nested_cycle)
4163 {
4164 def_bb = gimple_bb (reduc_def_stmt);
4165 def_stmt_loop = def_bb->loop_father;
4166 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4167 loop_preheader_edge (def_stmt_loop));
4168 if (TREE_CODE (def_arg) == SSA_NAME
4169 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4170 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4171 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4172 && vinfo_for_stmt (def_arg_stmt)
4173 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4174 == vect_double_reduction_def)
4175 double_reduc = true;
4176 }
4177
4178 epilog_reduc_code = ERROR_MARK;
4179 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4180 {
4181 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4182 optab_default);
4183 if (!reduc_optab)
4184 {
4185 if (vect_print_dump_info (REPORT_DETAILS))
4186 fprintf (vect_dump, "no optab for reduction.");
4187
4188 epilog_reduc_code = ERROR_MARK;
4189 }
4190
4191 if (reduc_optab
4192 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4193 {
4194 if (vect_print_dump_info (REPORT_DETAILS))
4195 fprintf (vect_dump, "reduc op not supported by target.");
4196
4197 epilog_reduc_code = ERROR_MARK;
4198 }
4199 }
4200 else
4201 {
4202 if (!nested_cycle || double_reduc)
4203 {
4204 if (vect_print_dump_info (REPORT_DETAILS))
4205 fprintf (vect_dump, "no reduc code for scalar code.");
4206
4207 return false;
4208 }
4209 }
4210
4211 if (double_reduc && ncopies > 1)
4212 {
4213 if (vect_print_dump_info (REPORT_DETAILS))
4214 fprintf (vect_dump, "multiple types in double reduction");
4215
4216 return false;
4217 }
4218
4219 if (!vec_stmt) /* transformation not required. */
4220 {
4221 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4222 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4223 return false;
4224 return true;
4225 }
4226
4227 /** Transform. **/
4228
4229 if (vect_print_dump_info (REPORT_DETAILS))
4230 fprintf (vect_dump, "transform reduction.");
4231
4232 /* FORNOW: Multiple types are not supported for condition. */
4233 if (code == COND_EXPR)
4234 gcc_assert (ncopies == 1);
4235
4236 /* Create the destination vector */
4237 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4238
4239 /* In case the vectorization factor (VF) is bigger than the number
4240 of elements that we can fit in a vectype (nunits), we have to generate
4241 more than one vector stmt, i.e., we need to "unroll" the
4242 vector stmt by a factor of VF/nunits. For more details see the
4243 documentation in vectorizable_operation. */
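/* For example (a sketch): with a vectorization factor of 8 and V4SI operand
   vectors (nunits == 4), ncopies == 2, so two vector stmts are generated for
   each scalar stmt.  */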
4244
4245 /* If the reduction is used in an outer loop we need to generate
4246 VF intermediate results, like so (e.g. for ncopies=2):
4247 r0 = phi (init, r0)
4248 r1 = phi (init, r1)
4249 r0 = x0 + r0;
4250 r1 = x1 + r1;
4251 (i.e. we generate VF results in 2 registers).
4252 In this case we have a separate def-use cycle for each copy, and therefore
4253 for each copy we get the vector def for the reduction variable from the
4254 respective phi node created for this copy.
4255
4256 Otherwise (the reduction is unused in the loop nest), we can combine
4257 together intermediate results, like so (e.g. for ncopies=2):
4258 r = phi (init, r)
4259 r = x0 + r;
4260 r = x1 + r;
4261 (i.e. we generate VF/2 results in a single register).
4262 In this case for each copy we get the vector def for the reduction variable
4263 from the vectorized reduction operation generated in the previous iteration.
4264 */
4265
4266 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4267 {
4268 single_defuse_cycle = true;
4269 epilog_copies = 1;
4270 }
4271 else
4272 epilog_copies = ncopies;
4273
4274 prev_stmt_info = NULL;
4275 prev_phi_info = NULL;
4276 if (slp_node)
4277 {
4278 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4279 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4280 == TYPE_VECTOR_SUBPARTS (vectype_in));
4281 }
4282 else
4283 {
4284 vec_num = 1;
4285 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4286 if (op_type == ternary_op)
4287 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4288 }
4289
4290 phis = VEC_alloc (gimple, heap, vec_num);
4291 vect_defs = VEC_alloc (tree, heap, vec_num);
4292 if (!slp_node)
4293 VEC_quick_push (tree, vect_defs, NULL_TREE);
4294
4295 for (j = 0; j < ncopies; j++)
4296 {
4297 if (j == 0 || !single_defuse_cycle)
4298 {
4299 for (i = 0; i < vec_num; i++)
4300 {
4301 /* Create the reduction-phi that defines the reduction
4302 operand. */
4303 new_phi = create_phi_node (vec_dest, loop->header);
4304 set_vinfo_for_stmt (new_phi,
4305 new_stmt_vec_info (new_phi, loop_vinfo,
4306 NULL));
4307 if (j == 0 || slp_node)
4308 VEC_quick_push (gimple, phis, new_phi);
4309 }
4310 }
4311
4312 if (code == COND_EXPR)
4313 {
4314 gcc_assert (!slp_node);
4315 vectorizable_condition (stmt, gsi, vec_stmt,
4316 PHI_RESULT (VEC_index (gimple, phis, 0)),
4317 reduc_index);
4318 /* Multiple types are not supported for condition. */
4319 break;
4320 }
4321
4322 /* Handle uses. */
4323 if (j == 0)
4324 {
4325 tree op0, op1 = NULL_TREE;
4326
4327 op0 = ops[!reduc_index];
4328 if (op_type == ternary_op)
4329 {
4330 if (reduc_index == 0)
4331 op1 = ops[2];
4332 else
4333 op1 = ops[1];
4334 }
4335
4336 if (slp_node)
4337 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4338 -1);
4339 else
4340 {
4341 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4342 stmt, NULL);
4343 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4344 if (op_type == ternary_op)
4345 {
4346 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4347 NULL);
4348 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4349 }
4350 }
4351 }
4352 else
4353 {
4354 if (!slp_node)
4355 {
4356 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4357 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4358 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4359 if (op_type == ternary_op)
4360 {
4361 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4362 loop_vec_def1);
4363 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4364 }
4365 }
4366
4367 if (single_defuse_cycle)
4368 reduc_def = gimple_assign_lhs (new_stmt);
4369
4370 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4371 }
4372
4373 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4374 {
4375 if (slp_node)
4376 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4377 else
4378 {
4379 if (!single_defuse_cycle || j == 0)
4380 reduc_def = PHI_RESULT (new_phi);
4381 }
4382
4383 def1 = ((op_type == ternary_op)
4384 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4385 if (op_type == binary_op)
4386 {
4387 if (reduc_index == 0)
4388 expr = build2 (code, vectype_out, reduc_def, def0);
4389 else
4390 expr = build2 (code, vectype_out, def0, reduc_def);
4391 }
4392 else
4393 {
4394 if (reduc_index == 0)
4395 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4396 else
4397 {
4398 if (reduc_index == 1)
4399 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4400 else
4401 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4402 }
4403 }
4404
4405 new_stmt = gimple_build_assign (vec_dest, expr);
4406 new_temp = make_ssa_name (vec_dest, new_stmt);
4407 gimple_assign_set_lhs (new_stmt, new_temp);
4408 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4409 if (slp_node)
4410 {
4411 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4412 VEC_quick_push (tree, vect_defs, new_temp);
4413 }
4414 else
4415 VEC_replace (tree, vect_defs, 0, new_temp);
4416 }
4417
4418 if (slp_node)
4419 continue;
4420
4421 if (j == 0)
4422 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4423 else
4424 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4425
4426 prev_stmt_info = vinfo_for_stmt (new_stmt);
4427 prev_phi_info = vinfo_for_stmt (new_phi);
4428 }
4429
4430 /* Finalize the reduction-phi (set its arguments) and create the
4431 epilog reduction code. */
4432 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4433 {
4434 new_temp = gimple_assign_lhs (*vec_stmt);
4435 VEC_replace (tree, vect_defs, 0, new_temp);
4436 }
4437
4438 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4439 epilog_reduc_code, phis, reduc_index,
4440 double_reduc, slp_node);
4441
4442 VEC_free (gimple, heap, phis);
4443 VEC_free (tree, heap, vec_oprnds0);
4444 if (vec_oprnds1)
4445 VEC_free (tree, heap, vec_oprnds1);
4446
4447 return true;
4448 }
4449
4450 /* Function vect_min_worthwhile_factor.
4451
4452 For a loop where we could vectorize the operation indicated by CODE,
4453 return the minimum vectorization factor that makes it worthwhile
4454 to use generic vectors. */
4455 int
4456 vect_min_worthwhile_factor (enum tree_code code)
4457 {
4458 switch (code)
4459 {
4460 case PLUS_EXPR:
4461 case MINUS_EXPR:
4462 case NEGATE_EXPR:
4463 return 4;
4464
4465 case BIT_AND_EXPR:
4466 case BIT_IOR_EXPR:
4467 case BIT_XOR_EXPR:
4468 case BIT_NOT_EXPR:
4469 return 2;
4470
4471 default:
4472 return INT_MAX;
4473 }
4474 }
4475
4476
4477 /* Function vectorizable_induction
4478
4479 Check if PHI performs an induction computation that can be vectorized.
4480 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4481 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
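
   For example (an illustrative sketch, for VF == 4): a simple induction
   i, i+1, i+2, ... is vectorized by creating an initial vector
   {i, i+1, i+2, i+3} in the preheader and adding a step vector {4, 4, 4, 4}
   to it in every iteration of the vectorized loop.
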
4482 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4483
4484 bool
4485 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4486 gimple *vec_stmt)
4487 {
4488 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4489 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4490 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4491 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4492 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4493 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4494 tree vec_def;
4495
4496 gcc_assert (ncopies >= 1);
4497 /* FORNOW. This restriction should be relaxed. */
4498 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4499 {
4500 if (vect_print_dump_info (REPORT_DETAILS))
4501 fprintf (vect_dump, "multiple types in nested loop.");
4502 return false;
4503 }
4504
4505 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4506 return false;
4507
4508 /* FORNOW: SLP not supported. */
4509 if (STMT_SLP_TYPE (stmt_info))
4510 return false;
4511
4512 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4513
4514 if (gimple_code (phi) != GIMPLE_PHI)
4515 return false;
4516
4517 if (!vec_stmt) /* transformation not required. */
4518 {
4519 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4520 if (vect_print_dump_info (REPORT_DETAILS))
4521 fprintf (vect_dump, "=== vectorizable_induction ===");
4522 vect_model_induction_cost (stmt_info, ncopies);
4523 return true;
4524 }
4525
4526 /** Transform. **/
4527
4528 if (vect_print_dump_info (REPORT_DETAILS))
4529 fprintf (vect_dump, "transform induction phi.");
4530
4531 vec_def = get_initial_def_for_induction (phi);
4532 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4533 return true;
4534 }
4535
4536 /* Function vectorizable_live_operation.
4537
4538 STMT computes a value that is used outside the loop. Check if
4539 it can be supported. */
4540
4541 bool
4542 vectorizable_live_operation (gimple stmt,
4543 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4544 gimple *vec_stmt ATTRIBUTE_UNUSED)
4545 {
4546 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4547 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4548 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4549 int i;
4550 int op_type;
4551 tree op;
4552 tree def;
4553 gimple def_stmt;
4554 enum vect_def_type dt;
4555 enum tree_code code;
4556 enum gimple_rhs_class rhs_class;
4557
4558 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4559
4560 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4561 return false;
4562
4563 if (!is_gimple_assign (stmt))
4564 return false;
4565
4566 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4567 return false;
4568
4569 /* FORNOW. CHECKME. */
4570 if (nested_in_vect_loop_p (loop, stmt))
4571 return false;
4572
4573 code = gimple_assign_rhs_code (stmt);
4574 op_type = TREE_CODE_LENGTH (code);
4575 rhs_class = get_gimple_rhs_class (code);
4576 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4577 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4578
4579 /* FORNOW: support only if all uses are invariant. This means
4580 that the scalar operations can remain in place, unvectorized.
4581 The original last scalar value that they compute will be used. */
4582
4583 for (i = 0; i < op_type; i++)
4584 {
4585 if (rhs_class == GIMPLE_SINGLE_RHS)
4586 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4587 else
4588 op = gimple_op (stmt, i + 1);
4589 if (op
4590 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4591 {
4592 if (vect_print_dump_info (REPORT_DETAILS))
4593 fprintf (vect_dump, "use not simple.");
4594 return false;
4595 }
4596
4597 if (dt != vect_external_def && dt != vect_constant_def)
4598 return false;
4599 }
4600
4601 /* No transformation is required for the cases we currently support. */
4602 return true;
4603 }
4604
4605 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4606
4607 static void
4608 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4609 {
4610 ssa_op_iter op_iter;
4611 imm_use_iterator imm_iter;
4612 def_operand_p def_p;
4613 gimple ustmt;
4614
4615 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4616 {
4617 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4618 {
4619 basic_block bb;
4620
4621 if (!is_gimple_debug (ustmt))
4622 continue;
4623
4624 bb = gimple_bb (ustmt);
4625
4626 if (!flow_bb_inside_loop_p (loop, bb))
4627 {
4628 if (gimple_debug_bind_p (ustmt))
4629 {
4630 if (vect_print_dump_info (REPORT_DETAILS))
4631 fprintf (vect_dump, "killing debug use");
4632
4633 gimple_debug_bind_reset_value (ustmt);
4634 update_stmt (ustmt);
4635 }
4636 else
4637 gcc_unreachable ();
4638 }
4639 }
4640 }
4641 }
4642
4643 /* Function vect_transform_loop.
4644
4645 The analysis phase has determined that the loop is vectorizable.
4646 Vectorize the loop - created vectorized stmts to replace the scalar
4647 stmts in the loop, and update the loop exit condition. */
4648
4649 void
4650 vect_transform_loop (loop_vec_info loop_vinfo)
4651 {
4652 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4653 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4654 int nbbs = loop->num_nodes;
4655 gimple_stmt_iterator si;
4656 int i;
4657 tree ratio = NULL;
4658 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4659 bool strided_store;
4660 bool slp_scheduled = false;
4661 unsigned int nunits;
4662 tree cond_expr = NULL_TREE;
4663 gimple_seq cond_expr_stmt_list = NULL;
4664 bool do_peeling_for_loop_bound;
4665
4666 if (vect_print_dump_info (REPORT_DETAILS))
4667 fprintf (vect_dump, "=== vec_transform_loop ===");
4668
4669 /* Peel the loop if there are data refs with unknown alignment.
4670 Only one data ref with unknown alignment that is a store is allowed. */
4671
4672 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4673 vect_do_peeling_for_alignment (loop_vinfo);
4674
4675 do_peeling_for_loop_bound
4676 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4677 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4678 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4679
4680 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4681 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4682 vect_loop_versioning (loop_vinfo,
4683 !do_peeling_for_loop_bound,
4684 &cond_expr, &cond_expr_stmt_list);
4685
4686 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4687 compile-time constant), or it is a constant that is not divisible by the
4688 vectorization factor, then an epilog loop needs to be created.
4689 We therefore duplicate the loop: the original loop will be vectorized,
4690 and will compute the first (n/VF) iterations. The second copy of the loop
4691 will remain scalar and will compute the remaining (n%VF) iterations.
4692 (VF is the vectorization factor). */
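/* An illustrative sketch of the resulting structure for VF == 4 (not the
   actual generated code):

     for (i = 0; i + 4 <= n; i += 4)
       ... vectorized loop body ...        /+ first n/VF iterations +/
     for ( ; i < n; i++)
       ... original scalar loop body ...   /+ remaining n%VF iterations +/
 */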
4693
4694 if (do_peeling_for_loop_bound)
4695 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4696 cond_expr, cond_expr_stmt_list);
4697 else
4698 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4699 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4700
4701 /* 1) Make sure the loop header has exactly two entries
4702 2) Make sure we have a preheader basic block. */
4703
4704 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4705
4706 split_edge (loop_preheader_edge (loop));
4707
4708 /* FORNOW: the vectorizer supports only loops whose body consists
4709 of one basic block (header + empty latch). When the vectorizer
4710 supports more involved loop forms, the order in which the BBs are
4711 traversed will need to be reconsidered. */
4712
4713 for (i = 0; i < nbbs; i++)
4714 {
4715 basic_block bb = bbs[i];
4716 stmt_vec_info stmt_info;
4717 gimple phi;
4718
4719 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4720 {
4721 phi = gsi_stmt (si);
4722 if (vect_print_dump_info (REPORT_DETAILS))
4723 {
4724 fprintf (vect_dump, "------>vectorizing phi: ");
4725 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4726 }
4727 stmt_info = vinfo_for_stmt (phi);
4728 if (!stmt_info)
4729 continue;
4730
4731 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4732 vect_loop_kill_debug_uses (loop, phi);
4733
4734 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4735 && !STMT_VINFO_LIVE_P (stmt_info))
4736 continue;
4737
4738 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4739 != (unsigned HOST_WIDE_INT) vectorization_factor)
4740 && vect_print_dump_info (REPORT_DETAILS))
4741 fprintf (vect_dump, "multiple-types.");
4742
4743 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4744 {
4745 if (vect_print_dump_info (REPORT_DETAILS))
4746 fprintf (vect_dump, "transform phi.");
4747 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4748 }
4749 }
4750
4751 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4752 {
4753 gimple stmt = gsi_stmt (si);
4754 bool is_store;
4755
4756 if (vect_print_dump_info (REPORT_DETAILS))
4757 {
4758 fprintf (vect_dump, "------>vectorizing statement: ");
4759 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4760 }
4761
4762 stmt_info = vinfo_for_stmt (stmt);
4763
4764 /* vector stmts created in the outer-loop during vectorization of
4765 stmts in an inner-loop may not have a stmt_info, and do not
4766 need to be vectorized. */
4767 if (!stmt_info)
4768 {
4769 gsi_next (&si);
4770 continue;
4771 }
4772
4773 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4774 vect_loop_kill_debug_uses (loop, stmt);
4775
4776 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4777 && !STMT_VINFO_LIVE_P (stmt_info))
4778 {
4779 gsi_next (&si);
4780 continue;
4781 }
4782
4783 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4784 nunits =
4785 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4786 if (!STMT_SLP_TYPE (stmt_info)
4787 && nunits != (unsigned int) vectorization_factor
4788 && vect_print_dump_info (REPORT_DETAILS))
4789 /* For SLP, VF is set according to the unrolling factor, and not
4790 to the vector size, hence this print is not valid for SLP. */
4791 fprintf (vect_dump, "multiple-types.");
4792
4793 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4794 reached. */
4795 if (STMT_SLP_TYPE (stmt_info))
4796 {
4797 if (!slp_scheduled)
4798 {
4799 slp_scheduled = true;
4800
4801 if (vect_print_dump_info (REPORT_DETAILS))
4802 fprintf (vect_dump, "=== scheduling SLP instances ===");
4803
4804 vect_schedule_slp (loop_vinfo, NULL);
4805 }
4806
4807 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4808 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4809 {
4810 gsi_next (&si);
4811 continue;
4812 }
4813 }
4814
4815 /* -------- vectorize statement ------------ */
4816 if (vect_print_dump_info (REPORT_DETAILS))
4817 fprintf (vect_dump, "transform statement.");
4818
4819 strided_store = false;
4820 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4821 if (is_store)
4822 {
4823 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4824 {
4825 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4826 interleaving chain was completed - free all the stores in
4827 the chain. */
4828 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4829 gsi_remove (&si, true);
4830 continue;
4831 }
4832 else
4833 {
4834 /* Free the attached stmt_vec_info and remove the stmt. */
4835 free_stmt_vec_info (stmt);
4836 gsi_remove (&si, true);
4837 continue;
4838 }
4839 }
4840 gsi_next (&si);
4841 } /* stmts in BB */
4842 } /* BBs in loop */
4843
4844 slpeel_make_loop_iterate_ntimes (loop, ratio);
4845
4846 /* The memory tags and pointers in vectorized statements need to
4847 have their SSA forms updated. FIXME, why can't this be delayed
4848 until all the loops have been transformed? */
4849 update_ssa (TODO_update_ssa);
4850
4851 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4852 fprintf (vect_dump, "LOOP VECTORIZED.");
4853 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4854 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4855 }